In [72]:
! pip install tensorflow keras --upgrade --no-index --find-links /kaggle/input/package

Looking in links: /kaggle/input/package


In [73]:
import tensorflow as tf
import tensorflow.keras as tf_keras

# Show the TensorFlow version that is actually loaded in this kernel.
tf.__version__

'2.15.0'

In [74]:
from typing import Dict, Optional

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.python.framework.dtypes import DType

class DeepCrossNetwork(tf_keras.Model):
    """Keras model that preprocesses named features, crosses them, and scores them.

    The constructor receives a mapping ``feature name -> transformation spec``
    where each spec is a dict with a ``'type'`` key and a ``'properties'`` key.
    For every feature an input placeholder and a preprocessing layer are
    created; the transformed features are concatenated, passed through two
    cross steps and a small dense stack, and the resulting functional model is
    stored in ``self.model``. ``call`` simply delegates to that inner model.
    """

    def __init__(self,
                 transformations_by_feature: Dict[str, object],
                 **kwargs):
        """Store the transformation specs and build the inner functional model.

        Args:
            transformations_by_feature: Mapping of feature name to a spec dict
                with ``'type'`` and ``'properties'`` entries.
            **kwargs: Forwarded unchanged to ``tf_keras.Model.__init__``.
        """
        super().__init__(**kwargs)
        self.transformations_by_feature = transformations_by_feature
        self._build_layers()

    def _parse_into_layer(self, transformation: Dict[str, object]):
        """Translate one transformation spec into a Keras preprocessing layer.

        Args:
            transformation: Dict with ``'type'`` (one of ``'numerical_embedding'``,
                ``'onehot'``, ``'binning'``, ``'standardization'``) and
                ``'properties'`` (the parameters that type needs).

        Returns:
            The configured layer; any unrecognised type maps to ``Identity``.
        """
        kind = transformation['type']
        feature_props = transformation['properties']

        if kind == 'numerical_embedding':
            return tf_keras.layers.Embedding(feature_props['vocab_size'], feature_props['embedding_size'])
        if kind == 'onehot':
            # 'NA' is appended to the supplied vocabulary as an extra category.
            return tf_keras.layers.StringLookup(vocabulary=feature_props['vocab'] + ['NA'], output_mode='one_hot')
        if kind == 'binning':
            return tf_keras.layers.Discretization(bin_boundaries=feature_props['boundaries'])
        if kind == 'standardization':
            # The layer must be returned: without the return this branch
            # produced None and _build_layers failed when calling it.
            # NOTE(review): the 'stddev' property is passed as `variance`.
            # If it really holds a standard deviation it should probably be
            # squared; left as-is so inference stays consistent with however
            # the saved model was trained -- confirm against the training code.
            return tf_keras.layers.Normalization(mean=feature_props['mean'], variance=feature_props['stddev'])
        return tf_keras.layers.Identity()

    def _build_layers(self):
        """Create inputs, preprocessing, cross and dense layers; set ``self.model``."""
        input_by_feature_name, transformed_by_feature_name = {}, {}
        for feature, spec in self.transformations_by_feature.items():
            layer = self._parse_into_layer(spec)
            dtype = self._get_dtype_by_transformation(layer)
            inputs_placeholder = tf_keras.Input((1, ),
                                                dtype=dtype,
                                                name=feature)
            # Every transformed feature is cast to float32 before concatenation.
            transformed = tf.cast(layer(inputs_placeholder), tf.float32)

            input_by_feature_name[feature] = inputs_placeholder
            transformed_by_feature_name[feature] = transformed

        concatenated_input = tf_keras.layers.Concatenate(axis=-1)(list(transformed_by_feature_name.values()))
        cross_layer_output = self._build_cross_layers(concatenated_input)
        logits = self._build_dense_layers(inputs=cross_layer_output)

        # The inner model takes a dict keyed by feature name.
        self.model = Model(input_by_feature_name, logits)

    def _get_dtype_by_transformation(self, transformation: tf_keras.layers.Layer) -> DType:
        """Return ``tf.string`` for ``StringLookup`` layers and ``tf.float32`` otherwise."""
        if isinstance(transformation, tf_keras.layers.StringLookup):
            return tf.string
        return tf.float32

    def _build_cross_layers(self, x0):
        """Apply two successive cross steps, both anchored on the original input ``x0``."""
        # TODO: Please add parameters
        x1 = self._cross(x0, x0)
        x2 = self._cross(x0, x1)

        return x2

    def _cross(self,
               x0,
               x1,
               use_bias: bool = True,
               activation: tf_keras.layers.Activation = None,
               kernel_initializer: tf_keras.initializers.Initializer = tf_keras.initializers.truncated_normal,
               bias_initializer: tf_keras.initializers.Initializer = tf_keras.initializers.zeros,
               kernel_regularizer: Optional[tf_keras.regularizers.Regularizer] = None,
               bias_regularizer: Optional[tf_keras.regularizers.Regularizer] = None) -> tf.Tensor:
        """Run one cross step: project ``x1`` to the width of ``x0`` and combine.

        Args:
            x0: Base tensor; its last dimension sets the projection width.
            x1: Tensor fed to the dense projection.
            use_bias, activation, kernel_initializer, bias_initializer,
            kernel_regularizer, bias_regularizer: Forwarded to the ``Dense``
                projection layer.

        Returns:
            ``x0 * result + result`` where ``result`` is the projection of ``x1``.
        """
        layer = tf_keras.layers.Dense(
            x0.shape[-1],
            kernel_initializer=kernel_initializer,
            bias_initializer=bias_initializer,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer,
            use_bias=use_bias,
            dtype=x0.dtype,
            activation=activation,
        )

        result = layer(x1)
        result = tf.cast(result, x0.dtype)

        # NOTE(review): the residual term here is `result`, not `x1`; a
        # conventional cross layer adds the layer input instead. Left unchanged
        # because altering it would change the numerics of trained weights.
        return x0 * result + result

    def _build_dense_layers(self, inputs):
        """Stack Dense(50, relu) -> Dense(30, relu) -> Dense(1, sigmoid) on ``inputs``.

        The returned tensor is named ``logits`` but has already been passed
        through a sigmoid activation.
        """
        # TODO: Please add parameters
        layer1 = Dense(50, activation=tf_keras.activations.relu)
        layer2 = Dense(30, activation=tf_keras.activations.relu)

        output = layer2(layer1(inputs))
        logits = Dense(units=1, activation=tf_keras.activations.sigmoid)(output)

        return logits

    def call(self, inputs):
        """Forward ``inputs`` to the inner functional model."""
        return self.model(inputs)

    def get_config(self):
        """Return only the constructor argument needed by ``from_config``."""
        return {"transformations_by_feature": self.transformations_by_feature}

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing ``config`` entries as constructor kwargs."""
        return cls(**config)


# Report the class as living in 'model.dcn' rather than this notebook's module.
# NOTE(review): presumably this matches the module path recorded in the saved
# model so that it can be deserialized here -- confirm against the training code.
DeepCrossNetwork.__module__ = 'model.dcn'

In [75]:
import numpy as np

# Root directory of the competition dataset; default data location for
# RawFile.get_path and RawInfo, and the base for ColInfo.describe.
DATA_PATH = "/kaggle/input/home-credit-credit-risk-model-stability/"
# Scratch parquet file: written after the devval split, read back for inference.
TEST_DATA_PATH = "/tmp/test.parquet"
# NOTE(review): SEED is not referenced anywhere in the visible cells.
SEED = 617

import os
# NOTE(review): tensorflow was already imported in an earlier cell, so this
# flag may be set too late to influence which Keras implementation is used --
# confirm, and move it before the first tensorflow import if it is required.
os.environ['TF_USE_LEGACY_KERAS'] = '1'
import pandas as pd
from pathlib import Path
from argparse import Namespace
from dataclasses import dataclass
from pyarrow.parquet import ParquetFile

# Maps the last character of a column name to a human-readable label.
# ColInfo.describe appends the label when a column name ends in one of these keys.
POSTFIXES = {
    "P": "Transform DPD (Days Past Due)",
    "M": "Masking Categories",
    "A": "Transform Amount",
    "D": "Transform Date",
    "T": "Unspecified Transform",
    "L": "Unspecified Transform",
}


class RawFile:
    """Parsed view of a raw data file name such as ``train_static_0_1.parquet``.

    The part before the last dot is split on underscores: the first token is
    the ``type``, the middle tokens form the ``name``, and up to two trailing
    numeric tokens are the ``depth`` and ``index``. The part after the last dot
    is the ``format``. Missing components are empty strings.
    """

    def __init__(self, file_name: str = "") -> None:
        """Parse ``file_name``.

        Raises:
            ValueError: If ``file_name`` converts to an empty string.
        """
        self.file_name = str(file_name)

        if not self.file_name:
            raise ValueError(f"file_name should be a non-empty string. Not {file_name}.")

        (
            self._type,
            self._name,
            self._depth,
            self._index,
            self._file_format
        ) = self._parse_file_name()

    def __repr__(self) -> str:
        return f"{self.file_name}"

    def __str__(self) -> str:
        return self.file_name

    def __lt__(self, other) -> bool:
        # Ordering by raw file name is what makes sorted() work on RawFile lists.
        return self.file_name < other.file_name

    @property
    def type(self) -> str:
        """First underscore-separated token of the file name."""
        return self._type

    @property
    def depth(self) -> str:
        """Numeric depth token as a string, or ``""`` when absent."""
        return self._depth

    @property
    def index(self) -> str:
        """Numeric index token as a string, or ``""`` when absent."""
        return self._index

    @property
    def format(self) -> str:
        """Text after the last dot, or ``""`` when the name has no dot."""
        return self._file_format

    @property
    def name(self) -> str:
        """Tokens between the type and the numeric suffixes, joined by ``_``."""
        return self._name

    @property
    def fullname(self) -> str:
        """File name with the text after the last dot removed."""
        return self.file_name.rsplit(".", 1)[0]

    def _parse_file_name(self) -> tuple[str, str, str, str, str]:
        """Split the file name into ``(type, name, depth, index, format)``.

        Names without an underscore or without an extension are handled
        without raising: the missing components come back as empty strings.
        """
        pieces = self.file_name.rsplit(".", 1)
        file_format = pieces[1] if len(pieces) == 2 else ""

        names = self.fullname.split("_")
        has_suffix_room = len(names) >= 2

        if has_suffix_room and names[-2].isdigit():
            # "<type>_<name...>_<depth>_<index>"
            return names[0], "_".join(names[1:-2]), names[-2], names[-1], file_format
        if has_suffix_room and names[-1].isdigit():
            # "<type>_<name...>_<depth>"
            return names[0], "_".join(names[1:-1]), names[-1], "", file_format
        # "<type>_<name...>" or a single token with no underscore at all.
        return names[0], "_".join(names[1:]), "", "", file_format

    def get_path(self, data_dir: Path = None) -> Path:
        """Return ``<data_dir>/<format>_files/<type>/<file_name>``.

        ``data_dir`` falls back to the module-level ``DATA_PATH`` when omitted.
        """
        if data_dir is None:
            data_dir = DATA_PATH
        return Path(data_dir) / f"{self.format}_files" / self.type / self.file_name

    def startswith(self, keyword: str) -> bool:
        """Return whether the raw file name starts with ``keyword``."""
        return self.file_name.startswith(keyword)


@dataclass
class ColInfo:
    """A column name that can look up its own description."""

    name: str

    def __repr__(self) -> str:
        return self.name

    def __str__(self) -> str:
        return self.name

    @property
    def desc(self) -> str:
        """Shortcut for ``describe()`` with the default description file."""
        return self.describe()

    def describe(
            self,
            description_file: str = "feature_definitions.csv"
    ) -> str:
        """Return ``"<name>: <description>"`` for this column.

        The description is read from the ``Variable`` / ``Description`` columns
        of ``description_file`` under ``DATA_PATH``; when the column is not
        listed, the name itself is used as the description. If the last
        character of the name is a key of ``POSTFIXES``, the matching label is
        appended in parentheses.

        Args:
            description_file: CSV file name, relative to ``DATA_PATH``.
        """
        # DATA_PATH is a plain string, so it must be wrapped in Path before
        # using the "/" operator (str / str raises TypeError).
        description_path = Path(DATA_PATH) / description_file
        description_df = pd.read_csv(description_path, usecols=["Variable", "Description"])

        matches = description_df.loc[description_df["Variable"] == self.name, "Description"]
        result_description = matches.iloc[0] if len(matches) > 0 else self.name

        # Slicing keeps an empty name from raising IndexError.
        postfix = self.name[-1:]
        if postfix in POSTFIXES:
            return f"{self.name}: {result_description} ({POSTFIXES[postfix]})"
        return f"{self.name}: {result_description}"


class RawReader:
    """Reads raw files, or just their column names, for one storage format."""

    def __init__(self, format_: str = "parquet") -> None:
        """Select the pandas reader and column getter for ``format_``.

        Raises:
            ValueError: If ``format_`` is neither ``'parquet'`` nor ``'csv'``.
        """
        self.format = format_

        if format_ == "parquet":
            self.reader = pd.read_parquet
            self.column_getter = self._get_parquet_columns
        elif format_ == "csv":
            self.reader = pd.read_csv
            self.column_getter = self._get_csv_columns
        else:
            raise ValueError(f"format_ should be either 'parquet' or 'csv'. Not {format_}.")

    def read(self, file_path: Path) -> pd.DataFrame:
        """Load the whole file at ``file_path`` into a DataFrame."""
        return self.reader(file_path)

    def columns(self, file_path: Path) -> "list[ColInfo]":
        """Return the file's column names wrapped in ``ColInfo`` objects."""
        return [ColInfo(c) for c in self.column_getter(file_path)]

    def _get_csv_columns(self, file_path: Path) -> list[str]:
        """Return CSV column names by reading only the header (``nrows=0``)."""
        return list(self.reader(file_path, nrows=0).columns)

    def _get_parquet_columns(self, file_path: Path) -> list[str]:
        """Return parquet column names from the file schema without loading data."""
        # pyarrow's ParquetFile has no `.columns` attribute; the field names
        # are exposed through the Arrow schema instead.
        return list(ParquetFile(file_path).schema_arrow.names)

    def __call__(self, file_path) -> pd.DataFrame:
        """Alias for ``read``."""
        return self.read(file_path)


class RawInfo:
    """Locates and loads raw files under ``<data_path>/<raw_format>_files/<type>/``."""

    VALID_TYPES = ["", "train", "test"]
    VALID_DEPTHS = ["", "0", "1", "2"]

    def __init__(self, config: Namespace = None) -> None:
        """Resolve the data directories and create the matching reader.

        Args:
            config: Object exposing ``data_path`` and ``raw_format`` attributes.
                When omitted, ``DATA_PATH`` and ``"parquet"`` are used.

        Raises:
            FileNotFoundError: If ``<data_path>/<raw_format>_files`` does not exist.
        """
        self.config = config
        if self.config is None:
            self.config = Namespace(**{
                "data_path": DATA_PATH,
                "raw_format": "parquet",
            })

        self.format = self.config.raw_format
        self.data_dir_path = Path(self.config.data_path)
        self.file_dir_path = self.data_dir_path / f"{self.format}_files"

        if not self.file_dir_path.exists():
            raise FileNotFoundError(f"{self.file_dir_path} does not exist.")

        self.reader = RawReader(self.format)

    def show_files(self, type_: str = "train") -> "list[RawFile]":
        """Return every entry of the ``type_`` directory as a sorted list of ``RawFile``."""
        return sorted([RawFile(f) for f in os.listdir(self.file_dir_path / type_)])

    def get_files(self, filename: str, *, depth: int = None, type_: str = "train") -> "list[RawFile]":
        """Return the sorted files whose parsed name equals ``filename``.

        When ``depth`` is given, only files with that depth are kept.
        """
        wanted_depth = None if depth is None else str(depth)
        return sorted([
            f for f in self.show_files(type_)
            if f.name == filename and (wanted_depth is None or f.depth == wanted_depth)])

    def get_depths_by_name(self, file_name: str, type_: str = "train") -> list[int]:
        """Return the sorted distinct depths available for ``file_name``.

        Files without a depth suffix (for example ``base``) are skipped;
        converting their empty depth string with ``int`` would raise ValueError.
        """
        depths = {
            int(f.depth)
            for f in self.get_files(file_name, type_=type_)
            if f.depth
        }
        return sorted(depths)

    def get_files_by_depth(self, depth: int, type_: str = "train") -> "list[RawFile]":
        """Return all files of ``type_`` whose depth equals ``depth``."""
        return [f for f in self.show_files(type_) if f.depth == str(depth)]

    def read_raw(
            self,
            file_name: str,
            *,
            depth: int = None,
            type_: str = "train",
    ) -> pd.DataFrame:
        """Read every file matching ``file_name`` (and ``depth``) and concatenate them.

        Raises:
            FileNotFoundError: If no file matches.
        """
        raw_files = self.get_files(file_name, depth=depth, type_=type_)

        if len(raw_files) == 0:
            raise FileNotFoundError(f"{file_name} (depth: {depth}) does not exist in {type_} files.")

        # Row indices of the individual files are kept as-is by pd.concat.
        return pd.concat([self.reader(rf.get_path(self.data_dir_path)) for rf in raw_files])


def prepare_base_data(conf: Namespace = None, type_: str = "train"):
    """Load the base table and left-join both depth-0 static tables onto it.

    Args:
        conf: Passed to ``RawInfo``; ``None`` selects its defaults.
        type_: Which file set to read, e.g. ``"train"`` or ``"test"``.

    Returns:
        DataFrame joined on ``case_id``. Overlapping columns get the suffixes
        ``_base`` / ``_static`` in the first join and ``""`` / ``_static_cb``
        in the second.
    """
    print("prepare_base_data ...")
    raw_info = RawInfo(conf)

    base = raw_info.read_raw("base", type_=type_)
    static = raw_info.read_raw("static", depth=0, type_=type_)
    static_cb = raw_info.read_raw("static_cb", depth=0, type_=type_)

    return (
        base
        .merge(static, on="case_id", how="left", suffixes=("_base", "_static"))
        .merge(static_cb, on="case_id", how="left", suffixes=("", "_static_cb"))
    )


def devval(df):
    """Add a ``devval`` split column to ``df`` in place, based on ``MONTH``.

    Rows with ``MONTH`` in 201909..202008 (inclusive) get 0, rows in
    201901..201908 (inclusive) get 1, and every other row gets 2.
    Nothing is returned.
    """
    month = df["MONTH"]
    in_first_range = month.between(201909, 202008)
    in_second_range = month.between(201901, 201908)
    df['devval'] = np.select([in_first_range, in_second_range], [0, 1], default=2)

# Load and join the base/static tables for both the train and the test file sets.
train_base_static = prepare_base_data()
# NOTE(review): test_base_static is loaded here but not used in the visible cells.
test_base_static = prepare_base_data(type_="test")

# Adds the 'devval' split column to the training frame in place.
devval(train_base_static)

# only use test path
# dev = train_base_static[train_base_static["devval"] == 0].drop("devval", axis=1)
# val = train_base_static[train_base_static["devval"] == 1].drop("devval", axis=1)
# NOTE(review): `test` is the devval == 2 slice of the TRAINING frame, not the
# competition test set loaded above -- confirm this is intended.
test = train_base_static[train_base_static["devval"] == 2].drop("devval", axis=1)

# Persist the slice so the inference cell can re-read it from TEST_DATA_PATH.
test.to_parquet(TEST_DATA_PATH)

prepare_base_data ...
prepare_base_data ...


In [76]:
import os
import kagglehub
import tensorflow.keras as tf_keras
import tensorflow_io as tfio

# NOTE(review): the location returned by model_download is stored in `path` but
# never used; the model is loaded from the hard-coded `model_path` below.
path = kagglehub.model_download("josh9191/homecredit/tensorFlow2/dcn")
model_path = "/kaggle/input/homecredit/tensorflow2/dcn/5"

keras_model = tf_keras.models.load_model(model_path)

# df = pd.read_parquet(TEST_DATA_PATH, , engine='pyarrow')
# Read only the id column and the four feature columns passed to the model.
df = pd.read_parquet(TEST_DATA_PATH, columns=['case_id', 'annuity_780A', 'credamount_770A', 'credtype_322L', 'disbursedcredamount_1113A'], engine='pyarrow')
# Keep the ids aside for the submission, then drop them from the feature frame.
case_id = df['case_id'].to_numpy()
df.drop(columns=['case_id'], inplace=True)

# Downcast float64 feature columns to float32 before prediction.
float64_cols = list(df.select_dtypes(include='float64'))
df[float64_cols] = df[float64_cols].astype('float32')

# The model is fed a dict of column name -> numpy array.
array_dict = {k: v.to_numpy() for k, v in df.to_dict("series").items()}
# Flatten the prediction output to one score per row.
preds = keras_model.predict(array_dict).reshape((-1,))

# Write one score per case_id, with case_id as the index column of the CSV.
submission = pd.DataFrame({
    "case_id": case_id,
    "score": preds
}).set_index('case_id')
submission.to_csv("./submission.csv")

Attaching model 'josh9191/homecredit/tensorFlow2/dcn' to your Kaggle notebook...


