# Import Libraries

In [170]:
import pandas as pd
import matplotlib.pyplot as plt
import keras
import numpy as np
import warnings
warnings.simplefilter('ignore')
from tensorflow.keras import layers
from tensorflow import feature_column
from keras.layers import Dense, Activation, Flatten, Dropout, InputLayer
from keras.models import Sequential
from sklearn.preprocessing import OneHotEncoder
from keras import layers
from keras import Input
import tensorflow as tf
import os
import zipfile
from tqdm import tqdm
import pandas as pd
import scipy as sp
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch

# Loading Data

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (including int8, unsigned, object, datetime)
    is left untouched. Integer columns are cast to the first of
    int8/int16/int32/int64 whose range contains both the column minimum and
    maximum (bounds inclusive). Float columns are cast to float16 or float32
    when the range fits, otherwise to float64.

    Caveat: the float check only looks at the value *range*, not at precision,
    so casting to float16/float32 can round values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type first; inclusive bounds so that a column
            # spanning exactly [-128, 127] still becomes int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons fail).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Directory layout, rooted at the notebook's working directory.
curdir = os.getcwd()
DATA_DIR = f'{curdir}/Data'
TRAIN_DIR = f'{DATA_DIR}/train'
TEST_DIR = f'{DATA_DIR}/test'
OUTPUT_DIR = f'{DATA_DIR}/output'

# exist_ok avoids the separate exists() check (and the race between check and create).
for pth in [TRAIN_DIR, TEST_DIR, OUTPUT_DIR]:
    os.makedirs(pth, exist_ok=True)

# File names of the raw inputs; used below instead of repeating the literals.
train_zip = "train.zip"
test_zip = "test.zip"
sample_sub = "SampleSubmission.csv"

# Unpack each archive next to itself.
with zipfile.ZipFile(f'{TRAIN_DIR}/{train_zip}', 'r') as zip_ref:
    zip_ref.extractall(TRAIN_DIR)
with zipfile.ZipFile(f'{TEST_DIR}/{test_zip}', 'r') as zip_ref:
    zip_ref.extractall(TEST_DIR)

# Read the client / invoice tables and downcast them immediately to save memory.
client_train = reduce_mem_usage(pd.read_csv(f'{TRAIN_DIR}/client_train.csv', low_memory=False))
invoice_train = reduce_mem_usage(pd.read_csv(f'{TRAIN_DIR}/invoice_train.csv', low_memory=False))

client_test = reduce_mem_usage(pd.read_csv(f'{TEST_DIR}/client_test.csv', low_memory=False))
invoice_test = reduce_mem_usage(pd.read_csv(f'{TEST_DIR}/invoice_test.csv', low_memory=False))
sample_submission = pd.read_csv(f'{DATA_DIR}/{sample_sub}', low_memory=False)

Mem. usage decreased to  2.84 Mb (54.2% reduction)
Mem. usage decreased to 315.93 Mb (42.2% reduction)
Mem. usage decreased to  1.11 Mb (50.0% reduction)
Mem. usage decreased to 122.09 Mb (48.4% reduction)


# Data Prep

Data cleaning: remove useless or inconsistent records that would not help during the training phase.
Remove clients with bad counter_statue and remarque values from the training set.

In [4]:
# Discard invoices whose tarif_type compares equal to the string "18".
# NOTE(review): if tarif_type was parsed as a numeric column this comparison
# matches nothing — confirm the column dtype.
tarif_18_rows = invoice_train.loc[invoice_train.tarif_type == "18"]
invoice_train = invoice_train.drop(tarif_18_rows.index)

# Keep only invoices whose counter_statue is one of the six listed string codes.
invoice_train = invoice_train.query("counter_statue in ['0','1','2','3','4','5']")

# Clients excluded from both the invoice and the client training tables.
bad_statue_client = ['train_Client_78338','train_Client_13203','train_Client_53725','train_Client_47780','train_Client_30467']
from_bad_client = invoice_train["client_id"].isin(bad_statue_client)
invoice_train = invoice_train.loc[~from_bad_client]

# Also drop the GAZ invoices of one specific client.
gaz_of_79075 = (invoice_train["client_id"]=='train_Client_79075') & (invoice_train['counter_type']=='GAZ')
invoice_train = invoice_train.loc[~gaz_of_79075].reset_index(drop=True)

listed_as_bad = client_train["client_id"].isin(bad_statue_client)
client_train = client_train.loc[~listed_as_bad].reset_index(drop=True)

In [5]:
def clean_clients(client, Train = False, reference_year = 2022):
    """Cast client columns and derive region / creation-date features.

    The frame is modified in place (new columns are added, existing ones are
    re-typed) and then passed through ``reduce_mem_usage``.

    Parameters
    ----------
    client : pandas.DataFrame
        Must contain ``client_catg``, ``region``, ``disrict`` and
        ``creation_date`` (parsed day-first).
    Train : bool, default False
        Accepted for backward compatibility; the body does not read it.
    reference_year : int, default 2022
        Year used as the anchor of the ``duration`` feature, computed as
        ``(reference_year - creation_year) * 12 - creation_month``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``reduce_mem_usage``.
    """
    client['client_catg'] = client['client_catg'].astype("string")
    # Coarse region bucket: 1 for region <= 100, 3 for region >= 300, otherwise 2.
    client['region_group'] =  client['region'].astype("int64").apply(lambda x: 1 if x<=100 else 3 if x>=300 else 2)
    client['region'] = client['region'].astype("string")
    client['disrict'] = client['disrict'].astype("string")

    client["creation_date"] = pd.to_datetime(client['creation_date'],dayfirst=True)
    created = client['creation_date'].dt
    client['creation_day'] = created.day
    client['creation_month'] = created.month
    client['creation_year'] = created.year
    client['duration']=(reference_year - created.year)*12 - created.month
    # Same year as creation_year, stored as float under a second column name.
    client['CreationYear'] = created.strftime('%Y').astype(float)

    client = reduce_mem_usage(client)
    return client

In [6]:
# Apply the same client clean-up to the training table first, then the test table.
client_train, client_test = (
    clean_clients(frame) for frame in (client_train, client_test)
)

Mem. usage decreased to  6.59 Mb (31.1% reduction)
Mem. usage decreased to  2.71 Mb (31.9% reduction)


In [7]:
# Dense one-hot encoder for the categorical client columns.
# handle_unknown='ignore' makes a category that appears only at transform time
# (e.g. in the test clients) encode as all zeros instead of raising.
ohe = OneHotEncoder(sparse_output=False,categories='auto',handle_unknown='ignore')

ohe_columns = []
ohe_cat_cols = ['disrict','client_catg','region']

# Categories are learned from the training clients only.
ohe.fit(client_train[ohe_cat_cols])

In [8]:
# Column labels "<column>_<category>", in the order the encoder emits them.
# Rebuilt from scratch (not appended) so re-running this cell cannot duplicate labels.
ohe_columns = [f'{c}_{v}' for i,c in enumerate(ohe_cat_cols) for v in ohe.categories_[i]]

# Give each dummy frame the index of its source frame so that the column-wise
# concat aligns row by row whatever index the client frame carries.
dummy_train = pd.DataFrame(ohe.transform(client_train[ohe_cat_cols]),columns=ohe_columns,index=client_train.index)
client_train = reduce_mem_usage(pd.concat([client_train.drop(ohe_cat_cols,axis=1),dummy_train],axis=1))
dummy_test = pd.DataFrame(ohe.transform(client_test[ohe_cat_cols]),columns=ohe_columns,index=client_test.index)
client_test = reduce_mem_usage(pd.concat([client_test.drop(ohe_cat_cols,axis=1),dummy_test],axis=1))

# The dummy frames are no longer needed once merged.
del dummy_train
del dummy_test

Mem. usage decreased to 11.76 Mb (67.8% reduction)
Mem. usage decreased to  4.93 Mb (68.3% reduction)


In [9]:
# Stack train and test invoices so every row-level feature is built identically.
data = pd.concat([invoice_train,invoice_test],axis=0).reset_index(drop=True)
data = reduce_mem_usage(data)
# Adjust wrong column values: on rows with months_number > 100, old_index takes
# the value of new_index and new_index takes the value of months_number; then
# months_number is set to 4. Rows with months_number == 0 are also set to 4.
idx = data['months_number']>100
data.loc[idx,['old_index','new_index']] = data.loc[idx,['new_index','months_number']].values
data.loc[idx,['months_number']] = 4
data.loc[data['months_number']==0,['months_number']] = 4
data = reduce_mem_usage(data)
# Adjust consumption: express each level per month, rounded to a whole number.
for level in range(1, 5):
    con_col = f'consommation_level_{level}'
    data[con_col] = round(data[con_col]/data['months_number'],0)
data = reduce_mem_usage(data)
# Adjust column types; the *_str copies feed the frequency tables built later.
data['invoice_date'] = pd.to_datetime(data['invoice_date'])

data['tarif_type_str'] = data['tarif_type'].astype(str)

data['counter_statue'] = data['counter_statue'].astype(str)
data['counter_statue_str'] = data['counter_statue']

data['counter_code_str'] = data['counter_code'].astype(str)

data['reading_remarque_str'] = data['reading_remarque'].astype(str)
data = reduce_mem_usage(data)
# Helper columns for the counter-type split
data['GAZ']='GAZ'
data['ELEC']='ELEC'

data['invoice_date_day'] = data['invoice_date'].dt.day
data['invoice_date_month'] = data['invoice_date'].dt.month
data['invoice_date_year'] = data['invoice_date'].dt.year

data['index_diff'] = data['new_index']-data['old_index']

# Days between consecutive invoices of the same client and counter type.
data['invoice_diff']=data.sort_values(by=['client_id','counter_type','invoice_date']).groupby(['client_id','counter_type'])['invoice_date'].diff().dt.days

# Row-to-row differences within each (client, counter type, counter number),
# ordered by old_index. The sort + groupby is built once and reused for all
# five columns instead of being recomputed per column; results are assigned
# back by index label.
counter_keys = ['client_id','counter_type','counter_number']
by_counter = data.sort_values(by=counter_keys + ['old_index']).groupby(counter_keys)
for level in range(1, 5):
    data[f'con_{level}_diff'] = by_counter[f'consommation_level_{level}'].diff()

data['index_diff_diff'] = by_counter['index_diff'].diff()
del by_counter  # release the sorted copy of the invoice frame
# Interaction columns
# NOTE(review): number_div_code divides by counter_code; a zero code would
# produce inf/NaN — confirm counter_code is never 0.
data['number_plus_code'] = data['counter_number'] + data['counter_code']
data['number_minus_code'] = data['counter_number'] - data['counter_code']
data['number_multi_code'] = data['counter_number'] * data['counter_code']
data['number_div_code'] = data['counter_number'] / data['counter_code']
data = reduce_mem_usage(data)
# Feature Aggregation
# Base frame: one row per distinct (client_id, 'ELEC', 'GAZ') combination.
base = data[['client_id','ELEC','GAZ']].drop_duplicates()
# Helper columns holding (client_id, counter type) tuples; they are the lookup
# keys into the per-group dictionaries built below.
base['ELEC'] = list(zip(base['client_id'],base['ELEC']))
base['GAZ'] = list(zip(base['client_id'],base['GAZ']))

num_feature = ['consommation_level_1','consommation_level_2','consommation_level_3','consommation_level_4',
               'old_index','new_index','number_plus_code','number_minus_code','number_multi_code','number_div_code']

num_diff_feature = ['invoice_diff','con_1_diff','con_2_diff','con_3_diff','con_4_diff','index_diff_diff']

cate_feature = ['tarif_type', 'counter_number','counter_statue', 'counter_code',
                'reading_remarque','counter_coefficient',
                'invoice_date_day','invoice_date_month','invoice_date_year']

cate_freq_feature = ['tarif_type_str','counter_statue_str', 'counter_code_str','reading_remarque_str']

# Replace each categorical column by integer codes (encoder refitted per column).
le = LabelEncoder()
for feature in cate_feature:
    data[feature] = le.fit_transform(data[feature])

group_keys = ['client_id','counter_type']

# Numerical and numerical-diff feature aggregation. Both lists receive the same
# statistics, so they share one loop; the grouped column is built once per
# feature and reused for every statistic.
for feature in num_feature + num_diff_feature:
    per_counter = data.groupby(group_keys)[feature]
    mean_dict = dict(per_counter.mean())
    min_dict = dict(per_counter.min())
    max_dict = dict(per_counter.max())
    std_dict = dict(per_counter.std())
    sum_dict = dict(per_counter.sum())
    for type_ in ['ELEC','GAZ']:
        base = reduce_mem_usage(base)
        base[f'{feature}_mean_{type_}'] = base[f'{type_}'].map(mean_dict)
        base[f'{feature}_min_{type_}'] = base[f'{type_}'].map(min_dict)
        base[f'{feature}_max_{type_}'] = base[f'{type_}'].map(max_dict)
        base[f'{feature}_std_{type_}'] = base[f'{type_}'].map(std_dict)
        base[f'{feature}_sum_{type_}'] = base[f'{type_}'].map(sum_dict)

        base[f'{feature}_range_{type_}'] = base[f'{feature}_max_{type_}'] - base[f'{feature}_min_{type_}']
        base[f'{feature}_max_mean_diff_{type_}'] = base[f'{feature}_max_{type_}'] - base[f'{feature}_mean_{type_}']

# Categorical feature aggregation: distinct-value count and first mode per group.
for feature in cate_feature:
    per_counter = data.groupby(group_keys)[feature]
    nunique_dict = dict(per_counter.nunique())
    mode_dict = dict(per_counter.agg(lambda x: pd.Series.mode(x)[0]))
    for type_ in ['ELEC','GAZ']:
        base = reduce_mem_usage(base)
        base[f'{feature}_nunique_{type_}'] = base[f'{type_}'].map(nunique_dict)
        base[f'{feature}_mode_{type_}'] = base[f'{type_}'].map(mode_dict)
# Categorical Feature Frequency Aggregation
def _category_share_table(frame, column, prefix):
    """Build one wide table of per-client category shares for ``column``.

    For every (client_id, counter_type) group the rows are counted per value of
    ``column`` and divided by the group's total, then pivoted so each
    (value, counter_type) pair becomes a column named
    ``<prefix><value>_<counter_type>``. Missing combinations are filled with 0.
    """
    counts = frame.groupby(['client_id','counter_type',column]).agg(Percent=(column, 'count'))
    shares = (counts / counts.groupby(level=[0, 1]).transform("sum")).reset_index()

    wide = shares.set_index(['client_id','counter_type',column]).stack().unstack([2,1])
    wide.columns = wide.columns.map('_'.join)
    wide.sort_index(axis=1,inplace=True)
    # 'level_1' is the leftover stacked level (the 'Percent' label) after reset_index.
    return wide.add_prefix(prefix).reset_index().drop(columns=['level_1']).fillna(0)

# 1. Tarif_type
tarif_group = _category_share_table(data, 'tarif_type_str', 'Tarif_Type_')

#  2. Counter_statue
statue_group = _category_share_table(data, 'counter_statue_str', 'Statue_')

#  3.  Counter_code
code_group = _category_share_table(data, 'counter_code_str', 'Code_')

#  4.  Reading_remarque
rem_group = _category_share_table(data, 'reading_remarque_str', 'Rem_')
# Extra Features
#  1. Invoice_Count: number of invoice rows per client and counter type,
#     pivoted to one column per counter type (0 where a client has none).
count_group=data.groupby(['client_id','counter_type']).size().reset_index(name='Invoice_Count')
count_group=count_group.set_index(['client_id','counter_type']).stack().unstack([2,1])
count_group.columns = count_group.columns.map('_'.join)
count_group = count_group.reset_index().fillna(0)

#  2. Invoice Date Range: first / last invoice date and the span in days.
#     String aggregator names are used instead of passing np.min / np.max,
#     which recent pandas versions flag as deprecated in agg().
invoice_range_group=data.groupby(['client_id','counter_type']).agg(first_date=('invoice_date', 'min'),
                                                               last_date=('invoice_date', 'max')).reset_index()

invoice_range_group['date_range'] = (invoice_range_group['last_date']-invoice_range_group['first_date']).dt.days

invoice_range_group=invoice_range_group.set_index(['client_id','counter_type']).stack().unstack([2,1])
invoice_range_group.columns = invoice_range_group.columns.map('_'.join)
invoice_range_group = invoice_range_group.reset_index()
# Combine All Features: left-join every feature table onto the base frame by client_id.
df_list = [base, count_group, invoice_range_group, tarif_group,statue_group,code_group, rem_group]
final, *feature_tables = df_list
for df_ in feature_tables:
    final = final.merge(df_, how='left',on='client_id')

# The tuple helper columns were only lookup keys; drop them before downcasting.
final = reduce_mem_usage(final.drop(columns=['ELEC','GAZ']))

Mem. usage decreased to 446.69 Mb (1.4% reduction)
Mem. usage decreased to 428.34 Mb (4.1% reduction)
Mem. usage decreased to 416.10 Mb (20.9% reduction)
Mem. usage decreased to 611.91 Mb (0.0% reduction)
Mem. usage decreased to 1052.48 Mb (11.3% reduction)
Mem. usage decreased to  5.91 Mb (0.0% reduction)
Mem. usage decreased to  8.86 Mb (25.0% reduction)
Mem. usage decreased to 12.92 Mb (12.5% reduction)
Mem. usage decreased to 16.98 Mb (9.8% reduction)
Mem. usage decreased to 19.94 Mb (12.9% reduction)
Mem. usage decreased to 22.89 Mb (7.5% reduction)
Mem. usage decreased to 25.47 Mb (8.0% reduction)
Mem. usage decreased to 29.90 Mb (4.7% reduction)
Mem. usage decreased to 32.49 Mb (9.3% reduction)
Mem. usage decreased to 37.66 Mb (12.1% reduction)
Mem. usage decreased to 42.82 Mb (10.8% reduction)
Mem. usage decreased to 47.99 Mb (9.7% reduction)
Mem. usage decreased to 53.16 Mb (8.9% reduction)
Mem. usage decreased to 58.33 Mb (8.1% reduction)
Mem. usage decreased to 63.50 Mb (7.5

In [10]:
# Separate the aggregated rows by whether client_id contains 'train' or 'test'.
client_ids = final['client_id']
train_invoice_agg = final.loc[client_ids.str.contains('train')].reset_index(drop=True)
test_invoice_agg = final.loc[client_ids.str.contains('test')].reset_index(drop=True)

In [11]:
def merge_client_invoice(client,invoice_agg):
    """Inner-join clients with their invoice aggregates and add invoice-gap features.

    For each counter type (ELEC, GAZ) the date range is cast to float64, the
    first/last invoice dates are parsed, and the number of days between the
    client's creation_date and each of those dates is stored as
    ``first_invoice_gap_*`` / ``last_invoice_gap_*``. The id and raw date
    columns are dropped, the resulting shape is printed, and the frame returned.
    """
    counter_types = ('ELEC', 'GAZ')
    edges = ('first', 'last')

    df = pd.merge(client, invoice_agg, how='inner', on='client_id')

    for kind in counter_types:
        range_col = f'date_range_{kind}'
        df[range_col] = df[range_col].astype('float64')

    for kind in counter_types:
        for edge in edges:
            date_col = f'{edge}_date_{kind}'
            df[date_col] = pd.to_datetime(df[date_col])

    # Days elapsed from account creation to the first / last invoice.
    for kind in counter_types:
        for edge in edges:
            elapsed = df[f'{edge}_date_{kind}'] - df['creation_date']
            df[f'{edge}_invoice_gap_{kind}'] = elapsed.dt.days

    obsolete = ['client_id']
    obsolete += [f'{edge}_date_{kind}' for kind in counter_types for edge in edges]
    obsolete.append('creation_date')
    df = df.drop(columns=obsolete)

    print(df.shape)
    return df

In [60]:
# Join clients with invoice aggregates (test first, then train), replace
# missing values with 0, and downcast the result.
test_df = merge_client_invoice(client_test,test_invoice_agg)
test_df = reduce_mem_usage(test_df.fillna(0))

train_df = merge_client_invoice(client_train,train_invoice_agg)
train_df = reduce_mem_usage(train_df.fillna(0))

(58069, 383)
Mem. usage decreased to 56.65 Mb (3.8% reduction)
(135488, 384)
Mem. usage decreased to 132.44 Mb (3.8% reduction)


In [85]:
# Hold out 20% of the labelled clients for validation. The split is seeded so
# re-running the notebook reproduces it, and stratified on the target so both
# parts keep the same class ratio.
SPLIT_SEED = 42
train_ds, val_ds = train_test_split(
    train_df,
    test_size = 0.2,
    random_state = SPLIT_SEED,
    stratify = train_df["target"],
)

train_labels = np.array(train_ds.target)
train_features = train_ds.drop(["target"], axis = 1)
val_labels = np.array(val_ds.target)
val_features = val_ds.drop(["target"], axis = 1)
bool_train_labels = train_labels != 0

# NOTE(review): the scaler is instantiated but never fitted or applied in this
# cell, so the features above are unscaled.
scaler = StandardScaler()

In [86]:
# Log of the positive-to-negative row ratio in train_df, used as the constant
# initial bias of the model's output layer.
n_positive = int((train_df.target == 1).sum())
n_negative = int((train_df.target == 0).sum())
initial_bias = np.log(n_positive/n_negative)
output_bias = tf.keras.initializers.Constant(initial_bias)


In [89]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a DataFrame with a 'target' column into a batched tf.data.Dataset.

  Each element is a ``(features, label)`` pair where ``features`` is a dict
  mapping every non-target column name to an array with a trailing axis of
  length 1. The input frame is not modified.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Must contain a 'target' column, which becomes the label.
  shuffle : bool, default True
      Shuffle with a buffer as large as the frame before batching.
  batch_size : int, default 32
      Batch size; also passed to ``prefetch``.
  """
  df = dataframe.copy()
  labels = df.pop('target')
  # Build the features from the copy that no longer contains 'target';
  # iterating the original frame would leak the label into the inputs.
  features = {key: np.array(value)[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [90]:
# Convert both splits to tf.data pipelines; only the training one is shuffled.
# NOTE: this rebinds train_ds / val_ds from DataFrames to datasets, so the cell
# cannot be re-run without first re-running the split cell.
batch_size = 512
train_ds, val_ds = (
    df_to_dataset(train_ds, batch_size=batch_size),
    df_to_dataset(val_ds, shuffle=False, batch_size=batch_size),
)

Perform normalization and onehot encoding on numeric and categorical variables respectively.

Model formulation. The most important layers are the first layer, dropout and final dense layer. The rest are nice-to-have and seem to improve performance of the model.

In [151]:
# Input width follows the feature frame instead of a hard-coded 383, so the
# model keeps matching the data when the feature set changes.
n_features = train_features.shape[1]

# Three wide hidden layers, dropout, and a single sigmoid output whose bias
# starts at the log class ratio held in output_bias.
model = keras.Sequential([
      keras.Input(shape=(n_features,)),
      keras.layers.Dense(2048, activation='elu',kernel_initializer='normal'),
      keras.layers.Dense(2048, activation='sigmoid',kernel_initializer='normal'),
      keras.layers.Dense(1024, activation='sigmoid',kernel_initializer='normal'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(1,kernel_initializer='normal',bias_initializer=output_bias,activation='sigmoid'),
   ])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=["AUC"])
model.summary()

In [152]:
# Render the layer graph left-to-right with tensor shapes.
# The recorded output of this cell shows that graphviz must be installed for it to work.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


In [153]:
# Train for 20 epochs with batch size 128, reporting validation metrics on the
# held-out split after every epoch.
model.fit(
    train_features,
    train_labels,
    batch_size=128,
    epochs=20,
    validation_data=(val_features, val_labels))

Epoch 1/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 85ms/step - AUC: 0.6784 - loss: 0.2126 - val_AUC: 0.7656 - val_loss: 0.1892
Epoch 2/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 81ms/step - AUC: 0.7225 - loss: 0.2002 - val_AUC: 0.7747 - val_loss: 0.1953
Epoch 3/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 84ms/step - AUC: 0.7285 - loss: 0.1934 - val_AUC: 0.7529 - val_loss: 0.2061
Epoch 4/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 81ms/step - AUC: 0.6976 - loss: 0.1963 - val_AUC: 0.7679 - val_loss: 0.1877
Epoch 5/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 78ms/step - AUC: 0.7292 - loss: 0.1967 - val_AUC: 0.7720 - val_loss: 0.1885
Epoch 6/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 82ms/step - AUC: 0.7173 - loss: 0.1953 - val_AUC: 0.7760 - val_loss: 0.1899
Epoch 7/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0

<keras.src.callbacks.history.History at 0x55d1abe50>

In [154]:
# Train for 20 more epochs with batch size 256.
# NOTE(review): `model` is not rebuilt before this call, so training continues
# from the weights left by the previous fit — this is not an independent
# batch-size comparison.
model.fit(
    train_features,
    train_labels,
    batch_size=256,
    epochs=20,
    validation_data=(val_features, val_labels))

Epoch 1/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 120ms/step - AUC: 0.7276 - loss: 0.1934 - val_AUC: 0.7603 - val_loss: 0.1953
Epoch 2/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 113ms/step - AUC: 0.7154 - loss: 0.1947 - val_AUC: 0.7708 - val_loss: 0.1855
Epoch 3/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 110ms/step - AUC: 0.7290 - loss: 0.1959 - val_AUC: 0.7512 - val_loss: 0.1908
Epoch 4/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 109ms/step - AUC: 0.7159 - loss: 0.1943 - val_AUC: 0.7561 - val_loss: 0.1893
Epoch 5/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 131ms/step - AUC: 0.7045 - loss: 0.1976 - val_AUC: 0.7603 - val_loss: 0.1891
Epoch 6/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 119ms/step - AUC: 0.7113 - loss: 0.1979 - val_AUC: 0.7399 - val_loss: 0.1896
Epoch 7/20
[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x55d1fd490>

In [155]:
# Train for 20 more epochs with batch size 512.
# NOTE(review): `model` is not rebuilt before this call, so training continues
# from the weights left by the previous fit.
model.fit(
    train_features,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_data=(val_features, val_labels))

Epoch 1/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1633s[0m 8s/step - AUC: 0.6308 - loss: 0.2029 - val_AUC: 0.6695 - val_loss: 0.1985
Epoch 2/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 198ms/step - AUC: 0.6359 - loss: 0.2018 - val_AUC: 0.6700 - val_loss: 0.1968
Epoch 3/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 187ms/step - AUC: 0.6402 - loss: 0.2032 - val_AUC: 0.6696 - val_loss: 0.1954
Epoch 4/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 189ms/step - AUC: 0.6472 - loss: 0.2003 - val_AUC: 0.6700 - val_loss: 0.1960
Epoch 5/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 190ms/step - AUC: 0.6331 - loss: 0.2037 - val_AUC: 0.6702 - val_loss: 0.1968
Epoch 6/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4356s[0m 21s/step - AUC: 0.6361 - loss: 0.2011 - val_AUC: 0.6782 - val_loss: 0.1960
Epoch 7/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4

<keras.src.callbacks.history.History at 0x58bde9790>

In [156]:
# Train for 20 more epochs with batch size 1024.
# NOTE(review): `model` is not rebuilt before this call, so training continues
# from the weights left by the previous fit.
model.fit(
    train_features,
    train_labels,
    batch_size=1024,
    epochs=20,
    validation_data=(val_features, val_labels))

Epoch 1/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 418ms/step - AUC: 0.6528 - loss: 0.1969 - val_AUC: 0.6806 - val_loss: 0.1928
Epoch 2/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 422ms/step - AUC: 0.6427 - loss: 0.2026 - val_AUC: 0.6797 - val_loss: 0.1940
Epoch 3/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 422ms/step - AUC: 0.6491 - loss: 0.1993 - val_AUC: 0.6796 - val_loss: 0.1945
Epoch 4/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 417ms/step - AUC: 0.6412 - loss: 0.2011 - val_AUC: 0.6801 - val_loss: 0.1923
Epoch 5/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 414ms/step - AUC: 0.6451 - loss: 0.2008 - val_AUC: 0.6805 - val_loss: 0.1922
Epoch 6/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 417ms/step - AUC: 0.6463 - loss: 0.1985 - val_AUC: 0.6813 - val_loss: 0.1929
Epoch 7/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x41222ce10>

In [193]:
# TabNetPretrainer: unsupervised pre-training on the raw (unscaled) features.
pretrainer_settings = dict(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 1e-3},
    mask_type='entmax',  # "sparsemax" is the alternative
)
unsupervised_model_no_preproc = TabNetPretrainer(**pretrainer_settings)

# Fit on the training features, monitoring the validation features, with
# early stopping after 50 epochs without improvement.
pretrain_fit_settings = dict(
    eval_set=[val_features.values],
    max_epochs=1000,
    patience=50,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.8,
)
unsupervised_model_no_preproc.fit(train_features.values, **pretrain_fit_settings)


epoch 0  | loss: 4945.906| val_0_unsup_loss_numpy: 59912.22265625|  0:00:15s
epoch 1  | loss: 2873.67767| val_0_unsup_loss_numpy: 45900.0234375|  0:00:29s
epoch 2  | loss: 1792.92964| val_0_unsup_loss_numpy: 15902.4423828125|  0:00:46s
epoch 3  | loss: 1329.70732| val_0_unsup_loss_numpy: 11145.7001953125|  0:01:01s
epoch 4  | loss: 760.30628| val_0_unsup_loss_numpy: 13718.3095703125|  0:01:15s
epoch 5  | loss: 576.21725| val_0_unsup_loss_numpy: 2647.457275390625|  0:01:30s
epoch 6  | loss: 311.50916| val_0_unsup_loss_numpy: 1614.2958984375|  0:01:44s
epoch 7  | loss: 220.35642| val_0_unsup_loss_numpy: 599.8854370117188|  0:01:59s
epoch 8  | loss: 182.82305| val_0_unsup_loss_numpy: 848.2854614257812|  0:02:15s
epoch 9  | loss: 170.95851| val_0_unsup_loss_numpy: 1011.438232421875|  0:02:31s
epoch 10 | loss: 159.76219| val_0_unsup_loss_numpy: 1056.9515380859375|  0:02:45s
epoch 11 | loss: 160.13637| val_0_unsup_loss_numpy: 493.1054382324219|  0:03:00s
epoch 12 | loss: 158.43062| val_0_uns

In [194]:
# Reconstruct the training features with the pre-trained model and check that
# the reconstruction and the embedded input have the same shape.
reconstructed_X, embedded_X = unsupervised_model_no_preproc.predict(train_features.values)
assert reconstructed_X.shape == embedded_X.shape

# Persist the pretrainer, then reload it from the written archive into a fresh instance.
pretrain_path = './test_pretrain2'
unsupervised_model_no_preproc.save_model(pretrain_path)
loaded_pretrain = TabNetPretrainer()
loaded_pretrain.load_model(f'{pretrain_path}.zip')

Successfully saved model at ./test_pretrain2.zip


In [195]:
# Classifier that will be warm-started from the pre-trained model.
# StepLR settings: step_size=10, gamma=0.9.
step_lr_settings = {"step_size": 10, "gamma": 0.9}
clf2_preproc = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 1e-3},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=step_lr_settings,
    mask_type='entmax',  # This will be overwritten if using pretrain model
)

In [196]:
# Supervised fit initialised from the reloaded pretrainer; metrics are reported
# on both the training and validation splits.
train_pair = (train_features.values, train_labels)
valid_pair = (val_features.values, val_labels)

clf2_preproc.fit(
    train_features.values,
    train_labels,
    eval_set=[train_pair, valid_pair],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy','auc'],
    max_epochs=1000,
    patience=50,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=loaded_pretrain,
)

epoch 0  | loss: 0.70578 | train_accuracy: 0.38254 | train_auc: 0.59549 | valid_accuracy: 0.38442 | valid_auc: 0.60294 |  0:00:12s
epoch 1  | loss: 0.67806 | train_accuracy: 0.59203 | train_auc: 0.65696 | valid_accuracy: 0.58949 | valid_auc: 0.67017 |  0:00:25s
epoch 2  | loss: 0.65512 | train_accuracy: 0.63884 | train_auc: 0.70863 | valid_accuracy: 0.63983 | valid_auc: 0.71901 |  0:00:38s
epoch 3  | loss: 0.62112 | train_accuracy: 0.74997 | train_auc: 0.76651 | valid_accuracy: 0.75006 | valid_auc: 0.76712 |  0:00:51s
epoch 4  | loss: 0.58415 | train_accuracy: 0.73968 | train_auc: 0.78339 | valid_accuracy: 0.74112 | valid_auc: 0.78754 |  0:01:04s
epoch 5  | loss: 0.56419 | train_accuracy: 0.77417 | train_auc: 0.79916 | valid_accuracy: 0.77714 | valid_auc: 0.80432 |  0:01:17s
epoch 6  | loss: 0.549   | train_accuracy: 0.80387 | train_auc: 0.81152 | valid_accuracy: 0.80571 | valid_auc: 0.81791 |  0:01:30s
epoch 7  | loss: 0.5367  | train_accuracy: 0.81127 | train_auc: 0.82423 | valid_acc

In [182]:
# Define the model: TabNet classifier trained from scratch (no pre-training).
# StepLR settings: step_size=10, gamma=0.9.
step_lr_settings = {"step_size": 10, "gamma": 0.9}
clf1_nopreproc = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 1e-3},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=step_lr_settings,
    mask_type='entmax',  # "sparsemax"
)

# Fit the model with batch size 1024 (virtual batch size 256).
train_pair = (train_features.values, train_labels)
valid_pair = (val_features.values, val_labels)

clf1_nopreproc.fit(
    train_features.values,
    train_labels,
    eval_set=[train_pair, valid_pair],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy','auc'],
    max_epochs=1000,
    patience=50,
    batch_size=1024,
    virtual_batch_size=256,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.76007 | train_accuracy: 0.60151 | train_auc: 0.62778 | valid_accuracy: 0.60104 | valid_auc: 0.62084 |  0:00:16s
epoch 1  | loss: 0.70474 | train_accuracy: 0.66476 | train_auc: 0.74454 | valid_accuracy: 0.66448 | valid_auc: 0.75258 |  0:00:31s
epoch 2  | loss: 0.65375 | train_accuracy: 0.73641 | train_auc: 0.77078 | valid_accuracy: 0.73729 | valid_auc: 0.78115 |  0:00:47s
epoch 3  | loss: 0.61356 | train_accuracy: 0.77518 | train_auc: 0.80067 | valid_accuracy: 0.77633 | valid_auc: 0.80815 |  0:01:02s
epoch 4  | loss: 0.58724 | train_accuracy: 0.77423 | train_auc: 0.80556 | valid_accuracy: 0.77463 | valid_auc: 0.81103 |  0:01:17s
epoch 5  | loss: 0.57014 | train_accuracy: 0.77684 | train_auc: 0.81085 | valid_accuracy: 0.77847 | valid_auc: 0.81557 |  0:01:32s
epoch 6  | loss: 0.55908 | train_accuracy: 0.7844  | train_auc: 0.81458 | valid_accuracy: 0.78703 | valid_auc: 0.82131 |  0:01:48s
epoch 7  | loss: 0.55115 | train_accuracy: 0.76726 | train_auc: 0.81784 | valid_acc

In [183]:
# Same TabNet configuration as the previous experiment; only the batch sizes
# passed to fit() below differ. Note that this rebinds `clf1_nopreproc`.
clf1_nopreproc = TabNetClassifier(
    mask_type='entmax',  # alternative: "sparsemax"
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-3},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    # StepLR settings: multiply the learning rate by `gamma` every `step_size` steps.
    scheduler_params={"step_size": 10, "gamma": 0.9},
)

# Fit with small batches (256 rows, virtual batches of 64), reporting accuracy
# and AUC on both the training and the validation split after every epoch.
clf1_nopreproc.fit(
    train_features.values,
    train_labels,
    eval_set=[
        (train_features.values, train_labels),
        (val_features.values, val_labels),
    ],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'auc'],
    max_epochs=1000,
    patience=50,
    batch_size=256,
    virtual_batch_size=64,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.71458 | train_accuracy: 0.78669 | train_auc: 0.77344 | valid_accuracy: 0.7857  | valid_auc: 0.78314 |  0:00:31s
epoch 1  | loss: 0.622   | train_accuracy: 0.80857 | train_auc: 0.79493 | valid_accuracy: 0.80781 | valid_auc: 0.8001  |  0:01:02s
epoch 2  | loss: 0.58627 | train_accuracy: 0.79502 | train_auc: 0.80505 | valid_accuracy: 0.79644 | valid_auc: 0.81088 |  0:01:33s
epoch 3  | loss: 0.56348 | train_accuracy: 0.78316 | train_auc: 0.8157  | valid_accuracy: 0.78519 | valid_auc: 0.82266 |  0:02:04s
epoch 4  | loss: 0.54082 | train_accuracy: 0.79806 | train_auc: 0.82765 | valid_accuracy: 0.79818 | valid_auc: 0.83841 |  0:02:35s
epoch 5  | loss: 0.5268  | train_accuracy: 0.80553 | train_auc: 0.84107 | valid_accuracy: 0.80711 | valid_auc: 0.84757 |  0:03:06s
epoch 6  | loss: 0.50757 | train_accuracy: 0.80606 | train_auc: 0.84692 | valid_accuracy: 0.80655 | valid_auc: 0.85255 |  0:03:37s
epoch 7  | loss: 0.50008 | train_accuracy: 0.7979  | train_auc: 0.85191 | valid_acc

In [185]:
# Self-supervised TabNet pre-training: the pretrainer is fitted on the feature
# matrix only (no labels are passed).
unsupervised_model_no_preproc = TabNetPretrainer(
    mask_type='entmax',  # alternative: "sparsemax"
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-3},
)

# Train on the training features and monitor the loss on the validation features.
unsupervised_model_no_preproc.fit(
    train_features.values,
    eval_set=[val_features.values],
    pretraining_ratio=0.8,
    max_epochs=1000,
    patience=50,
    batch_size=256,
    virtual_batch_size=64,
    num_workers=0,
    drop_last=False,
)

# Reconstruct the training features; both returned arrays must share one shape.
reconstructed_X, embedded_X = unsupervised_model_no_preproc.predict(train_features.values)
assert reconstructed_X.shape == embedded_X.shape

# Round-trip through disk: save the pretrainer, then load the resulting .zip
# archive into a fresh TabNetPretrainer instance.
unsupervised_model_no_preproc.save_model('./test_pretrain3')
loaded_pretrain2 = TabNetPretrainer()
loaded_pretrain2.load_model('./test_pretrain3.zip')

epoch 0  | loss: 577.09662| val_0_unsup_loss_numpy: 12293.2880859375|  0:00:25s
epoch 1  | loss: 211.11926| val_0_unsup_loss_numpy: 3038.433837890625|  0:00:51s
epoch 2  | loss: 163.68941| val_0_unsup_loss_numpy: 2251.504638671875|  0:01:18s
epoch 3  | loss: 158.3954| val_0_unsup_loss_numpy: 12903.640625|  0:01:44s
epoch 4  | loss: 154.70088| val_0_unsup_loss_numpy: 31643.259765625|  0:02:10s
epoch 5  | loss: 149.59659| val_0_unsup_loss_numpy: 4816.8857421875|  0:02:36s
epoch 6  | loss: 146.00617| val_0_unsup_loss_numpy: 926.4320068359375|  0:03:02s
epoch 7  | loss: 141.55473| val_0_unsup_loss_numpy: 1984.1365966796875|  0:03:28s
epoch 8  | loss: 135.93664| val_0_unsup_loss_numpy: 790.2356567382812|  0:03:54s
epoch 9  | loss: 130.86364| val_0_unsup_loss_numpy: 890.3836669921875|  0:04:20s
epoch 10 | loss: 125.08631| val_0_unsup_loss_numpy: 535.3619995117188|  0:04:46s
epoch 11 | loss: 121.87061| val_0_unsup_loss_numpy: 744.951171875|  0:05:12s
epoch 12 | loss: 114.57169| val_0_unsup_lo

In [186]:
# TabNet classifier that is initialised from the reloaded pretrainer
# (`loaded_pretrain2`) via `from_unsupervised` in the fit() call below.
clf2_preproc = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-3},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    # StepLR settings: multiply the learning rate by `gamma` every `step_size` steps.
    scheduler_params={"step_size": 10, "gamma": 0.9},
    mask_type='entmax',  # This will be overwritten if using pretrain model
)

# Same training setup as the small-batch run without pre-training, so the two
# sets of per-epoch metrics can be compared directly.
clf2_preproc.fit(
    train_features.values,
    train_labels,
    eval_set=[
        (train_features.values, train_labels),
        (val_features.values, val_labels),
    ],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'auc'],
    max_epochs=1000,
    patience=50,
    batch_size=256,
    virtual_batch_size=64,
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=loaded_pretrain2,
)

epoch 0  | loss: 0.68342 | train_accuracy: 0.78769 | train_auc: 0.72509 | valid_accuracy: 0.7915  | valid_auc: 0.72798 |  0:00:31s
epoch 1  | loss: 0.58104 | train_accuracy: 0.7695  | train_auc: 0.80835 | valid_accuracy: 0.76747 | valid_auc: 0.81478 |  0:01:01s
epoch 2  | loss: 0.53523 | train_accuracy: 0.80262 | train_auc: 0.83487 | valid_accuracy: 0.80139 | valid_auc: 0.84122 |  0:01:32s
epoch 3  | loss: 0.51123 | train_accuracy: 0.80904 | train_auc: 0.84577 | valid_accuracy: 0.8094  | valid_auc: 0.8528  |  0:02:03s
epoch 4  | loss: 0.49171 | train_accuracy: 0.81206 | train_auc: 0.85607 | valid_accuracy: 0.81405 | valid_auc: 0.86076 |  0:02:33s
epoch 5  | loss: 0.48075 | train_accuracy: 0.80756 | train_auc: 0.86093 | valid_accuracy: 0.81017 | valid_auc: 0.86484 |  0:03:04s
epoch 6  | loss: 0.47243 | train_accuracy: 0.81254 | train_auc: 0.86439 | valid_accuracy: 0.81202 | valid_auc: 0.86894 |  0:03:35s
epoch 7  | loss: 0.46473 | train_accuracy: 0.81199 | train_auc: 0.87048 | valid_acc

In [122]:
def df_to_dataset_testing(dataframe, shuffle=True, batch_size=32):
  """Wrap an unlabeled feature table in a batched ``tf.data.Dataset``.

  Every column is converted to a NumPy array and given a trailing axis of
  length 1, so each feature has shape ``(rows, 1)``. No label column is split
  off, which is why this variant is used for inference data.

  Args:
    dataframe: table of features; anything exposing ``items()`` (yielding
      ``(column_name, values)`` pairs) and ``len()``, e.g. a pandas DataFrame.
    shuffle: when True, shuffle with a buffer as large as ``len(dataframe)``.
    batch_size: number of rows per batch; also reused as the prefetch buffer
      size.

  Returns:
    A ``tf.data.Dataset`` yielding dicts that map column name to batch.
  """
  # np.array() already copies each column, so the input is never mutated and
  # no up-front copy of the whole frame is needed.
  features = {
      name: np.array(column)[:, tf.newaxis]
      for name, column in dataframe.items()
  }
  ds = tf.data.Dataset.from_tensor_slices(features)
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  return ds.batch(batch_size).prefetch(batch_size)

In [76]:
# Sanity check: display the shape of the training label array.
train_labels.shape

(130003,)

In [None]:
# Build the inference dataset from the test frame; shuffling is disabled so the
# batches come out in the frame's row order.
test_dataset = df_to_dataset_testing(
    test_data,
    shuffle=False,
    batch_size=512,
)

In [None]:
# Run inference on the batched test dataset.
# NOTE(review): `model` is not defined in this part of the notebook; presumably
# it is the Keras model built in an earlier cell. Confirm it is still defined
# after Restart Kernel -> Run All.
prediction = model.predict(test_dataset)

In [None]:
# Write the predictions into the submission template, replace any missing
# values with 0, and save the result.
sample_submission["target"] = prediction
# Call fillna on the column itself: routing a Series through the unbound
# pd.DataFrame.fillna method is non-idiomatic and fragile.
sample_submission["target"] = sample_submission["target"].fillna(0)
# NOTE(review): to_csv also writes the index as the first column by default;
# confirm that this matches the expected submission format.
sample_submission.to_csv(f'{OUTPUT_DIR}/result.csv')

In [None]:
# Predict with the TabNet classifier trained above. The classifier is named
# `clf1_nopreproc` (not `clf1_nonproc`), and predict() requires the feature
# matrix as a NumPy array.
# NOTE(review): assumes `test_data` holds the same feature columns, in the same
# order, as `train_features`; confirm before using the result.
prediction = clf1_nopreproc.predict(test_data.values)