In [1]:
"""
Before running the notebook, open two terminal windows in your environment containing Dask, and then run
'dask-scheduler' in one of them, and 'dask-worker <localhost address from the output of dask-scheduler>'
in the other. 
"""

from dask.distributed import Client
# Address printed by `dask-scheduler` when it starts (default local port 8786).
scheduler_address = 'tcp://127.0.0.1:8786'
client = Client(scheduler_address)

In [2]:
"""
Converts the train and test CSV data files to Parquet. This cell only needs to run once; uncomment the lines below to run it.
Reading Parquet should make the later tasks much more efficient than re-parsing the CSV files.
"""
import dask.dataframe as dd


# train_data = dd.read_csv('./train_data.csv')
# dd.to_parquet(train_data, './Train-Parquet', overwrite=True)

# test_data = dd.read_csv('./test_data.csv')
# dd.to_parquet(test_data, './Test-Parquet', overwrite=True)

In [3]:
"""
Helper functions that prepare our dataframe for training. The dataframe passed to prepare_data may be either a pandas or a Dask dataframe.
"""
import matplotlib.pyplot as plt
import gc
import numpy as np
from scipy import stats
import pandas as pd
import math
import pickle


def apply_restore_nan(df, column_sets):
    """
    Run restore_nan_category over every row of df.

    df: dataframe holding the one-hot encoded columns.
    column_sets: list of lists of column names, one inner list per original categorical variable.
    Returns the result of the row-wise apply.
    """
    restored = df.apply(restore_nan_category, axis=1, column_sets=column_sets)
    return restored

def restore_nan_category(series, column_sets):
    """
    Turn all-zero groups of one-hot columns back into NaN for a single row.

    series: one row, indexable by column name; it is modified in place and returned.
    column_sets: list of lists of column names. If every column of an inner list
        equals 0 in this row, all of those columns are set to NaN.
    """
    for dummy_cols in column_sets:
        # Evaluate every comparison (no short-circuit) so each column is looked up.
        zero_flags = [series[name] == 0 for name in dummy_cols]
        if all(zero_flags):
            for name in dummy_cols:
                series[name] = float('NaN')
    return series

# mode function written to obtain the most common value of a cat variable on each group.
def custom_mode(x):
    """
    Return the most frequent non-missing value of a pandas Series.

    Missing values are ignored. If the Series is empty or contains only missing
    values, NaN is returned. When several values tie, the first entry of
    pandas' mode result is returned.
    """
    observed = x[x.notna()]
    if len(observed) == 0:
        return float('NaN')
    return observed.mode(dropna=False)[0]
    
        
def prepare_data(dfl0, test_time):
    """
    Aggregate the per-statement rows of dfl0 into one feature row per customer_ID.

    Categorical columns are reduced per customer with custom_mode and 'last', then
    one-hot encoded. Every other column except 'S_2' is reduced with
    mean/std/min/max/last; 'P_2' additionally gets 'size' (the number of rows for the
    customer, which only needs to be added once). The numerical result is left-joined
    with the categorical result on customer_ID.

    Parameters
    ----------
    dfl0 : dataframe
        Must contain 'customer_ID', 'S_2' and the categorical columns listed in
        cat_vars. The notebook calls this with both pandas and Dask dataframes.
    test_time : bool
        False: the one-hot column index is pickled to
        ./Train-Cat-Columns/train_cat_columns.p.
        True: that index is loaded and the one-hot frame is reindexed to it
        (missing columns filled with 0) so it has the same columns as at training time.

    Returns
    -------
    The merged per-customer feature dataframe.

    Raises
    ------
    ValueError
        If dfl0 has no 'S_2' column (raised by list.remove).
    """
    print('Doing CAT stuff')

    # --- Categorical variables ---
    cat_vars = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    cat_data0 = dfl0[['customer_ID'] + cat_vars]

    # A column counts as numerical when its first five characters match none of the
    # categorical names (prefix test kept exactly as before; hoisted out of the loop).
    cat_prefixes = [col[0:5] for col in cat_vars]
    num_columns = [key for key in list(dfl0.columns) if key[0:5] not in cat_prefixes]
    num_columns.remove('S_2')  # Datetime; will have to remove this line once model becomes more sophisticated

    valtype_dict_cat = {key: [custom_mode, 'last'] for key in cat_vars}

    print(cat_data0.columns)
    print(valtype_dict_cat)
    cat_data_grouped = cat_data0.groupby(by='customer_ID').aggregate(valtype_dict_cat)

    cat_data_grouped_dummies = dd.get_dummies(cat_data_grouped, drop_first=False, columns=cat_vars)

    # To ensure that the one-hot data has the desired shape when we use this function at test time.
    # https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data
    cat_columns_path = "./Train-Cat-Columns/train_cat_columns.p"
    if not test_time:
        # Save the one-hot columns from the training set so the test data can be reshaped later.
        # The context manager makes sure the pickle is flushed and the handle closed.
        with open(cat_columns_path, "wb") as handle:
            pickle.dump(cat_data_grouped_dummies.columns, handle)
        cat_data_almost_final = cat_data_grouped_dummies
    else:
        # Reshape the one-hot encoded test data to match the training columns.
        # NOTE: pickle.load can execute arbitrary code; only load a file written by this function.
        with open(cat_columns_path, 'rb') as handle:
            train_cat_columns = pickle.load(handle)
        cat_data_almost_final = cat_data_grouped_dummies.reindex(columns=train_cat_columns, fill_value=0)

    # Group the dummy columns by the categorical variable they came from. This is done on
    # the frame that is actually passed to restore_nan_category (i.e. after the test-time
    # reindex) so every listed column is guaranteed to exist in it.
    column_sets = [
        [col for col in cat_data_almost_final.columns if (var + '_') in col]
        for var in cat_vars
    ]

    cat_data_final = cat_data_almost_final.apply(restore_nan_category, column_sets=column_sets, axis=1)

    print('DONE WITH ONLY DOING CAT STUFF')

    # --- Numerical variables ---
    dfl1 = dfl0[num_columns]
    valtype_dict_num = {}
    for key in dfl1.columns:
        if key == 'customer_ID' or key == 'S_2':
            continue
        if key == 'P_2':
            # 'size' represents number of rows for given customer_ID; we only need to add this once
            valtype_dict_num[key] = ['mean', 'std', 'min', 'max', 'size', 'last']
        else:
            valtype_dict_num[key] = ['mean', 'std', 'min', 'max', 'last']

    # Added the split_out=5 argument because I was getting worker killed errors from running out of RAM
    dfl1_num_columns_grouped = dfl1.groupby(by="customer_ID").aggregate(valtype_dict_num, split_out=5)
    print('finished aggregate on num data')

    # --- Join the numerical and categorical variables back together ---
    final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])
    print('finished merge')

    return final_df

In [4]:
"""
Wrapper function to train the model. Uses https://xgboost.readthedocs.io/en/stable/tutorials/dask.html
"""


import xgboost as xgb
import dask.array as da
import dask.distributed
import pandas as pd
import dask.dataframe as dd


def train_xgboost(train_data_onehot, train_labels, n_boost_rounds, client, validate, X_valid=None, y_valid=None):
    """
    Train a binary-logistic XGBoost model with xgb.train and return its result.

    Parameters
    ----------
    train_data_onehot, train_labels : training features and targets, passed to xgb.DMatrix.
    n_boost_rounds : number of boosting rounds (num_boost_round).
    client : Dask client. Not used by the current in-memory xgb.train path; kept so
        existing callers (and a switch back to xgb.dask.DaskDMatrix / xgb.dask.train,
        which take the client as first argument) keep working.
    validate : if True, X_valid / y_valid are evaluated each round and early stopping
        with a patience of 1000 rounds is enabled; if False only the training set is
        evaluated and early stopping is off.
    X_valid, y_valid : validation features and targets; required when validate is True.

    Returns
    -------
    Whatever xgb.train returns for the trained model.

    Raises
    ------
    ValueError
        If validate is True but X_valid or y_valid is missing.
    """
    # Fail early with a clear message instead of an obscure error from inside xgboost.
    if validate and (X_valid is None or y_valid is None):
        raise ValueError("validate=True requires both X_valid and y_valid")

    dtrain = xgb.DMatrix(train_data_onehot, train_labels)

    evals = [(dtrain, "train")]
    early_stopping_rounds = None
    if validate:
        dvalid = xgb.DMatrix(X_valid, y_valid)
        evals.append((dvalid, 'valid'))
        early_stopping_rounds = 1000

    # Using Chris Deotte's model parameters from here: https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
    params = {
        'max_depth': 4,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'eval_metric': 'logloss',
        'objective': 'binary:logistic',
    }

    output = xgb.train(
        params,
        dtrain,
        num_boost_round=n_boost_rounds,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
    )
    return output

  from pandas import MultiIndex, Int64Index


In [12]:
"""
Uncomment the lines below and comment out the 'train_data = ...' loading line in the training cell if
testing with a 10,000-row subset of the training data.
"""

# train_data = pd.read_csv('./train_data.csv', nrows=10000)
# train_data = dd.from_pandas(train_data, npartitions=1)

In [5]:
"""
Processing the data and training the model
"""

import pandas as pd
import dask.dataframe as dd
from sklearn.model_selection import train_test_split

# Uncomment below if using the full training dataset. Additionally, experiment with increasing blocksize above 25e6.
# train_data = dd.read_csv('./train_data.csv') #, blocksize=100e6)
# train_data = dd.read_parquet('./Train-Parquet')

# Using Raddar's lightweight version of the train and test datasets
# https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format?select=test.parquet
train_data = pd.read_parquet('./Train-Parquet-Lightweight')

train_data2 = train_data
train_labels = pd.read_csv('./train_labels.csv')

train_data_prepared = prepare_data(train_data2, False)

# We do the merge below in order to ensure that the partitions of the dask dataframes line up
train_data_onehot_and_labels = dd.merge(train_data_prepared, train_labels, on=['customer_ID'])

# Features / target for the whole training set (used when training without validation).
X_full_train = train_data_onehot_and_labels.drop(columns=['target','customer_ID'])
y_full_train = train_data_onehot_and_labels['target']

# Split into train and validation sets. The seed is fixed so that the split, and
# therefore the reported validation log-loss, is the same on every run.
SPLIT_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full_train, y_full_train, test_size=0.20, random_state=SPLIT_SEED)

# Use the first line below when validating; use the second line when training on the full training dataset
output = train_xgboost(X_train, y_train, 5000, client, True, X_valid, y_valid)
# output = train_xgboost(X_full_train, y_full_train, 1947, client, validate=False)


Doing CAT stuff
Index(['customer_ID', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
       'D_126', 'D_63', 'D_64', 'D_66', 'D_68'],
      dtype='object')
{'B_30': [<function custom_mode at 0x160eb8e50>, 'last'], 'B_38': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_114': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_116': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_117': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_120': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_126': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_63': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_64': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_66': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_68': [<function custom_mode at 0x160eb8e50>, 'last']}
DONE WITH ONLY DOING CAT STUFF
finished aggregate on num data


  final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])
  final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])


finished merge


  return pd.merge(
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	train-logloss:0.66265	valid-logloss:0.66266
[1]	train-logloss:0.63504	valid-logloss:0.63530
[2]	train-logloss:0.60904	valid-logloss:0.60950
[3]	train-logloss:0.58615	valid-logloss:0.58667
[4]	train-logloss:0.56482	valid-logloss:0.56553
[5]	train-logloss:0.54544	valid-logloss:0.54618
[6]	train-logloss:0.52669	valid-logloss:0.52768
[7]	train-logloss:0.50983	valid-logloss:0.51089
[8]	train-logloss:0.49402	valid-logloss:0.49495
[9]	train-logloss:0.47916	valid-logloss:0.48022
[10]	train-logloss:0.46573	valid-logloss:0.46689
[11]	train-logloss:0.45307	valid-logloss:0.45436
[12]	train-logloss:0.44110	valid-logloss:0.44247
[13]	train-logloss:0.42999	valid-logloss:0.43143
[14]	train-logloss:0.41975	valid-logloss:0.42139
[15]	train-logloss:0.40999	valid-logloss:0.41170
[16]	train-logloss:0.40092	valid-logloss:0.40265
[17]	train-logloss:0.39261	valid-logloss:0.39439
[18]	train-logloss:0.38451	valid-logloss:0.38641
[19]	train-logloss:0.37695	valid-logloss:0.37897
[20]	train-logloss:0.37005	val

KeyboardInterrupt: 

In [7]:
# print(output)

In [8]:
import pickle 

# Persist the trained model. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("./Models/mean_std_min_max_last_dart_4.p", "wb") as model_file:
    pickle.dump(output, model_file)

In [9]:
"""
ToDo: Now we start writing code to load the model trained above, and then ultimately output a submission .csv file
containing our predictions over the test dataset.
"""
import dask.dataframe as dd
import pickle
import numpy as np
import pandas as pd


model_path = './Models/mean_std_min_max_last_dart_4.p'
# NOTE: pickle.load can execute arbitrary code; only load model files written by this notebook.
with open(model_path, 'rb') as file:
    model_output = pickle.load(file)

# xgb.dask.train returns a dict with a "booster" entry, while xgb.train (what
# train_xgboost currently calls) returns the Booster itself. Accept either.
booster = model_output["booster"] if isinstance(model_output, dict) else model_output
print(booster.best_iteration)
# best_iteration is a 0-based round index and the slice end is exclusive,
# so +1 is needed to keep the best round itself.
best_model = booster[: booster.best_iteration + 1]

# test_data = dd.read_csv('./test_data.csv')#, blocksize=100e6)
test_data = dd.read_parquet('./Test-Parquet')#, split_row_groups=1000)
# customers = pd.read_parquet('./Test-Parquet', columns=['customer_ID']).to_numpy()
print(1)
prepared_test_data = prepare_data(test_data, True)
# After the groupby in prepare_data the customer IDs are carried by the index.
customers = prepared_test_data.index
print(2)

dtest = xgb.dask.DaskDMatrix(client, prepared_test_data)
prediction = xgb.dask.predict(client, best_model, dtest)
print(3)

output_df = pd.DataFrame({'customer_ID' : list(customers), 'prediction' : np.array(prediction)})
print(4)
output_df.to_csv('./Outputs/submission8.csv', index=False)


1946
1
Doing CAT stuff
Index(['customer_ID', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
       'D_126', 'D_63', 'D_64', 'D_66', 'D_68'],
      dtype='object')
{'B_30': [<function custom_mode at 0x15aa853f0>, 'last'], 'B_38': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_114': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_116': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_117': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_120': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_126': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_63': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_64': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_66': [<function custom_mode at 0x15aa853f0>, 'last'], 'D_68': [<function custom_mode at 0x15aa853f0>, 'last']}
DONE WITH ONLY DOING CAT STUFF
finished aggregate on num data


  meta = left._meta_nonempty.merge(right._meta_nonempty, **kwargs)


finished merge
2
3
4


In [None]:
"""
Hyperparameter tuning. Code adapted from:
https://aiinpractice.com/xgboost-hyperparameter-tuning-with-bayesian-optimization/
"""
import numpy as np
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

# Search space for the Bayesian optimiser: (lower, upper) bound per hyperparameter.
# (The previous module-level `global train_final` statement was removed: `global`
# has no effect outside a function body.)
pbounds = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (3,10),
    'subsample': (1.0, 1.0),  # Change for big datasets
    'colsample': (1.0, 1.0),  # Change for datasets with lots of features
    'gamma': (0, 5)}

def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample,
                        gamma):
    """
    Objective for the Bayesian optimiser: mean 5-fold ROC-AUC of an XGBClassifier.

    The optimiser proposes floats for every parameter, so max_depth and
    n_estimators are truncated to int. subsample and colsample are forwarded to the
    classifier (colsample as colsample_bytree); previously they were accepted but
    silently ignored, so widening their bounds in pbounds had no effect.

    Reads X_train and y_train from the notebook's global namespace (they are created
    by the training cell), so that cell must have been run first.

    Returns the mean cross-validated ROC-AUC, which the optimiser maximises.
    """
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        colsample_bytree=colsample,
        gamma=gamma)
    return np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

# Rebuild the aggregated, labelled training frame for this cell.
train_data = pd.read_parquet('./Train-Parquet-Lightweight')
train_labels = pd.read_csv('./train_labels.csv')
train_data2 = train_data
train_data_prepared = prepare_data(train_data2, test_time=False)
train_final = dd.merge(train_data_prepared, train_labels, on=['customer_ID'])

# Bayesian search over pbounds: 2 random initial points followed by 3 guided iterations.
optimizer = BayesianOptimization(f=xgboost_hyper_param, pbounds=pbounds, random_state=1)
optimizer.maximize(init_points=2, n_iter=3)

Doing CAT stuff
Index(['customer_ID', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
       'D_126', 'D_63', 'D_64', 'D_66', 'D_68'],
      dtype='object')
{'B_30': [<function custom_mode at 0x160eb8e50>, 'last'], 'B_38': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_114': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_116': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_117': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_120': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_126': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_63': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_64': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_66': [<function custom_mode at 0x160eb8e50>, 'last'], 'D_68': [<function custom_mode at 0x160eb8e50>, 'last']}
DONE WITH ONLY DOING CAT STUFF
finished aggregate on num data


  final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])
  final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])


finished merge


  return pd.merge(


|   iter    |  target   | colsample |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


