In [1]:
"""
Before running the notebook, open two terminal windows in your environment containing Dask, and then run
'dask-scheduler' in one of them, and 'dask-worker <localhost address from the output of dask-scheduler>'
in the other. 
"""

from dask.distributed import Client
# Connect to the scheduler started manually in a terminal (see the note at the top of this cell).
# NOTE(review): 127.0.0.1:8786 is presumably the address printed by `dask-scheduler` — adjust if yours differs.
client = Client('tcp://127.0.0.1:8786')

In [2]:
"""
Code to convert and store our train and test CSV data files into parquet. Cell only needs to run once. This will make
later tasks much more efficient than reading from CSV file.
"""
import dask.dataframe as dd


# train_data = dd.read_csv('./train_data.csv')
# dd.to_parquet(train_data, './Train-Parquet', overwrite=True)

# test_data = dd.read_csv('./test_data.csv')
# dd.to_parquet(test_data, './Test-Parquet', overwrite=True)

In [3]:
"""
Creating a method to prepare our dataframe for training. Note that df is intended to be a Dask dataframe.
"""
import matplotlib.pyplot as plt
import gc
import numpy as np
from scipy import stats
import pandas as pd
import math
import pickle

# global column_sets
# global train_cat_columns_global


def apply_restore_nan(df, column_sets):
    """
    Run ``restore_nan_category`` over every row of ``df``.

    Parameters
    ----------
    df : pandas DataFrame
        Frame whose rows are passed one at a time (``axis=1``).
    column_sets : list of lists of column labels
        Forwarded unchanged to ``restore_nan_category`` for each row.

    Returns
    -------
    Whatever ``DataFrame.apply`` builds from the per-row results.
    """
    return df.apply(
        lambda row: restore_nan_category(row, column_sets=column_sets),
        axis=1,
    )

def restore_nan_category(series, column_sets):
    """
    Mark column groups that are entirely zero as missing.

    For each group of labels in ``column_sets``, if every ``series[label]``
    equals 0 the whole group is overwritten with NaN; groups with at least one
    non-zero entry are left alone.

    Parameters
    ----------
    series : indexable by the labels in ``column_sets`` (a row when used with
        ``DataFrame.apply(axis=1)``); it is modified in place.
    column_sets : iterable of lists of labels.

    Returns
    -------
    The same ``series`` object that was passed in.
    """
    for group in column_sets:
        # Look at every label in the group (no short-circuit) and keep the non-zero ones.
        non_zero = [label for label in group if not series[label] == 0]
        if non_zero:
            continue
        for label in group:
            series[label] = float('NaN')
    return series

def custom_mode(x):
    """
    Most frequent non-missing value of ``x``.

    Missing entries are discarded first. If nothing is left (``x`` is empty or
    all-NaN) NaN is returned; otherwise the first entry of ``Series.mode`` of
    the remaining values is returned.
    """
    present = x[x.notna()]
    if len(present) == 0:
        return float('NaN')
    return pd.Series.mode(present, dropna=False)[0]
    
        
def prepare_data(dfl0, test_time):
    """
    Collapse the per-row input into one feature row per ``customer_ID``.

    Categorical columns are pulled into memory with ``.compute()``, aggregated
    per customer with ``custom_mode`` and ``'last'``, and one-hot encoded with
    ``dd.get_dummies``. All remaining columns except ``'S_2'`` stay in Dask and
    are aggregated with mean/std/min/max/last (``'P_2'`` additionally gets
    ``'size'``, i.e. the number of rows per customer). The numerical aggregate
    is then left-joined with the categorical features on ``customer_ID``.

    Parameters
    ----------
    dfl0 : Dask dataframe
        Must contain ``'customer_ID'`` and the categorical columns listed in
        ``cat_vars`` below.
    test_time : bool
        ``False``: the dummy column index is pickled to
        ``./Train-Cat-Columns/train_cat_columns.p`` (the directory must exist).
        ``True``: that pickle is loaded and the dummy frame is reindexed to the
        training columns (missing columns filled with 0, extra columns dropped)
        so train and test features have the same shape.

    Returns
    -------
    The merged Dask dataframe of numerical aggregates and categorical dummies.
    """
    print('Doing CAT stuff')

    # ---- Categorical variables (handled in pandas) ----
    cat_vars = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    cat_data0 = dfl0[['customer_ID'] + cat_vars].compute()

    # A column counts as categorical when its first five characters match those of
    # a categorical variable. 'S_2' is a datetime and is not used as a feature yet.
    cat_prefixes = [col[0:5] for col in cat_vars]
    num_columns = [
        key for key in dfl0.columns
        if key[0:5] not in cat_prefixes and key != 'S_2'
    ]

    valtype_dict_cat = {key: [custom_mode, 'last'] for key in cat_vars}

    print(cat_data0.columns)
    print(valtype_dict_cat)
    cat_data_grouped = cat_data0.groupby(by='customer_ID').aggregate(valtype_dict_cat)

    cat_data_grouped_dummies = dd.get_dummies(cat_data_grouped, drop_first=False, columns=list(cat_vars))

    # Keep the dummy columns identical between training and test time, see
    # https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data
    cat_columns_path = "./Train-Cat-Columns/train_cat_columns.p"
    if not test_time:
        # Context manager so the pickle is flushed and the handle closed.
        with open(cat_columns_path, "wb") as handle:
            pickle.dump(cat_data_grouped_dummies.columns, handle)
        cat_data_almost_final = cat_data_grouped_dummies
    else:
        # pickle.load can execute arbitrary code: only load a file written by this notebook.
        with open(cat_columns_path, 'rb') as handle:
            train_cat_columns = pickle.load(handle)
        cat_data_almost_final = cat_data_grouped_dummies.reindex(columns=train_cat_columns, fill_value=0)

    # Group the dummy columns by source variable. The groups are built from the
    # columns that actually exist after the reindex above: columns seen only in
    # the test data have been dropped by then, and looking them up row by row
    # would raise KeyError.
    # NOTE(review): the aggregate above uses a list of functions per column, so the
    # dummy column names may not contain '<var>_' literally — verify that these
    # groups are non-empty, otherwise the NaN restoration below is a no-op.
    column_sets = [
        [col for col in cat_data_almost_final.columns if var + '_' in col]
        for var in cat_vars
    ]

    # A customer whose dummies are all zero for a variable gets NaN back for that variable.
    cat_data_final = cat_data_almost_final.apply(restore_nan_category, column_sets=column_sets, axis=1)

    print('DONE WITH ONLY DOING CAT STUFF')

    # ---- Numerical variables (stay in Dask) ----
    dfl1 = dfl0[num_columns]
    valtype_dict_num = {}
    for key in num_columns:
        if key == 'customer_ID':
            continue  # grouping key, not a feature
        if key == 'P_2':
            # size corresponds to number of rows for given customer_ID; we only need to add this once
            valtype_dict_num[key] = ['mean', 'std', 'min', 'max', 'size', 'last']
        else:
            valtype_dict_num[key] = ['mean', 'std', 'min', 'max', 'last']

    # split_out was added because of worker-killed errors; should probably be lowered to ~2
    dfl1_num_columns_grouped = dfl1.groupby(by="customer_ID").aggregate(valtype_dict_num, split_out=10)
    print('finished aggregate on num data')

    # ---- Join numerical and categorical features ----
    final_df = dfl1_num_columns_grouped.merge(cat_data_final, how="left", on=['customer_ID'])
    print('finished merge')

    return final_df

In [4]:
"""
Wrapper function to train the model. Uses https://xgboost.readthedocs.io/en/stable/tutorials/dask.html
"""


import xgboost as xgb
import dask.array as da
import dask.distributed
import pandas as pd
import dask.dataframe as dd


def train_xgboost(train_data_onehot, train_labels, n_boost_rounds, client):
    """
    Train an XGBoost model on the Dask cluster behind ``client``.

    Parameters
    ----------
    train_data_onehot : features passed to ``xgb.dask.DaskDMatrix``.
    train_labels : labels passed to ``xgb.dask.DaskDMatrix``.
    n_boost_rounds : forwarded as ``num_boost_round``.
    client : Dask distributed client used for both the matrix and the training.

    Returns
    -------
    The object returned by ``xgb.dask.train`` (later cells read its
    ``'booster'`` entry). The training set itself is used as the only
    evaluation set, labelled "train".
    """
    booster_params = {"verbosity": 2, "tree_method": "hist", "objective": "reg:logistic"}
    training_matrix = xgb.dask.DaskDMatrix(client, train_data_onehot, train_labels)

    return xgb.dask.train(
        client,
        booster_params,
        training_matrix,
        num_boost_round=n_boost_rounds,
        evals=[(training_matrix, "train")],
    )

  from pandas import MultiIndex, Int64Index


In [5]:
"""
Uncomment here and comment out the 'train_data == dd.read_csv' line below if 
testing with 10,000 line subset of training data.
"""

# train_data = pd.read_csv('./train_data.csv', nrows=10000)
# train_data = dd.from_pandas(train_data, npartitions=1)

"\nUncomment here and comment out the 'train_data == dd.read_csv' line below if \ntesting with 10,000 line subset of training data.\n"

In [6]:
"""
Processing the data and training the model
"""

import pandas as pd
import dask.dataframe as dd

# To read straight from the CSV instead of the parquet copy, uncomment the line below (and comment out
# the read_parquet line). Additionally, experiment with increasing blocksize above 25e6.
# train_data = dd.read_csv('./train_data.csv') #, blocksize=100e6)
train_data = dd.read_parquet('./Train-Parquet')
train_data2 = train_data
# Labels are read with plain pandas (not Dask).
train_labels = pd.read_csv('./train_labels.csv')

# test_time=False: prepare_data also pickles the one-hot column index for reuse at test time.
train_data_prepared = prepare_data(train_data2, False)


# We do the merge below in order to ensure that the partitions of the dask dataframes line up
train_data_onehot_and_labels = dd.merge(train_data_prepared, train_labels, on=['customer_ID'])
# Features are everything except the label and the customer key.
X_train = train_data_onehot_and_labels.drop(columns=['target','customer_ID'])
y_train = train_data_onehot_and_labels['target']
output = train_xgboost(X_train, y_train, 100, client) # third arg is passed on as num_boost_round (number of boosting rounds)


Doing CAT stuff
Index(['customer_ID', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
       'D_126', 'D_63', 'D_64', 'D_66', 'D_68'],
      dtype='object')
{'B_30': [<function custom_mode at 0x16352d3f0>, 'last'], 'B_38': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_114': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_116': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_117': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_120': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_126': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_63': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_64': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_66': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_68': [<function custom_mode at 0x16352d3f0>, 'last']}
DONE WITH ONLY DOING CAT STUFF
finished aggregate on num data
finished merge


  meta = left._meta_nonempty.merge(right._meta_nonempty, **kwargs)


In [7]:
import pickle 

# Persist the training output from the previous cell. The context manager makes sure
# the file is flushed and closed; the bare open() used before never closed its handle.
with open("./Models/mean_std_min_max_last_2.p", "wb") as model_file:
    pickle.dump(output, model_file)

In [8]:
"""
ToDo: Now we start writing code to load the model trained above, and then ultimately output a submission .csv file
containing our predictions over the test dataset.
"""
import dask.dataframe as dd
import pickle
import numpy as np
import pandas as pd


# Load the pickled training output written by the previous cell.
model_path = './Models/mean_std_min_max_last_2.p'
# The context manager closes the handle once loading is done (it was previously left open).
# pickle.load can execute arbitrary code: only load model files produced by this notebook.
with open(model_path, 'rb') as file:
    model_output = pickle.load(file)

# The print(1) ... print(4) calls below are progress markers for this long-running cell.
# test_data = dd.read_csv('./test_data.csv')#, blocksize=100e6)
test_data = dd.read_parquet('./Test-Parquet', split_row_groups=1000)
# customers = pd.read_parquet('./Test-Parquet', columns=['customer_ID']).to_numpy()
print(1)
# test_time=True: prepare_data reindexes the one-hot columns to the set pickled during training.
prepared_test_data = prepare_data(test_data, True)
# Customer IDs are taken from the index of the prepared frame.
customers = prepared_test_data.index
print(2)

# No labels are passed here: this matrix is only used for prediction.
dtest = xgb.dask.DaskDMatrix(client, prepared_test_data)
prediction = xgb.dask.predict(client, model_output['booster'], dtest)
print(3)

# NOTE(review): list(customers) and np.array(prediction) presumably each trigger their own Dask
# computation — confirm that both come back in the same row order before trusting the pairing.
output_df = pd.DataFrame({'customer_ID' : list(customers), 'prediction' : np.array(prediction)})
print(4)
output_df.to_csv('./Outputs/submission4.csv', index=False)

# prepared_test_data1 = prepared_test_data
# prepared_test_data['customer_ID'] = prepared_test_data.index  #.compute()
# test_results = prepared_test_data[['customer_ID']].compute()
# test_results['prediction'] = prediction
# print(test_results.head())

1
Doing CAT stuff
Index(['customer_ID', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
       'D_126', 'D_63', 'D_64', 'D_66', 'D_68'],
      dtype='object')
{'B_30': [<function custom_mode at 0x16352d3f0>, 'last'], 'B_38': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_114': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_116': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_117': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_120': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_126': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_63': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_64': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_66': [<function custom_mode at 0x16352d3f0>, 'last'], 'D_68': [<function custom_mode at 0x16352d3f0>, 'last']}
DONE WITH ONLY DOING CAT STUFF
finished aggregate on num data


  meta = left._meta_nonempty.merge(right._meta_nonempty, **kwargs)


finished merge
2
3
4


In [9]:
# print(np.array(prediction))
# print(customers)
# customers_list = []
# for row in customers:
#     if not row[0] in customers_list:
#         customers_list.append(row[0])
        
# This is merely a stopgap function until I figure out how to really do this correctly
# customers_list = []
# customers_list.append(customers[0][0])
# for i in range(1, len(customers)):
#     if not customers[i] == customers[i-1]:
#         customers_list.append(customers[i][0])
    
# print(customers_list[0:100])
# print(len(customers_list), len(prediction))

# customers = np.array(pd.read_parquet('./Test-Parquet', columns=['customer_ID']).groupby('customer_ID').count().index)
# print(customers)

# customers1 = customers.to_numpy()
# # This is merely a stopgap function until I figure out how to really do this correctly
# customers_list = []
# customers_list.append(customers[0][0])
# for i in range(1, len(customers)):
#     if not customers[i] == customers[i-1]:
#         customers_list.append(customers[i][0])
    
# # print(customers_list[0:100])
# print(len(customers_list), len(prediction))




# output_df = pd.DataFrame({'customer_ID' : customers, 'prediction' : np.array(prediction)})
# # print(4)
# output_df.to_csv('./Outputs/submission1.csv', index=False)
# # # print(prepared_test_data.index.compute())

In [10]:
"""
Sanity check
"""

# import numpy as np

# prediction = np.array(prediction)
# for i in range(10):
#     print(prediction[i])


'\nSanity check\n'

In [11]:
# print(prepared_test_data.columns)

# prepared_test_data1 = prepared_test_data.copy()
# prepared_test_data1['customer_ID'] = prepared_test_data.index  #.compute()
# test_results = prepared_test_data1[['customer_ID']].compute()
# test_results['prediction'] = prediction
# print(test_results.head())
# # print(len(test_data))
# # print(len(test_results))
# # test_results['prediction'] = prediction
# # print(test_results.head())

In [12]:
# initial = [1, 1, 9, 1, 9, 6, 9, 7]
# result = sorted(set(initial), key=initial.index)
# print(result)

In [13]:
# print(train_data[['customer_ID','B_30']].groupby('customer_ID').agg({'B_30':custom_mode}))
# print(prepared_test_data.head(50))
# print(prepared_test_data.columns)
# print(train_data_onehot_and_labels.columns)
# customers =list(prepared_test_data.index)
# print(customers[0:100])

# X_train.head(200).to_csv('./X_train_head')
# train_data.head(2600).to_csv('./train_data_head')
# train_data_prepared.head(200).to_csv('./mean_std_etc')
# print(train_data_prepared.head().index)
# print(train_data_prepared.columns)
# print(train_data_prepared.loc['0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a'][('D_49','last')].compute())

# Number of rows in the raw (un-aggregated) test frame loaded above.
print(len(test_data))

11363762
