# 1. SETTINGS

In [8]:
# libraries
import numpy as np
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.stats
import os
from functions.smooth_stat import smooth_stat

In [2]:
# pandas options
pd.set_option("display.max_columns", None)

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection
import gc
gc.enable()

# 2. DATA PARTITIONING

In [5]:
# import data
#train = pd.read_csv("../data/prepared/train_no_holding.csv")
#test  = pd.read_csv("../data/prepared/test_no_holding.csv")

In [6]:
train.sort_values('Week', inplace=True)

# CROSS-VALIDATION TRAINING

In [6]:
### PARAMETERS

# learner settings
metric   = "auc"
verbose  = 10
stopping = 30
seed = 42
features = ["ratio1", "ratio2", "ratio3", "ratio4", "ratio5", "ratio6", "RatioMean"]
n_folds = 3

# lgb settings
gbm = lgb.LGBMClassifier(n_estimators     = 1000,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = seed)

In [8]:
def _train_model(model, train_x, train_y, val_x, val_y):
    #create features inside the CV
    
    # train lightGBM
    global verbose
    global stopping
    global metric
    model = model.fit(train_x, train_y, 
              eval_set = [(train_x, train_y), (val_x, val_y)], 
              eval_metric = metric, 
              verbose = verbose, 
              early_stopping_rounds = stopping)
    
    # save number of iterations
    num_iters = gbm.best_iteration_
    best_auc = gbm.best_score_
    return (model, num_iters, best_auc)  

In [9]:
def check(dataset, fold_count):
    fold_size = len(dataset) // fold_count
    for fold_id in range(0, fold_count - 2):
                fold_start = fold_size * fold_id
                fold_end = fold_start + fold_size
                print(f'stat {fold_start}  {fold_end}')
                train_start = fold_end
                train_end = train_start + fold_size
                print(f'train {train_start}  {train_end}')
                val_start = train_end
                val_end = val_start + fold_size
            
                if fold_id == fold_count - 2:
                    val_end = len(dataset)
                print(f'val {val_start}  {val_end}')

In [10]:
check(train, 5)

stat 0  22258776
train 22258776  44517552
val 44517552  66776328
stat 22258776  44517552
train 44517552  66776328
val 66776328  89035104
stat 44517552  66776328
train 66776328  89035104
val 89035104  111293880


In [12]:
def train_fold(fold_count):
    ''' Please, use Xs and y_s sorted by date,
    otherwise it can overfitting by looking in the future'''
    import pandas as pd
    fold_size = 111293880 // fold_count
    models = {}
    epochs = {}
    aucs = {}
    feat = {}
    for fold_id in range(0, fold_count - 2):
            fold_start = fold_size * fold_id
            fold_end = fold_start + fold_size
            train_start = fold_end
            train_end = train_start + fold_size
            val_start = train_end
            val_end = val_start + fold_size
            
            if fold_id == fold_count - 2:
                val_end = 111293880
            dataset = pd.read_csv("../data/prepared/train_no_holding.csv")   
            dataset.sort_values('Week', inplace=True)
            stats_data = dataset.iloc[fold_start:fold_end,:]
            train_data = dataset.iloc[train_start:train_end,:]
            val_data = dataset.iloc[val_start:val_end,:]
            print(train_data['Week'].max())
            del dataset
            
            # FEATURE CREATION GOES HERE
            # Example (in general can be imporved)

            # compute historical target ratio
            from functions.smooth_stat import smooth_stat
            
            print('Feature creation')
            
            # "CustomerIdx"
            target_feature='CustomerInterest'
            # compute target ratio (last 30 weeks)
            cust_int0 = stats_data[stats_data["Week"] >= (stats_data["Week"].max()-30)]
            cust_int0 = cust_int0[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
            cust_int0 = cust_int0.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
            cust_int0.columns = ["CustomerIdx", "IsinIdx", "ratio0"]
                        
            # compute target ratio (last 16 weeks)
            cust_int1 = stats_data[stats_data["Week"] >= (stats_data["Week"].max()-16)]
            cust_int1 = cust_int1[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
            cust_int1 = cust_int1.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
            cust_int1.columns = ["CustomerIdx", "IsinIdx", "ratio1"]
            
            # compute target ratio (last 8 weeks)
            cust_int2 = stats_data[stats_data["Week"] >= (stats_data["Week"].max()-8)]
            cust_int2 = cust_int2[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
            cust_int2 = cust_int2.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
            cust_int2.columns = ["CustomerIdx", "IsinIdx", "ratio2"]
            
            # compute target ratio (last 4 weeks)
            cust_int3 = stats_data[stats_data["Week"] >= (stats_data["Week"].max()-4)]
            cust_int3 = cust_int3[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
            cust_int3 = cust_int3.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
            cust_int3.columns = ["CustomerIdx", "IsinIdx", "ratio3"]
            
            # compute target ratio (last 1 week1)
            cust_int4 = stats_data[stats_data["Week"] >= (stats_data["Week"].max())]
            cust_int4 = cust_int4[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
            cust_int4 = cust_int4.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
            cust_int4.columns = ["CustomerIdx", "IsinIdx", "ratio4"]
            
            # compute customer target ratio (last 30 weeks)
            cust_int5 = stats_data[stats_data["Week"] >= (110-30)]
            cust_int5 = cust_int5[["CustomerIdx", "CustomerInterest"]]
            cust_int5 = cust_int5.groupby(["CustomerIdx"], as_index = False).mean()
            cust_int5.columns = ["CustomerIdx", "ratio5"]
            
            # compute bond target ratio (last 30 weeks)
            cust_int6 = stats_data[stats_data["Week"] >= (110-30)]
            cust_int6 = cust_int6[["IsinIdx", "CustomerInterest"]]
            cust_int6 = cust_int6.groupby(["IsinIdx"], as_index = False).mean()
            cust_int6.columns = ["IsinIdx", "ratio6"]
            
            # merge and average all ratios
            cust_int = cust_int0.merge(cust_int1, how = "left", on = ["CustomerIdx", "IsinIdx"])
            cust_int = cust_int.merge(cust_int2,  how = "left", on = ["CustomerIdx", "IsinIdx"])
            cust_int = cust_int.merge(cust_int3,  how = "left", on = ["CustomerIdx", "IsinIdx"])
            cust_int = cust_int.merge(cust_int4,  how = "left", on = ["CustomerIdx", "IsinIdx"])
            cust_int = cust_int.merge(cust_int5,  how = "left", on = ["CustomerIdx"])
            cust_int = cust_int.merge(cust_int6,  how = "left", on = ["IsinIdx"])
            cust_int["RatioMean"] = (cust_int["ratio0"] + cust_int["ratio1"] + cust_int["ratio2"] + 
                                     cust_int["ratio3"] + cust_int["ratio4"] + cust_int["ratio5"] +
                                     cust_int["ratio6"]) / 7
            
            del stats_data
            # merge features
            train_data = train_data.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx"])
            val_data = val_data.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx"])
            
            train_data.fillna(0, inplace=True)
            val_data.fillna(0, inplace=True)
            
            train_x = train_data.drop('CustomerInterest', axis=1)
            train_y = train_data['CustomerInterest']
            val_x = val_data.drop('CustomerInterest', axis=1)
            val_y = val_data['CustomerInterest']
            train_y = train_y.astype('int')
            val_y = val_y.astype('int')
        
            ##### END OF EXAMPLE
            global features
            # TRAINING
            from sklearn.linear_model import LogisticRegression
            from sklearn.model_selection import cross_val_score
            from sklearn.metrics import roc_auc_score
            classifier = LogisticRegression(C=3)
            print('Training')
            classifier.fit(train_x[features], train_y)
            del train_data
            print('Validation')
            probs = classifier.predict_proba(val_x[features])[:,1]
            auc = roc_auc_score(val_y, probs)
            
        
            print(f'_______________________ \n {fold_id} {auc} \n ____________________')
            
            print('Saving_features')
            cust_int.to_csv(f'../submissions/cv_validations/cust_int_{fold_id}.csv', index=False)
            
            print('Saving model')
            filename = f'../submissions/cv_validations/log_reg_{fold_id}.sav'
            pickle.dump(classifier, open(filename, 'wb'))
            
                   
   # return (models, epochs, aucs, feat)

In [7]:
def auc_mean(aucs):
    train_auc = []
    valid0_auc = []
    test_auc = []
    for fold in aucs:
        train_auc.append(aucs[fold]['training']['auc'])  
        test_auc.append(aucs[fold]['valid_1']['auc'])   
    mean_train = np.asarray(train_auc).mean()
    mean_test = np.asarray(test_auc).mean()
    return({'train_auc':mean_train, 'cv_auc':mean_test})

In [13]:
train_fold(5)

48.0
Feature creation
Training
Validation
_______________________ 
 0 0.5094741893368767 
 ____________________
Saving_features
Saving model
72.0
Feature creation
Training
Validation
_______________________ 
 1 0.5095178333947881 
 ____________________
Saving_features
Saving model
96.0
Feature creation
Training
Validation
_______________________ 
 2 0.5033542099421813 
 ____________________
Saving_features
Saving model


In [None]:
for fold_id in range(n_folds - 2):
    test_t  = pd.read_csv("../data/prepared/test_no_holding.csv")
    cust_int = pd.read_csv(f'../submissions/cv_validations/cust_int_{fold_id}.csv')
    test_t = test_t.merge(cust_int, how = "left", on = "CustomerIdx")
    test_t.fillna(0, inplace=True)
    print('Predicting')
    with open(f'../submissions/cv_validations/log_reg_{fold_id}.sav', 'rb') as pickle_file:
        classifier = pickle.load(pickle_file)
    test_t["CustomerInterest"] = classifier.predict_proba(test_t[features])[:, 1]
    test_t[test_t[features].isnull()]["CustomerInterest"] = 0
    subm = test_t[["PredictionIdx", "CustomerInterest"]]
    subm.to_csv(f"{validation_path}/submission_logit_cv{fold_id}.csv", index = False)

Predicting


In [13]:
auc_mean(aucs)

{'cv_auc': 0.86230693406824099, 'train_auc': 0.91670941121186211}

In [14]:
# Prediction
validation_path = "../submissions/cv_validations/"

if not os.path.exists(validation_path):
    os.makedirs(validation_path)

# predict
for i, model in enumerate(models):   
    test_t = test.merge(features_cv[i], how = "left", on = "CustomerIdx")
    test_t["CustomerInterest"] = models[model].predict_proba(test_t[features], num_iteration = epochs[model])[:, 1]
    # smart impute 
    test_t[test_t[features].isnull()]["CustomerInterest"] = 0
    # export CSV
    subm = test_t[["PredictionIdx", "CustomerInterest"]]
    subm.to_csv(f"{validation_path}/submission{i}.csv", index = False)

In [15]:
final_sub = pd.read_csv(f"{validation_path}/submission0.csv")
for sub in range(n_folds):
    if(sub==0):
        continue
    else:
        final_sub['CustomerInterest'] += pd.read_csv(f"{validation_path}/submission{sub}.csv")["CustomerInterest"]
final_sub['CustomerInterest'] /= n_folds

In [16]:
final_sub.to_csv("../submissions/cv_light_gbm_edited_the_cv.csv", index=False, float_format = "%.8f")