# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.stats
import os
from functions.smooth_stat import smooth_stat

In [2]:
# pandas options
pd.set_option("display.max_columns", None)

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection
import gc
gc.enable()

# 2. DATA PARTITIONING

In [5]:
# import data
train = pd.read_csv("../data/raw/Trade.csv")
test  = pd.read_csv("../data/raw/Challenge_20180423.csv")

In [6]:
# create target variable
train["CustomerInterest"] = 1
train["CustomerInterest"][train["TradeStatus"] == "Holding"] = 0

In [7]:
# partitioning
d_train = train[(train["TradeDateKey"] >=  20170323) & (train["TradeDateKey"] < 20180323)]
d_valid = train[(train["TradeDateKey"] >=  20180323)]

# CROSS-VALIDATION TRAINING

In [8]:
### PARAMETERS

# learner settings
metric   = "auc"
verbose  = 10
stopping = 30
seed = 42
features = ['cust_int_mean0', 'cust_int_min0', 'cust_int_std0']
n_folds = 5

# lgb settings
gbm = lgb.LGBMClassifier(n_estimators     = 1000,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = seed)

In [9]:
def _train_model(model, train_x, train_y, val_x, val_y):
    #create features inside the CV
    
    # train lightGBM
    global verbose
    global stopping
    global metric
    model = model.fit(train_x, train_y, 
              eval_set = [(train_x, train_y), (val_x, val_y)], 
              eval_metric = metric, 
              verbose = verbose, 
              early_stopping_rounds = stopping)
    
    # save number of iterations
    num_iters = gbm.best_iteration_
    best_auc = gbm.best_score_
    return (model, num_iters, best_auc)  

In [10]:
def train_fold(dataset, validation_set, fold_count, model):
    ''' Please, use Xs and y_s sorted by date,
    otherwise it can overfitting by looking in the future'''
    fold_size = len(dataset) // fold_count
    models = {}
    epochs = {}
    aucs = {}
    feat = {}
    for fold_id in range(0, fold_count):
            fold_start = fold_size * fold_id
            fold_end = fold_start + fold_size
            if fold_id == fold_size - 1:
                fold_end = len(dataset)
                
            train_data = pd.concat([dataset.iloc[:fold_start,:], dataset.iloc[fold_end:,:]])
            val_data = validation_set
            #val_data = dataset.iloc[fold_start:fold_end,:]
           # train_x = np.concatenate([X[:fold_start], X[fold_end:]])
           # train_y = np.concatenate([y[:fold_start], y[fold_end:]])
           # val_x = X[fold_start:fold_end]
           # val_y = y[fold_start:fold_end]

            # FEATURE CREATION GOES HERE
            # Example (in general can be imporved)

            # compute historical target ratio
            from functions.smooth_stat import smooth_stat
            
            # "CustomerIdx"
            target_feature='CustomerInterest'
            cust_int_mean0 = smooth_stat(train_data, ["CustomerIdx"], type_of_stat='mean',
                                        target_feature='CustomerInterest', alpha=10)
            cust_int_mean0.rename(columns={f'{target_feature}': 'cust_int_mean0'}, inplace=True)
            
            cust_int_min0 = smooth_stat(train_data, ["CustomerIdx"], type_of_stat='min',
                                        target_feature='CustomerInterest', alpha=10)
            cust_int_min0.rename(columns={f'{target_feature}': 'cust_int_min0'}, inplace=True)
            
            cust_int_max0 = smooth_stat(train_data, ["CustomerIdx"], type_of_stat='max',
                                        target_feature='CustomerInterest', alpha=10)
            cust_int_max0.rename(columns={f'{target_feature}': 'cust_int_max0'}, inplace=True)
            
            cust_int_std0 = smooth_stat(train_data, ["CustomerIdx"], type_of_stat='std',
                                        target_feature='CustomerInterest', alpha=10)
            cust_int_std0.rename(columns={f'{target_feature}': 'cust_int_std0'}, inplace=True)
            
            cust_int0 = cust_int_mean0.copy()
            cust_int0 = cust_int0.merge(cust_int_min0, how = "left", on = "CustomerIdx")
            cust_int0 = cust_int0.merge(cust_int_max0,   how = "left", on = "CustomerIdx")
            cust_int0 = cust_int0.merge(cust_int_std0,   how = "left", on = "CustomerIdx")
            
            
            # merge features
            train_data = train_data.merge(cust_int0, how = "left", on = "CustomerIdx")
            val_data = val_data.merge(cust_int0, how = "left", on = "CustomerIdx")
            
            train_data.fillna(0, inplace=True)
            val_data.fillna(0, inplace=True)
            
            train_x = train_data.drop('CustomerInterest', axis=1)
            train_y = train_data['CustomerInterest']
            val_x = val_data.drop('CustomerInterest', axis=1)
            val_y = val_data['CustomerInterest']
            
        
            ##### END OF EXAMPLE
            global features
            # TRAINING
            fold_model, fold_iter, fold_auc = _train_model(model, train_x[features], train_y, val_x[features], val_y)
            
            epochs[fold_id] = fold_iter    
            aucs[fold_id] = fold_auc
            feat[fold_id] = cust_int0
            models[fold_id] = fold_model
            
            print(f'_______________________ \n {fold_id} {fold_auc} \n ____________________')
            
    return (models, epochs, aucs, feat)

In [11]:
def auc_mean(aucs):
    train_auc = []
    valid0_auc = []
    test_auc = []
    for fold in aucs:
        train_auc.append(aucs[fold]['training']['auc'])  
        test_auc.append(aucs[fold]['valid_1']['auc'])   
    mean_train = np.asarray(train_auc).mean()
    mean_test = np.asarray(test_auc).mean()
    return({'train_auc':mean_train, 'cv_auc':mean_test})

In [12]:
# Prediction
#X = d_train.sort_values(by=['TradeDateKey'])[['NotionalEUR']]
#y = d_train.sort_values(by=['TradeDateKey'])['CustomerInterest']

d_train = d_train.sort_values(by=['TradeDateKey'])
models, epochs, aucs, features_cv = train_fold(train, d_valid, n_folds, gbm)

Training until validation scores don't improve for 30 rounds.
[10]	training's auc: 0.914053	valid_1's auc: 0.860687
[20]	training's auc: 0.913486	valid_1's auc: 0.860046
[30]	training's auc: 0.913947	valid_1's auc: 0.860676
Early stopping, best iteration is:
[1]	training's auc: 0.914571	valid_1's auc: 0.861813
_______________________ 
 0 defaultdict(<class 'dict'>, {'training': {'auc': 0.91457144114848721}, 'valid_1': {'auc': 0.86181272183258706}}) 
 ____________________
Training until validation scores don't improve for 30 rounds.
[10]	training's auc: 0.916946	valid_1's auc: 0.862888
[20]	training's auc: 0.916617	valid_1's auc: 0.862586
[30]	training's auc: 0.916874	valid_1's auc: 0.862916
Early stopping, best iteration is:
[1]	training's auc: 0.917628	valid_1's auc: 0.864357
_______________________ 
 1 defaultdict(<class 'dict'>, {'training': {'auc': 0.91762788288987063}, 'valid_1': {'auc': 0.86435669241939372}}) 
 ____________________
Training until validation scores don't improve f

In [13]:
auc_mean(aucs)

{'cv_auc': 0.86230693406824099, 'train_auc': 0.91670941121186211}

In [14]:
# Prediction
validation_path = "../submissions/cv_validations/"

if not os.path.exists(validation_path):
    os.makedirs(validation_path)

# predict
for i, model in enumerate(models):   
    test_t = test.merge(features_cv[i], how = "left", on = "CustomerIdx")
    test_t["CustomerInterest"] = models[model].predict_proba(test_t[features], num_iteration = epochs[model])[:, 1]
    # smart impute 
    test_t[test_t[features].isnull()]["CustomerInterest"] = 0
    # export CSV
    subm = test_t[["PredictionIdx", "CustomerInterest"]]
    subm.to_csv(f"{validation_path}/submission{i}.csv", index = False)

In [15]:
final_sub = pd.read_csv(f"{validation_path}/submission0.csv")
for sub in range(n_folds):
    if(sub==0):
        continue
    else:
        final_sub['CustomerInterest'] += pd.read_csv(f"{validation_path}/submission{sub}.csv")["CustomerInterest"]
final_sub['CustomerInterest'] /= n_folds

In [16]:
final_sub.to_csv("../submissions/cv_light_gbm_edited_the_cv.csv", index=False, float_format = "%.8f")