# 1. SETTINGS

In [None]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from scipy.stats import spearmanr
import lightgbm as lgb
import random

In [None]:
# garbage collection
# explicitly switch on Python's automatic garbage collector for this session
import gc
gc.enable()

In [None]:
# warnings
# NOTE(review): the filter below has no category or module restriction, so it
# silences every warning for the rest of the session, including the ones
# raised by pandas and LightGBM in the cells that follow.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# pandas options
# do not truncate the column list when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# 2. IMPORT

In [None]:
# name of the prepared input file that is loaded in the next cell
# (data_name is reassigned after loading, before any output file is written)
data_name = "data_v4_0_60_under"

In [None]:
# load the prepared dataset (gzip-compressed CSV) selected by data_name
prepared_file = "../data/prepared/" + str(data_name) + ".csv"
data = pd.read_csv(prepared_file, compression = "gzip")

In [None]:
# data name from here on: it is only used to build the names of the exported CSV files
data_name = "data_v4mlp_ak2_0_60_under_rec27"

In [None]:
# drop high recency from train (rows of week 121 are always kept)

# step 1: outside week 121 keep only rows with Recency2 below 27
keep_rows = (data.Week == 121) | (data.Recency2 < 27)
data = data[keep_rows]

# step 2: outside week 121 keep only rows whose Recency1 is below the
# maximum Recency1 of the frame left after step 1
recency1_cap = data.Recency1.max()
keep_rows = (data.Week == 121) | (data.Recency1 < recency1_cap)
data = data[keep_rows]
data.shape

# 3. PREPARATIONS

### ADD MORE FEATURES

In [None]:
# add last week sums of CustomerInterest (per customer and per bond)
def _last_week_sum(frame, key, new_name):
    """Return CustomerInterest summed per (key, Week), shifted one week forward.

    frame    : DataFrame with the columns key, "Week" and "CustomerInterest"
    key      : name of the grouping column ("CustomerIdx" or "IsinIdx")
    new_name : name given to the summed column

    The result has the columns [key, "Week", new_name]. "Week" is increased
    by one, so a merge on [key, "Week"] attaches the sum of the previous week
    to each row.
    """
    # no sort_values() before grouping: groupby already returns the groups
    # ordered by key, and the per-group sum does not need ordered rows
    sums = frame.groupby([key, "Week"]).CustomerInterest.sum().reset_index()
    sums.columns = [key, "Week", new_name]
    sums["Week"] = sums.Week + 1
    return sums

# CUST first, BOND second; the shape is printed after every merge
for key, new_name in [("CustomerIdx", "LastWeekCustSum"), ("IsinIdx", "LastWeekBondSum")]:
    data = data.merge(_last_week_sum(data, key, new_name), how = "left", on = [key, "Week"])
    print(data.shape)

In [None]:
### ADD PRICE AND NOTIONAL

tmp = pd.read_csv("../data/prepared/price_notional.csv", compression = "gzip")

# group once per (customer, week); no sort_values() beforehand because
# groupby already orders the groups by key and a sum needs no row order
cust_week = tmp.groupby(["CustomerIdx", "Week"])

# for Price and NotionalEUR: sum per customer and week, move the sum to the
# following week and merge it, so each row receives last week's total
for source_col, feature_name in [("Price", "LastWeekCustPriceSum"),
                                 ("NotionalEUR", "LastWeekCustNotionalSum")]:
    tmp2 = cust_week[source_col].sum().reset_index()
    tmp2.columns = ["CustomerIdx", "Week", feature_name]
    tmp2["Week"] = tmp2.Week + 1
    data = data.merge(tmp2, how = "left", on = ["CustomerIdx", "Week"])
    print(data.shape)

In [None]:
### ADD CUSTOMER INDICATORS

# customer master data
cust = pd.read_csv("../data/raw/Customer.csv")

# one indicator column per Subsector value, next to the customer key
subsector_flags = pd.get_dummies(cust.Subsector)
cust_dummies = pd.concat([cust.CustomerIdx, subsector_flags], axis = 1)

# attach the indicators to every row of the matching customer
data = data.merge(cust_dummies, how = "left", on = "CustomerIdx")
print(data.shape)

In [None]:
### ADD BOND FEATURES [V1]

# bond statistics (first version) and the columns taken from them
bond_stat = pd.read_csv("../data/prepared/bondstat1_AK")
bond_cols = ["IsinIdx", "t", "timeOnMarket", "percentoflifeleft"]

# left merge on the bond key keeps every row of data
data = data.merge(bond_stat[bond_cols], how = "left", on = "IsinIdx")
print(data.shape)

In [None]:
### ADD NEW BOND FEATURES

# bond statistics (second version)
bond_stat = pd.read_csv("../data/prepared/bondstat2_AK")

# bond key plus the statistics that become model features
bond_cols = ["IsinIdx",
             'ymeansq', 'ymeancube',
             'zmeansq', 'zmeancube',
             'years_tomaturity', 'count',
             'ratingspread', 'ratingZspread']

# left merge on the bond key keeps every row of data
data = data.merge(bond_stat[bond_cols], how = "left", on = "IsinIdx")
print(data.shape)

In [None]:
### ADD BOND FEATURES [V3]

# bond statistics (third version)
bond_stat = pd.read_csv("../data/prepared/bondstat3_AK")

# only the two predicted quantities are merged, keyed by bond
bond_cols = ["IsinIdx", 'predyield', 'predprice']
data = data.merge(bond_stat[bond_cols], how = "left", on = "IsinIdx")
print(data.shape)

In [None]:
### BOND-SPECIFIC FINANCIAL INDICATORS

# bond master data
bond = pd.read_csv("../data/raw/Isin.csv")

# categorical bond attributes that are expanded into indicator columns
dummy_sources = ["IndustrySector", "Region", "MarketIssue", "CouponType"]

# bond key followed by the indicator columns of every attribute
# NOTE(review): the indicator columns are named after the raw category values
# (no prefix); identical values in two attributes would give duplicate
# column names - confirm the categories are disjoint
dummy_blocks = [pd.get_dummies(bond[source]) for source in dummy_sources]
bond_dummies = pd.concat([bond.IsinIdx] + dummy_blocks, axis = 1)

# attach the indicators to every row of the matching bond
data = data.merge(bond_dummies, how = "left", on = "IsinIdx")
print(data.shape)

### CHECKS AND PREPARATIONS

In [None]:
# check data: print the (rows, columns) shape and preview the first rows
print("Dimensions:", data.shape)
data.head()

In [None]:
# check NA: number of missing values per column, shown only for columns that have any
na_flags = data.isnull()
nas = na_flags.sum()
nas[nas > 0]

In [None]:
# list of features: every column of data except identifiers, BuySell and the target
# NOTE(review): "Week" is not in the exclusion list, so it is passed to the
# model as a feature - confirm this is intended
excluded_features = [
    "PredictionIdx",
    "CustomerIdx",
    "IsinIdx",
    "BuySell",
    "CustomerInterest",
]
features = [column for column in data.columns.tolist() if column not in excluded_features]
len(features)

In [None]:
### DATA PARTITIONING

# training: all weeks before 120 (features and target come from the same rows)
X_train = data.loc[data.Week < 120]
y_train = X_train.CustomerInterest

# validation: week 120
X_valid = data.loc[data.Week == 120]
y_valid = X_valid.CustomerInterest

# test set: week 121
# explicit copy: a TARGET column is assigned to `test` in the prediction
# cells, and that assignment must not go to a slice pandas still links to `data`
test = data.loc[data.Week == 121].copy()

In [None]:
# check NA in test: missing values per column, shown only for columns that have any
na_flags = test.isnull()
nas = na_flags.sum()
nas[nas > 0]

# 4. MODELING - STAGE 1

In [None]:
### PARAMETERS

# parallel settings: number of threads passed to LightGBM
cores = 2

# learner settings
metric   = "auc"   # evaluation metric passed to fit()
verbose  = 250     # passed to fit() as `verbose`
stopping = 100     # passed to fit() as `early_stopping_rounds`

# lightGBM hyper-parameters, collected in one place
gbm_params = {
    "n_estimators":     10000,
    "learning_rate":    0.005,
    "num_leaves":       70,
    "colsample_bytree": 0.8,
    "subsample":        0.9,
    "max_depth":        7,
    "reg_alpha":        0.1,
    "reg_lambda":       0.1,
    "min_split_gain":   0.01,
    "min_child_weight": 2,
    "random_state":     42,
    "num_threads":      cores,
}

# lightGBM
gbm = lgb.LGBMClassifier(**gbm_params)

In [None]:
# train lightGBM
# evaluation sets: training data first, validation data (week 120) second
# NOTE(review): `verbose` and `early_stopping_rounds` as fit() arguments are
# only accepted by older LightGBM releases (removed in 4.0) - confirm the
# installed version before re-running
eval_sets = [(X_train[features], y_train),
             (X_valid[features], y_valid)]
gbm = gbm.fit(X_train[features], y_train,
              eval_set = eval_sets,
              eval_metric = metric,
              verbose = verbose,
              early_stopping_rounds = stopping)

# save number of iterations (reused for prediction and for the stage 2 retraining)
num_iters = gbm.best_iteration_


##### RESULTS (FULL VALIDATION)

# k = 61 (v2), train from 80 week, 1k iter, stop = 100 (not met):                0.83679  (0.77887 LB)
# k = 61 (v2), train from 80 week, 3k iter, stop = 100 (not met):                0.840938 (0.78581 LB)
# k = 61 (v2), train from 80 week, 5k iter, stop = 100 (not met):                0.841801 (0.78626 LB)
# k = 61 (v2), train from 80 week, 10k iter, stop = 100 (met):                   0.841866 (0.78626 LB)

# k = 62 (v2+v3), train from 80 week, 10k iter, stop = 100 (met):                0.842258 ()
# k = 38 (v3),    train from 80 week, 5k iter,  stop = 100 (not met):            0.843073 (0.78504 LB)
# k = 39 (v3+BS), train from 80 week, 10k iter, stop = 100 (met):                0.843815 ()
# k = 31 (v2red), train from 60 week, 10k iter, stop = 100 (not met):            0.844060 ()

# k = 39 (v4), train from 60  week, 10k iter, stop = 100 (met):                  0.844511 ()
# k = 39 (v4), train from 80  week, 10k iter, stop = 100 (met):                  0.845086 (0.78952 LB)
# k = 39 (v4), train from 100 week, 10k iter, stop = 100 (met):                  0.844058 ()


##### RESULTS (UNDERSAMPLED DATA)

# k = 39 (v4), under0.5, train from 60  week, 10k iter, stop = 100 (met):        0.843178 (0.79132 LB)
# k = 39 (v4), under0.5, train from 80  week, 10k iter, stop = 100 (met):        0.845558 (0.78835 LB)
# k = 39 (v4), under0.5, train from 100 week, 10k iter, stop = 100 (met):        0.843116 ()

# k = 40 (v4w),        under0.5, train from 60 week, 10k, stop = 100 (met):      0.844871 (0.79152 LB)
# k = 44 (v4wlp),      under0.5, train from 60 week, 10k, stop = 100 (met):      0.846500 (0.79330 LB)
# k = 46 (v4w2lp),     under0.5, train from 60 week, 10k, stop = 100 (met):      0.846537 ()
# k = 52 (v4wlp_ak2),  under0.5, train from 60 week, 10k, stop = 100 (met):      0.846774 (0.79316 LB)
# k = 54 (v4w2lp_ak2), under0.5, train from 60 week, 10k, stop = 100 (met):      0.846399 ()


##### RESULTS (UNDERSAMPLE, REMOVE RECENCY2 >= 27 FROM TRAIN)

# 0.786076
# 0.786368
# 0.787315
# 0.789084
# 0.789294

In [None]:
##### VARIABLE IMPORTANCE

# one row per model feature with its LightGBM importance value
importance = pd.DataFrame({"feature": features,
                           "importance": gbm.feature_importances_})

# bar chart of the features ordered by decreasing importance
# (only the plotted frame is sorted; `importance` keeps the order of `features`)
plot_data = importance.sort_values(by = "importance", ascending = False)
plt.figure(figsize = (10, 20))
sns.barplot(data = plot_data, x = "importance", y = "feature")
plt.title('LGBM Feature Importance')
plt.tight_layout()

# save plot as pdf
plt.savefig("../var_importance.pdf")

In [None]:
##### PREDICTION

### VALID DATA

# predict validation set
# .copy() makes pred_valid an independent frame, so adding TARGET below is
# not an assignment to a slice of X_valid (chained assignment)
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]].copy()

# column 1 of predict_proba, using the stored number of iterations
pred_valid["TARGET"] = gbm.predict_proba(X_valid[features], num_iteration = num_iters)[:, 1]

# validation AUC; its digits are also used in the names of the exported files
auc = roc_auc_score(y_valid, pred_valid.TARGET)

# export CSV
pred_valid.to_csv("../pred_valid/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_lgb.csv", 
                  index = False, float_format = "%.8f")


### TEST DATA

# predict test set: column 1 of predict_proba, using the stored number of iterations
test_proba = gbm.predict_proba(test[features], num_iteration = num_iters)
test["TARGET"] = test_proba[:, 1]

# check rank correlation with the best submission
# (the stage 1 predictions are joined to it on PredictionIdx)
best = pd.read_csv("../submissions/auc789227_ensemble_es.csv.csv")
stage1_pred = test[["PredictionIdx", "TARGET"]]
best = best.merge(stage1_pred, how = "left", on = "PredictionIdx")
rank_corr = spearmanr(best["TARGET"], best.CustomerInterest)
print(rank_corr)

# export CSV: the new predictions under the submission column name
subm = best[["PredictionIdx", "TARGET"]]
subm.columns = ["PredictionIdx", "CustomerInterest"]
subm_file = "../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_lgb_1stage.csv"
subm.to_csv(subm_file, index = False, float_format = "%.8f")

In [None]:
# export CSV: stage 1 predictions from `best`, renamed to the submission column
submission_columns = ["PredictionIdx", "CustomerInterest"]
subm = best[["PredictionIdx", "TARGET"]]
subm.columns = submission_columns

# file name: digits of the validation AUC + data name + stage tag
subm_file = "../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_lgb_1stage.csv"
subm.to_csv(subm_file, index = False, float_format = "%.8f")

# 5. MODELING - STAGE 2

In [None]:
# keep top features
top = 100

# rank the features by importance before cutting: `importance` is stored in
# the order of `features` (the sort_values() used for the plot is not
# in-place), so slicing it directly would keep the first `top` columns
# instead of the `top` most important ones.
# mergesort is stable, which makes the choice among tied features deterministic
ranked = importance.sort_values(by = "importance", ascending = False, kind = "mergesort")
top_features = set(ranked["feature"].head(min(top, len(features))))

# keep the original column order of the retained features, so nothing changes
# when `top` is not smaller than the number of features
features = [var for var in features if var in top_features]

In [None]:
# use full data as train: every week up to and including 120
# (features and target are taken from the same rows)
labelled_rows = data.Week <= 120
X_train = data.loc[labelled_rows]
y_train = X_train.CustomerInterest

In [None]:
### retrain model with the best iters

# lightGBM: same hyper-parameter values as in stage 1, except that the number
# of trees is fixed to the iteration count saved from stage 1
stage2_params = {
    "n_estimators":     num_iters,
    "learning_rate":    0.005,
    "num_leaves":       70,
    "colsample_bytree": 0.8,
    "subsample":        0.9,
    "max_depth":        7,
    "reg_alpha":        0.1,
    "reg_lambda":       0.1,
    "min_split_gain":   0.01,
    "min_child_weight": 2,
    "random_state":     42,
    "num_threads":      cores,
}
gbm = lgb.LGBMClassifier(**stage2_params)

# train lightGBM; the only evaluation set is the training data itself
# and no early stopping is requested
train_eval = [(X_train[features], y_train)]
gbm = gbm.fit(X_train[features], y_train,
              eval_set = train_eval,
              eval_metric = metric,
              verbose = 250)

In [None]:
# predict test set with the retrained model: column 1 of predict_proba
stage2_proba = gbm.predict_proba(test[features], num_iteration = num_iters)
test["TARGET"] = stage2_proba[:, 1]

In [None]:
# check rank correlation with the best submission
# (the stage 2 predictions are joined to it on PredictionIdx)
best = pd.read_csv("../submissions/auc789227_ensemble_es.csv.csv")
stage2_pred = test[["PredictionIdx", "TARGET"]]
best = best.merge(stage2_pred, how = "left", on = "PredictionIdx")
spearmanr(best["TARGET"], best.CustomerInterest)

In [None]:
# export CSV: stage 2 predictions from `best`, renamed to the submission column
submission_columns = ["PredictionIdx", "CustomerInterest"]
subm = best[["PredictionIdx", "TARGET"]]
subm.columns = submission_columns

# file name: digits of the stage 1 validation AUC + data name + stage tag
subm_file = "../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_lgb_2stage.csv"
subm.to_csv(subm_file, index = False, float_format = "%.8f")