# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.stats

In [2]:
# pandas options
# lift the column cap so wide frames are displayed in full (None = no limit)
pd.options.display.max_columns = None

In [3]:
# ignore warnings
# Installs a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this cell is suppressed.
# NOTE(review): this also hides pandas' chained-assignment warnings and library
# deprecation notices that are relevant to later cells - consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection
# Turns on Python's automatic garbage collector.
# NOTE(review): the collector is enabled by default in CPython, so this is only
# needed if something disabled it earlier - confirm it is required.
import gc
gc.enable()

# 2. DATA PARTITIONING

In [5]:
# import data
# read the trade history (kept as `train`) and the challenge file (kept as `test`)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/Trade.csv", "../data/raw/Challenge_20180423.csv")
)

In [6]:
# check all datasets
# for each raw frame: show its first three rows, then print its (rows, columns) shape

# challenge rows to be scored
display(test.iloc[:3])
print("Test data:", test.shape)

print("------------------------------")

# trade history
display(train.iloc[:3])
print("Train data:", train.shape)

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,
1,c2cc6cc2a8,20180423,1856,9230,Buy,
2,a8e94f6344,20180423,1780,9157,Buy,


Test data: (484758, 6)
------------------------------


Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0


Train data: (6762021, 8)


In [7]:
# create target variable
# Every row starts as interest = 1; rows whose TradeStatus is "Holding" are set to 0.
# The update is a single .loc assignment rather than chained indexing
# (train["CustomerInterest"][mask] = 0): the chained form assigns through an
# intermediate Series and is not applied to `train` under pandas copy-on-write,
# which would leave the target constant at 1.
train["CustomerInterest"] = 1
train.loc[train["TradeStatus"] == "Holding", "CustomerInterest"] = 0

In [8]:
# partitioning
# split by TradeDateKey (integer yyyymmdd):
#   d_stats - rows before 20170323, used only to compute the ratio features
#   d_train - rows from 20170323 up to (not including) 20180323
#   d_valid - rows from 20180323 onwards
d_stats = train.loc[train["TradeDateKey"].lt(20170323)]
d_train = train.loc[train["TradeDateKey"].ge(20170323) & train["TradeDateKey"].lt(20180323)]
d_valid = train.loc[train["TradeDateKey"].ge(20180323)]

# 3. FEATURE ENGINEERING

In [9]:
### COMPUTE TARGET RATIOS

# Each frame below holds the mean of CustomerInterest per
# (CustomerIdx, IsinIdx, BuySell), computed over the rows of d_stats whose
# TradeDateKey is on or after the start of the window.

# historical target ratio, window starts 20160323 (LAST 12 MONTHS)
cust_int_1 = (
    d_stats.loc[d_stats["TradeDateKey"] >= 20160323,
                ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20160923 (LAST 6 MONTHS)
cust_int_2 = (
    d_stats.loc[d_stats["TradeDateKey"] >= 20160923,
                ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20161223 (LAST 3 MONTHS)
cust_int_3 = (
    d_stats.loc[d_stats["TradeDateKey"] >= 20161223,
                ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20170223 (LAST 1 MONTH)
cust_int_4 = (
    d_stats.loc[d_stats["TradeDateKey"] >= 20170223,
                ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

In [10]:
### MERGE RATIOS

# build data frame
# Each window's mean target is renamed to its final feature name BEFORE merging.
# All four frames carry a "CustomerInterest" column, so merging them as-is makes
# pandas add the _x/_y suffixes twice; that produces duplicate column labels
# (pandas >= 2.0 raises MergeError) and forced a positional rename afterwards.
# The 12-month frame is the left side: its window contains the shorter windows,
# so the left joins keep every (customer, bond, direction) key seen in d_stats
# since 20160323 and leave NaN where a shorter window has no rows.
cust_int = cust_int_1.rename(columns = {"CustomerInterest": "ratio1"})
cust_int = cust_int.merge(cust_int_2.rename(columns = {"CustomerInterest": "ratio2"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int_3.rename(columns = {"CustomerInterest": "ratio3"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int_4.rename(columns = {"CustomerInterest": "ratio4"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])

# merge features
# left joins keep every train/valid row; keys absent from cust_int get NaN ratios
d_train = d_train.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
d_valid = d_valid.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])

In [11]:
# check all datasets
# for each partition: show its first three rows, then print its (rows, columns) shape

# training partition with the ratio features attached
display(d_train.iloc[:3])
print("Train data:", d_train.shape)

print("------------------------------")

# validation partition with the ratio features attached
display(d_valid.iloc[:3])
print("Valid data:", d_valid.shape)

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,ratio1,ratio2,ratio3,ratio4
0,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1,0.2,0.333333,0.333333,0.0
1,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1,,,,
2,20170801,2585,15428,Sell,1544963.0,0.0,Unknown,1,0.0,0.0,0.0,0.0


Train data: (3048675, 12)
------------------------------


Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,ratio1,ratio2,ratio3,ratio4
0,20180418,3122,19044,Buy,866504.0,126.329,NotTraded,1,0.571429,0.666667,0.5,0.0
1,20180403,2505,23240,Buy,500000.0,103.625,NotTraded,1,1.0,1.0,1.0,
2,20180409,2638,23240,Buy,500000.0,104.25,Unknown,1,0.333333,0.0,0.0,


Valid data: (227387, 12)


In [12]:
# extract target
# pull the CustomerInterest column out of each partition as the label vector
y_train, y_valid = (
    partition["CustomerInterest"] for partition in (d_train, d_valid)
)

# 4. MODELING

In [13]:
# exclude features
# columns that must not be fed to the model: keys, raw trade fields and the target
excluded_feats = [
    "TradeDateKey",
    "CustomerIdx",
    "IsinIdx",
    "BuySell",
    "NotionalEUR",
    "Price",
    "TradeStatus",
    "CustomerInterest",
]

# everything else in d_train is a model feature (column order is preserved)
features = [column for column in d_train.columns if column not in excluded_feats]
features

['ratio1', 'ratio2', 'ratio3', 'ratio4']

In [14]:
### PARAMETERS

# learner settings
seed     = 42      # random_state passed to the classifier
metric   = "auc"   # evaluation metric passed to fit
stopping = 100     # early-stopping patience, in boosting rounds
verbose  = 10      # evaluation log period, in boosting rounds

# lgb settings
gbm = lgb.LGBMClassifier(
    # boosting schedule
    n_estimators     = 1000,
    learning_rate    = 0.005,
    # tree shape
    num_leaves       = 70,
    max_depth        = 7,
    min_split_gain   = 0.01,
    min_child_weight = 2,
    # row / column subsampling
    subsample        = 0.9,
    colsample_bytree = 0.8,
    # regularisation
    reg_alpha        = 0.1,
    reg_lambda       = 0.1,
    # reproducibility
    random_state     = seed,
)

In [15]:
# train lightGBM
# both the training and the validation partition are passed as evaluation sets,
# in that order
# NOTE(review): the `verbose` and `early_stopping_rounds` fit arguments were
# removed in LightGBM 4.0 (replaced by callbacks) - confirm the installed version.
gbm = gbm.fit(
    d_train[features],
    y_train,
    eval_set = [
        (d_train[features], y_train),
        (d_valid[features], y_valid),
    ],
    eval_metric = metric,
    early_stopping_rounds = stopping,
    verbose = verbose,
)

# save number of iterations
# keep the best round found by early stopping so prediction can reuse it
num_iters = gbm.best_iteration_

Training until validation scores don't improve for 100 rounds.
[10]	valid_0's auc: 0.759179	valid_1's auc: 0.704967
[20]	valid_0's auc: 0.759273	valid_1's auc: 0.704808
[30]	valid_0's auc: 0.758984	valid_1's auc: 0.704476
[40]	valid_0's auc: 0.758977	valid_1's auc: 0.704432
[50]	valid_0's auc: 0.759117	valid_1's auc: 0.704455
[60]	valid_0's auc: 0.759069	valid_1's auc: 0.704481
[70]	valid_0's auc: 0.759066	valid_1's auc: 0.704504
[80]	valid_0's auc: 0.759106	valid_1's auc: 0.704527
[90]	valid_0's auc: 0.759059	valid_1's auc: 0.70453
[100]	valid_0's auc: 0.759067	valid_1's auc: 0.704536
[110]	valid_0's auc: 0.759024	valid_1's auc: 0.704466
Early stopping, best iteration is:
[12]	valid_0's auc: 0.759222	valid_1's auc: 0.705043


# 5. RECOMPUTE FEATURES

In [16]:
# merge train data
# stack the training and validation partitions row-wise and keep only the
# original (non-ratio) columns, so the ratio features can be recomputed
train = pd.concat([d_train, d_valid], axis = 0)[excluded_feats]

In [17]:
### COMPUTE TARGET RATIOS

# Each frame below holds the mean of CustomerInterest per
# (CustomerIdx, IsinIdx, BuySell), computed over the rows of train whose
# TradeDateKey is on or after the start of the window.

# historical target ratio, window starts 20170423 (LAST 12 MONTHS)
cust_int_1 = (
    train.loc[train["TradeDateKey"] >= 20170423,
              ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20171023 (LAST 6 MONTHS)
cust_int_2 = (
    train.loc[train["TradeDateKey"] >= 20171023,
              ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20180123 (LAST 3 MONTHS)
cust_int_3 = (
    train.loc[train["TradeDateKey"] >= 20180123,
              ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

# historical target ratio, window starts 20180323 (LAST 1 MONTH)
cust_int_4 = (
    train.loc[train["TradeDateKey"] >= 20180323,
              ["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False)
    .mean()
)

In [18]:
### MERGE RATIOS

# build data frame
# Each window's mean target is renamed to its final feature name BEFORE merging.
# All four frames carry a "CustomerInterest" column, so merging them as-is makes
# pandas add the _x/_y suffixes twice; that produces duplicate column labels
# (pandas >= 2.0 raises MergeError) and forced a positional rename afterwards.
# The 12-month frame is the left side: its window contains the shorter windows,
# so the left joins keep every (customer, bond, direction) key seen in train
# since 20170423 and leave NaN where a shorter window has no rows.
cust_int = cust_int_1.rename(columns = {"CustomerInterest": "ratio1"})
cust_int = cust_int.merge(cust_int_2.rename(columns = {"CustomerInterest": "ratio2"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int_3.rename(columns = {"CustomerInterest": "ratio3"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int_4.rename(columns = {"CustomerInterest": "ratio4"}),
                          how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])

# merge features
# left joins keep every train/test row; keys absent from cust_int get NaN ratios
train = train.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
test  = test.merge(cust_int,  how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])

In [19]:
# check all datasets
# for each frame: show its first three rows, then print its (rows, columns) shape

# rebuilt training frame with the recomputed ratio features
display(train.iloc[:3])
print("Train data:", train.shape)

print("------------------------------")

# challenge frame with the recomputed ratio features
display(test.iloc[:3])
print("Test data:", test.shape)

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,ratio1,ratio2,ratio3,ratio4
0,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1,0.6,0.5,,
1,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1,,,,
2,20170801,2585,15428,Sell,1544963.0,0.0,Unknown,1,0.666667,0.0,,


Train data: (3276062, 12)
------------------------------


Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,ratio1,ratio2,ratio3,ratio4
0,a1e0d80784,20180423,1856,13323,Buy,,,,,
1,c2cc6cc2a8,20180423,1856,9230,Buy,,0.571429,0.8,1.0,1.0
2,a8e94f6344,20180423,1780,9157,Buy,,,,,


Test data: (484758, 10)


# 6. SUBMISSION

In [20]:
# predict
# probability of the positive class, using the round count kept from early stopping
test["CustomerInterest"] = gbm.predict_proba(test[features], num_iteration = num_iters)[:, 1]

# smart impute
# rows with no 12-month ratio (ratio1 is NaN after the left join) are scored 0.
# A single .loc assignment replaces the chained form
# test["CustomerInterest"][mask] = 0, which assigns through an intermediate
# Series and is not applied to `test` under pandas copy-on-write.
test.loc[test["ratio1"].isnull(), "CustomerInterest"] = 0

# export CSV
# only the prediction id and the score are written, with 8 decimal places
subm = test[["PredictionIdx", "CustomerInterest"]]
subm.to_csv("../submissions/lgb_4ratios.csv", index = False, float_format = "%.8f")