# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr
import random
import xgboost as xgb

In [2]:
# garbage collection
import gc
gc.enable()

In [3]:
# warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# pandas options
pd.set_option("display.max_columns", None)

# 2. IMPORT

In [5]:
# data name (used for saving files too)
data_name = "data_v4_0_60_under"

In [6]:
# import datasets
data = pd.read_csv("../data/prepared/" + str(data_name) + ".csv", compression = "gzip")

In [7]:
# data name (used for saving files too)
data_name = "data_v4_0_60_under_wlp_lm_bm"

In [8]:
# drop high recency from train
#data = data[(data.Week == 121) | (data.Recency2 < 27)]
data = data[(data.Week == 121) | (data.Recency1 < data.Recency1.max())]
data.shape

(2668912, 45)

# 3. PREPARATIONS

### ADD MORE FEATURES

In [9]:
# compute BuySell dummy
data["Buy"] = 0
data["Buy"][data.BuySell == "Buy"] = 1
print(data.shape)

(2668912, 45)


In [10]:
### RF Ratios

#data["R12_ratio"] = data.Recency1 / data.Recency2
#data["R13_ratio"] = data.Recency1 / data.Recency3
#data["R14_ratio"] = data.Recency1 / data.Recency4
#data["R56_ratio"] = data.Recency5 / data.Recency6

#data["F12_ratio"] = data.Frequency1 / data.Frequency2
#data["F13_ratio"] = data.Frequency1 / data.Frequency3
#data["F14_ratio"] = data.Frequency1 / data.Frequency4
#data["F56_ratio"] = data.Frequency5 / data.Frequency6

#data.shape

In [11]:
# add last week sum (CUST)
tmp = data.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["CustomerIdx", "Week", "LastWeekCustSum"]
tmp["Week"] = tmp.Week + 1
data = data.merge(tmp, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

# add last week sum (BOND)
tmp = data.sort_values(by = ["Week"], ascending = True).groupby(["IsinIdx", "Week"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["IsinIdx", "Week", "LastWeekBondSum"]
tmp["Week"] = tmp.Week + 1
data = data.merge(tmp, how = "left", on = ["IsinIdx", "Week"])
print(data.shape)

(2668912, 46)
(2668912, 47)


In [12]:
### ADD PRICE AND NOTIONAL

tmp = pd.read_csv("../data/prepared/price_notional.csv", compression = "gzip")

# add last week price sum (CUST)
tmp2 = tmp.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp2 = tmp2.Price.sum().reset_index()
tmp2.columns = ["CustomerIdx", "Week", "LastWeekCustPriceSum"]
tmp2["Week"] = tmp2.Week + 1
data = data.merge(tmp2, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

# add last week notional sum (CUST)
tmp2 = tmp.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp2 = tmp2.NotionalEUR.sum().reset_index()
tmp2.columns = ["CustomerIdx", "Week", "LastWeekCustNotionalSum"]
tmp2["Week"] = tmp2.Week + 1
data = data.merge(tmp2, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

(2668912, 48)
(2668912, 49)


In [13]:
### ADD CUSTOMER INDICATORS

#cust  = pd.read_csv("../data/raw/Customer.csv")
#cust.head()

# create dummies for customers
#cust_dummies = pd.concat([cust.CustomerIdx, pd.get_dummies(cust.Subsector)], axis = 1)

# merge customer dummies
#data = data.merge(cust_dummies, on = "CustomerIdx", how = "left")
#print(data.shape)

In [14]:
### ADD BOND FEATURES [V1]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat1_AK")

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", "t", "timeOnMarket", "percentoflifeleft"]], how = "left", on = "IsinIdx")
#print(data.shape)

In [15]:
### ADD NEW BOND FEATURES [V2]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat2_AK")

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", 'ymeansq', 'ymeancube', 'zmeansq', 'zmeancube',
#                             'years_tomaturity', 'count', 'ratingspread', 'ratingZspread']], 
#                  how = "left", on = "IsinIdx")
#print(data.shape)

In [16]:
### ADD BOND FEATURES [V3]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat3_AK")
#bond_stat.head()

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", 'predyield', 'predprice']], how = "left", on = "IsinIdx")
#print(data.shape)

In [17]:
### BOND-SPECIFIC FINANCIAL INDICATORS

#bond  = pd.read_csv("../data/raw/Isin.csv")

# create bond dummies
#bond_dummies = pd.concat([bond.IsinIdx,
#                          pd.get_dummies(bond.IndustrySector), 
#                          pd.get_dummies(bond.Region),
#                          pd.get_dummies(bond.MarketIssue), 
#                          pd.get_dummies(bond.CouponType)], axis = 1)

# merge bond dummies
#data = data.merge(bond_dummies, on = "IsinIdx", how = "left")
#print(data.shape)

In [18]:
### ADD NEW NIKITA FEATURES

# import data
#cust    = pd.read_csv("../data/prepared/cust_nik.csv",   compression = "gzip")
#bond    = pd.read_csv("../data/prepared/bond_nik.csv",   compression = "gzip")
#market  = pd.read_csv("../data/prepared/market_nik.csv", compression = "gzip")

#cust = cust[["CustomerIdx", "Region"]]
#bond = bond[["IsinIdx", "ActualMaturityDateKey", "IssueDateKey"]]

# merge data
#print(data.shape)
#data = data.merge(cust, on = "CustomerIdx", how = "left")
#print(data.shape)
#data = data.merge(bond, on = "IsinIdx", how = "left")
#print(data.shape)
#data = data.merge(market, on = ["IsinIdx", "Week"], how = "left")
#print(data.shape)

# compute day differences
#data["ActualMaturityDateKey"] = data["ActualMaturityDateKey"] - data["Week"]
#data["IssueDateKey"]          = data["IssueDateKey"] - data["Week"]

In [19]:
### ADD LAST MONTH SUMS

# merge month number
tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]
tmp = tmp.drop_duplicates()
tmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = '%Y%m%d')
tmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)
tmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)
tmp = tmp[["Week", "CumMonth"]]
tmp = tmp.drop_duplicates()
tmp = tmp.groupby("Week").CumMonth.min().reset_index()
data = data.merge(tmp, how = "left", on = "Week")
data["CumMonth"][data.Week == 121] = 28

# add last month mean (CUST)
tmp = data.sort_values(by = ["CumMonth"], ascending = True).groupby(["CustomerIdx", "CumMonth"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["CustomerIdx", "CumMonth", "LastMonthCustSum"]
tmp["CumMonth"] = tmp.CumMonth + 1
data = data.merge(tmp, how = "left", on = ["CustomerIdx", "CumMonth"])
print(data.shape)

# add last month mean (BOND)
tmp = data.sort_values(by = ["CumMonth"], ascending = True).groupby(["IsinIdx", "CumMonth"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["IsinIdx", "CumMonth", "LastMonthBondSum"]
tmp["CumMonth"] = tmp.CumMonth + 1
data = data.merge(tmp, how = "left", on = ["IsinIdx", "CumMonth"])
print(data.shape)

# drop month
del data["CumMonth"]

(2668912, 51)
(2668912, 52)


In [20]:
### ADD BOND MATURITY DATA

# import
bond = pd.read_csv("../data/prepared/data_bond_v1.csv", compression = "gzip")
bond = bond[['IsinIdx', 'MaturityWeek', 'IssueWeek']]

# merge
data = data.merge(bond, on = "IsinIdx", how = "left")

# compute week differences
data["MaturityWeek"]    = data.MaturityWeek - data.Week
data["IssueWeek"]       = data.Week - data.IssueWeek
data["MaturityPercent"] = (data.Week - data.IssueWeek) / (data.MaturityWeek - data.IssueWeek)
print(data.shape)

(2668912, 54)


In [21]:
### MERGE WEEKLY MACRO VARIABLES

# import
#fx1   = pd.read_csv("../data/prepared/data_fx1_v1.csv",  compression = "gzip")
#fx2   = pd.read_csv("../data/prepared/data_fx2_v1.csv",  compression = "gzip")
#ind1  = pd.read_csv("../data/prepared/data_ind1_v1.csv", compression = "gzip")
#ind2  = pd.read_csv("../data/prepared/data_ind2_v1.csv", compression = "gzip")

# merge currency
#bond = pd.read_csv("../data/raw/Isin.csv")
#data = data.merge(bond[["IsinIdx", "Currency"]], on = "IsinIdx")

# merge macro data
#data = data.merge(fx1,  how = 'left', on = ["Week",  "Currency"])
#print(data.shape)
#data = data.merge(fx2,  how = 'left', on = ["Month", "Currency"])
#print(data.shape)
#data = data.merge(ind1, how = 'left', on = "Week")
#print(data.shape)
#data = data.merge(ind2, how = 'left', on = "Month")
#print(data.shape)

# there are a few currencies for which we don't have info, e.g. CNH
#data[fx1.columns] = data[fx1.columns].fillna(0)
#data[fx2.columns] = data[fx2.columns].fillna(0)

# remove currency
#data.drop("Currency", axis = 1, inplace = True)
#print(data.shape)

In [22]:
### ADD NEW WEEKLY AND MONTHLY MARKET FEATURES

# merge month number
'''tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]
tmp = tmp.drop_duplicates()
tmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = '%Y%m%d')
tmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)
tmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)
tmp = tmp[["Week", "CumMonth"]]
tmp = tmp.drop_duplicates()
tmp = tmp.groupby("Week").CumMonth.min().reset_index()
data = data.merge(tmp, how = "left", on = "Week")
data["CumMonth"][data.Week == 121] = 28

# import data
market  = pd.read_csv("../data/prepared/data_market_v1.csv", compression = "gzip")
market = market.rename(columns = {"Month": "CumMonth"})
market.CumMonth = market.groupby(["IsinIdx", "Week"]).CumMonth.min()
keep = market[["IsinIdx", "Week", "CumMonth"]].drop_duplicates().index
market = market[market.index.isin(keep)]
data = data.merge(market, how = "left", on = ["IsinIdx", "Week", "CumMonth"])
del data["CumMonth"]
print(data.shape)'''

'tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]\ntmp = tmp.drop_duplicates()\ntmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = \'%Y%m%d\')\ntmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)\ntmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)\ntmp = tmp[["Week", "CumMonth"]]\ntmp = tmp.drop_duplicates()\ntmp = tmp.groupby("Week").CumMonth.min().reset_index()\ndata = data.merge(tmp, how = "left", on = "Week")\ndata["CumMonth"][data.Week == 121] = 28\n\n# import data\nmarket  = pd.read_csv("../data/prepared/data_market_v1.csv", compression = "gzip")\nmarket = market.rename(columns = {"Month": "CumMonth"})\nmarket.CumMonth = market.groupby(["IsinIdx", "Week"]).CumMonth.min()\nkeep = market[["IsinIdx", "Week", "CumMonth"]].drop_duplicates().index\nmarket = market[market.index.isin(keep)]\ndata = data.merge(market, how = "left", on = ["IsinIdx", "Week", "CumMonth"])\ndel data["CumMonth"]\nprin

In [23]:
### FAVORITE FEATURES

bond  = pd.read_csv("../data/raw/Isin.csv")
bond.loc[~bond.Seniority.isin(["GOV","SEC","SEN","SUB"]), "Seniority"] = "OTHER"
bond.loc[~bond.Activity.isin(["Asia","RETAIL","GBP SAS", "ARGENTINIA"]), "Activity"] = "OTHER"
bond.loc[bond.Currency.isin(
    (bond.groupby("Currency").size())[bond.groupby("Currency").size()<350].index.tolist()
), "Currency"] = "OTHER"

bond.loc[bond.CompositeRating.str.contains("C|D"), "CompositeRating"] = "LOWER"
# Risk captain
bond.loc[bond.RiskCaptain.isin(
    (bond.groupby("RiskCaptain").size())[bond.groupby("RiskCaptain").size()<200].index.tolist()
), "RiskCaptain"] = "OTHER"

bond = bond[[
       'IsinIdx', 
       'Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']]

favor = pd.read_csv("../data/prepared/favorite_bonds_v2_60.csv", compression = "gzip")

data = data.merge(bond,  how = "left")
data = data.merge(favor, how = "left")

data["BuySell_mode"][data.BuySell == data["BuySell_mode"]] = 1
data["BuySell_mode"][data.BuySell_mode != 1] = 0

data["IsinIdx_mode"][data.IsinIdx == data["IsinIdx_mode"]] = 1
data["IsinIdx_mode"][data.IsinIdx_mode != 1] = 0

data["Seniority_mode"][data.Seniority == data["Seniority_mode"]] = 1
data["Seniority_mode"][data.Seniority_mode != 1] = 0

data["ActivityGroup_mode"][data.ActivityGroup == data["ActivityGroup_mode"]] = 1
data["ActivityGroup_mode"][data.ActivityGroup_mode != 1] = 0

data["Region_mode"][data.Region == data["Region_mode"]] = 1
data["Region_mode"][data.Region_mode != 1] = 0

data["Activity_mode"][data.Activity == data["Activity_mode"]] = 1
data["Activity_mode"][data.Activity_mode != 1] = 0

data["RiskCaptain_mode"][data.RiskCaptain == data["RiskCaptain_mode"]] = 1
data["RiskCaptain_mode"][data.RiskCaptain_mode != 1] = 0

data["Owner_mode"][data.Owner == data["Owner_mode"]] = 1
data["Owner_mode"][data.Owner_mode != 1] = 0

data["CompositeRating_mode"][data.CompositeRating == data["CompositeRating_mode"]] = 1
data["CompositeRating_mode"][data.CompositeRating_mode != 1] = 0

data["Currency_mode"][data.Currency == data["Currency_mode"]] = 1
data["Currency_mode"][data.Currency_mode != 1] = 0

data["CouponType_mode"][data.CouponType == data["CouponType_mode"]] = 1
data["CouponType_mode"][data.CouponType_mode != 1] = 0

excluded_features = ['Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']
features = [var for var in data.columns if var not in excluded_features]
data = data[features]

print(data.shape)

(2668912, 65)


### CHECKS AND PREPARATIONS

In [24]:
# check data
print("Dimensions:", data.shape)
data.head()

Dimensions: (2668912, 65)


Unnamed: 0,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Week,Recency1,Recency2,Recency3,Recency4,Recency5,Recency6,Recency1isLowerRecency2,Recency2isLowerRecency4,Frequency1,Frequency2,Frequency3,Frequency4,Frequency5,Frequency6,Frequecny1isLowerFrequency2,Frequecny2isLowerFrequency4,Month,Buy,Asset Managers & Hedge Funds,Asset Owners,Banks and Intermediaries,Corporation,Official Institution - OI,Americas,Asia Pacific,"Europe, Middle East and Africa",FLOW G10,FLOW LOCAL MARKET,SAS & COVERED BONDS,NR,Currency_trend,MeanPrice,StdPrice,MeanYield,StdYield,MeanZScore,StdZScore,YieldMarktDelta,ZScoreMarktDelta,LastWeekCustSum,LastWeekBondSum,LastWeekCustPriceSum,LastWeekCustNotionalSum,LastMonthCustSum,LastMonthBondSum,MaturityWeek,IssueWeek,MaturityPercent,IsinIdx_mode,BuySell_mode,Seniority_mode,Currency_mode,ActivityGroup_mode,Region_mode,Activity_mode,RiskCaptain_mode,Owner_mode,CompositeRating_mode,CouponType_mode
0,,2554,12022,Sell,0.0,68,2.0,2.0,1.0,1.0,2.0,2.0,1,0,0.014925,0.029851,43.716418,80.791045,0.134328,0.208955,1,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1.0,117.319623,4.613998,4.468827,0.354249,2.207733,0.545221,1.4187,0.58415,27.0,0.0,5224.483,72245120.0,173.0,1.0,930,630,-1.873333,0.0,1,1,0,0,0,1,0,0,0,1
1,,1997,23143,Sell,0.0,112,71.0,71.0,1.0,1.0,1.0,1.0,1,0,0.009009,0.009009,4.333333,6.252252,0.990991,2.315315,1,1,3,0,0,1,0,0,0,0,0,1,1,0,0,0,-0.00752,102.526643,8.867004,2.470258,2.011795,2.34419,1.979572,-0.579869,0.720607,0.0,5.0,1000099.0,1400000.0,14.0,7.0,202,161,-1.195122,0.0,1,1,1,1,1,1,1,0,0,1
2,,2743,5570,Buy,0.0,99,88.0,88.0,1.0,1.0,6.0,4.0,1,0,0.020408,0.020408,62.642857,106.0,0.77551,1.387755,1,1,5,1,1,0,0,0,0,0,0,1,1,0,0,0,1.0,102.62727,2.228127,3.838438,0.347182,1.876163,0.581684,0.788311,0.25258,57.0,0.0,9977.553,136203100.0,441.0,2.0,359,161,-0.313131,0.0,1,0,1,1,1,1,0,0,0,1
3,,1307,23023,Buy,0.0,91,82.0,82.0,1.0,1.0,20.0,2.0,1,0,0.011111,0.011111,1.288889,2.233333,0.5,0.977778,1,1,7,1,0,0,1,0,0,0,0,1,1,0,0,0,-0.00294,101.01728,0.204407,-0.036528,0.193049,0.16106,0.150582,-3.086655,-1.462523,1.0,0.0,522.722,3768715.0,2.0,0.0,60,148,0.647727,0.0,1,1,1,1,1,1,0,0,0,1
4,,2107,5204,Sell,0.0,73,2.0,2.0,1.0,1.0,1.0,1.0,1,0,0.027778,0.027778,9.5,21.541667,0.319444,0.777778,1,1,12,0,1,0,0,0,0,1,0,0,0,1,0,0,1.0,110.170463,2.44328,5.958544,0.341985,3.63793,0.340257,2.908417,2.014346,3.0,4.0,5547.505,30714590.0,27.0,1.0,464,6,0.146288,0.0,0,0,1,0,1,1,0,0,0,1


In [25]:
# check NA
nas = data.isnull().sum()
nas[nas > 0]

PredictionIdx              2184154
CustomerInterest            484758
MeanPrice                   206495
StdPrice                    206601
MeanYield                   206495
StdYield                    206601
MeanZScore                  206495
StdZScore                   206601
YieldMarktDelta             206495
ZScoreMarktDelta            206495
LastWeekCustSum              53095
LastWeekBondSum             439429
LastWeekCustPriceSum        159194
LastWeekCustNotionalSum     159194
LastMonthCustSum             32568
LastMonthBondSum            122039
MaturityPercent                  3
dtype: int64

In [26]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [27]:
col_with_na = ['MeanPrice', 'StdPrice',
       'MeanYield', 'StdYield', 'MeanZScore', 'StdZScore', 'YieldMarktDelta',
       'ZScoreMarktDelta']
col_zero =  ['LastWeekCustSum', 'LastWeekBondSum',
       'LastWeekCustPriceSum', 'LastWeekCustNotionalSum', 'LastMonthCustSum',
       'LastMonthBondSum', 'MaturityPercent']

In [28]:
for col in col_with_na:
    data[col].fillna(np.mean(data[col]), inplace=True)
    
for col in col_zero:
    data[col].fillna(np.mean(data[col]), inplace=True)

In [29]:
# list of features
excluded_features = ["PredictionIdx", "CustomerIdx", "IsinIdx", "BuySell", "CustomerInterest",
                     "Frequecny1isLowerFrequency2", "Frequecny2isLowerFrequency4"]
features = [var for var in data.columns if var not in excluded_features]
len(features)

58

In [30]:
### DATA PARTITIONING

# training
X_train = data.loc[data.Week < 120]
y_train = data.loc[data.Week < 120].CustomerInterest

# validation
X_valid = data.loc[data.Week == 120]
y_valid = data.loc[data.Week == 120].CustomerInterest

# test set
test = data.loc[data.Week == 121]

In [31]:
# check NA in test
nas = test.isnull().sum()
nas[nas > 0]

CustomerInterest    484758
dtype: int64

In [32]:
nas = X_valid.isnull().sum()
nas[nas > 0]

PredictionIdx    40437
dtype: int64

# 4. MODELING - STAGE 1

In [None]:
param = {'max_depth': 5, 'eta': 0.01, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 16
param['eval_metric'] = 'auc'

In [36]:
# transforming dataframes
d_train   = xgb.DMatrix(X_train[features], label = y_train)
d_valid = xgb.DMatrix(X_valid[features], label = y_valid)

In [42]:
evallist = [(d_train, 'train'), (d_valid, 'eval')]

In [51]:
params = {#"n_estimators"     : 10000,
                         "eta"    : 0.005,
                         "num_leaves"       : 70,
                         "colsample_bytree" : 0.8,
                         "subsample"        : 0.9,
                         "max_depth"        : 7,
                         "reg_alpha"        : 0.1,
                         "reg_lambda"       : 0.1,
                         "min_split_gain"   : 0.01,
                         "min_child_weight" : 2,
                         "random_state"     : 42,
                         "nthread"      : 20,
                        'objective': 'binary:logistic',
                        'eval_metric':'auc'}

In [52]:
num_round = 10000
bst = xgb.train(params, d_train, num_round, evallist, early_stopping_rounds=100, verbose_eval=10)

[0]	train-auc:0.82671	eval-auc:0.827179
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 100 rounds.
[10]	train-auc:0.833088	eval-auc:0.833158
[20]	train-auc:0.833698	eval-auc:0.834313
[30]	train-auc:0.834358	eval-auc:0.835311
[40]	train-auc:0.834381	eval-auc:0.835496
[50]	train-auc:0.834471	eval-auc:0.835649
[60]	train-auc:0.834605	eval-auc:0.835884
[70]	train-auc:0.834763	eval-auc:0.836062
[80]	train-auc:0.834929	eval-auc:0.836179
[90]	train-auc:0.835074	eval-auc:0.83631
[100]	train-auc:0.835305	eval-auc:0.836607
[110]	train-auc:0.835486	eval-auc:0.836858
[120]	train-auc:0.835659	eval-auc:0.837063
[130]	train-auc:0.835873	eval-auc:0.837304
[140]	train-auc:0.836083	eval-auc:0.837497
[150]	train-auc:0.836231	eval-auc:0.837684
[160]	train-auc:0.836411	eval-auc:0.837863
[170]	train-auc:0.836566	eval-auc:0.838035
[180]	train-auc:0.836672	eval-auc:0.838168
[190]	train-auc:0.836842	eval-auc:0.838401
[200]	train

# train the optimal model
xgb_best = xgb.train(best_params, d_known, num_boost_round = cv["test-mae-mean"].argmin())

In [54]:
num_iters = bst.best_iteration

In [62]:
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]]
pred_valid["TARGET"] = bst.predict(d_valid, ntree_limit=num_iters)
auc = roc_auc_score(y_valid, pred_valid.TARGET)

In [63]:
bst.best_score

0.850756

In [64]:
auc

0.8507550089726351

In [65]:
##### PREDICTION

### VALID DATA

# predict validation set
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]]
pred_valid["TARGET"] = bst.predict(d_valid, ntree_limit=num_iters)
auc = roc_auc_score(y_valid, pred_valid.TARGET)

# export CSV
pred_valid.to_csv("../pred_valid/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_xgb.csv", 
                  index = False, float_format = "%.8f")


### TEST DATA

# predict test set
d_test = xgb.DMatrix(test[features])
test["TARGET"] = bst.predict(d_test, ntree_limit=num_iters)

# export CSV
subm = test[["PredictionIdx", "TARGET"]]
subm.columns = ["PredictionIdx", "CustomerInterest"]
subm.to_csv("../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_xgb_1stage.csv", 
            index = False, float_format = "%.8f")

##### VARIABLE IMPORTANCE

# load variable importance
importance = pd.DataFrame()
importance["feature"] = features
importance["importance"] = rf.feature_importances_
%matplotlib inline
# plot variable importance
plt.figure(figsize = (10, 12))
sns.barplot(x = "importance", y = "feature", data = importance.sort_values(by = "importance", ascending = False))
plt.title("LGBM Feature Importance")
plt.tight_layout()

# save plot as pdf
#plt.savefig(“../var_importance.pdf)

# 5. MODELING - STAGE 2

In [66]:
# use full data as train
X_train = data.loc[data.Week <= 120]
y_train = data.loc[data.Week <= 120].CustomerInterest

In [68]:
d_train = xgb.DMatrix(X_train[features], label=y_train)

In [69]:
evallist = [(d_train, 'train')]

In [71]:
params = {#"n_estimators"     : num_iters,
                         "eta"    : 0.005,
                         "num_leaves"       : 70,
                         "colsample_bytree" : 0.8,
                         "subsample"        : 0.9,
                         "max_depth"        : 7,
                         "reg_alpha"        : 0.1,
                         "reg_lambda"       : 0.1,
                         "min_split_gain"   : 0.01,
                         "min_child_weight" : 2,
                         "random_state"     : 42,
                         "nthread"      : 20,
                        'objective': 'binary:logistic',
                        'eval_metric':'auc'}

In [72]:
num_round = num_iters
bst = xgb.train(params, d_train, num_round, evallist, verbose_eval=10)

[0]	train-auc:0.825951
[10]	train-auc:0.832848
[20]	train-auc:0.833428
[30]	train-auc:0.833668
[40]	train-auc:0.833866
[50]	train-auc:0.834226
[60]	train-auc:0.834648
[70]	train-auc:0.834874
[80]	train-auc:0.835188
[90]	train-auc:0.835389
[100]	train-auc:0.835628
[110]	train-auc:0.835805
[120]	train-auc:0.835983
[130]	train-auc:0.836159
[140]	train-auc:0.836299
[150]	train-auc:0.836455
[160]	train-auc:0.836609
[170]	train-auc:0.836774
[180]	train-auc:0.836888
[190]	train-auc:0.836988
[200]	train-auc:0.837136
[210]	train-auc:0.837281
[220]	train-auc:0.837424
[230]	train-auc:0.837561
[240]	train-auc:0.837697
[250]	train-auc:0.837799
[260]	train-auc:0.837919
[270]	train-auc:0.838026
[280]	train-auc:0.838125
[290]	train-auc:0.838242
[300]	train-auc:0.838361
[310]	train-auc:0.838482
[320]	train-auc:0.838599
[330]	train-auc:0.838712
[340]	train-auc:0.838822
[350]	train-auc:0.838935
[360]	train-auc:0.839033
[370]	train-auc:0.839139
[380]	train-auc:0.839259
[390]	train-auc:0.839359
[400]	train

In [73]:
# test set

# predict test set
test["TARGET"] = bst.predict(d_test, ntree_limit=num_iters)

In [74]:
# export CSV
subm = test[["PredictionIdx", "TARGET"]]
subm.columns = ["PredictionIdx", "CustomerInterest"]
subm.to_csv("../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_xgb_2stage.csv", 
            index = False, float_format = "%.8f")