# 1. SETTINGS

In [74]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr
import random

In [75]:
# garbage collection
import gc
gc.enable()

In [76]:
# warnings
import warnings
warnings.filterwarnings("ignore")

In [77]:
# pandas options
pd.set_option("display.max_columns", None)

# 2. IMPORT

In [78]:
# data name (used for saving files too)
data_name = "data_v4_0_60_under"

In [79]:
# import datasets
data = pd.read_csv("../data/prepared/" + str(data_name) + ".csv", compression = "gzip")

In [80]:
# data name (used for saving files too)
data_name = "data_v4_0_60_under_wlp_lm_bm"

In [81]:
# drop high recency from train
#data = data[(data.Week == 121) | (data.Recency2 < 27)]
data = data[(data.Week == 121) | (data.Recency1 < data.Recency1.max())]
data.shape

(2668912, 45)

# 3. PREPARATIONS

### ADD MORE FEATURES

In [82]:
# compute BuySell dummy
data["Buy"] = 0
data["Buy"][data.BuySell == "Buy"] = 1
print(data.shape)

(2668912, 45)


In [83]:
### RF Ratios

#data["R12_ratio"] = data.Recency1 / data.Recency2
#data["R13_ratio"] = data.Recency1 / data.Recency3
#data["R14_ratio"] = data.Recency1 / data.Recency4
#data["R56_ratio"] = data.Recency5 / data.Recency6

#data["F12_ratio"] = data.Frequency1 / data.Frequency2
#data["F13_ratio"] = data.Frequency1 / data.Frequency3
#data["F14_ratio"] = data.Frequency1 / data.Frequency4
#data["F56_ratio"] = data.Frequency5 / data.Frequency6

#data.shape

In [84]:
# add last week sum (CUST)
tmp = data.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["CustomerIdx", "Week", "LastWeekCustSum"]
tmp["Week"] = tmp.Week + 1
data = data.merge(tmp, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

# add last week sum (BOND)
tmp = data.sort_values(by = ["Week"], ascending = True).groupby(["IsinIdx", "Week"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["IsinIdx", "Week", "LastWeekBondSum"]
tmp["Week"] = tmp.Week + 1
data = data.merge(tmp, how = "left", on = ["IsinIdx", "Week"])
print(data.shape)

(2668912, 46)
(2668912, 47)


In [85]:
### ADD PRICE AND NOTIONAL

tmp = pd.read_csv("../data/prepared/price_notional.csv", compression = "gzip")

# add last week price sum (CUST)
tmp2 = tmp.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp2 = tmp2.Price.sum().reset_index()
tmp2.columns = ["CustomerIdx", "Week", "LastWeekCustPriceSum"]
tmp2["Week"] = tmp2.Week + 1
data = data.merge(tmp2, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

# add last week notional sum (CUST)
tmp2 = tmp.sort_values(by = ["Week"], ascending = True).groupby(["CustomerIdx", "Week"], as_index = True)
tmp2 = tmp2.NotionalEUR.sum().reset_index()
tmp2.columns = ["CustomerIdx", "Week", "LastWeekCustNotionalSum"]
tmp2["Week"] = tmp2.Week + 1
data = data.merge(tmp2, how = "left", on = ["CustomerIdx", "Week"])
print(data.shape)

(2668912, 48)
(2668912, 49)


In [86]:
### ADD CUSTOMER INDICATORS

#cust  = pd.read_csv("../data/raw/Customer.csv")
#cust.head()

# create dummies for customers
#cust_dummies = pd.concat([cust.CustomerIdx, pd.get_dummies(cust.Subsector)], axis = 1)

# merge customer dummies
#data = data.merge(cust_dummies, on = "CustomerIdx", how = "left")
#print(data.shape)

In [87]:
### ADD BOND FEATURES [V1]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat1_AK")

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", "t", "timeOnMarket", "percentoflifeleft"]], how = "left", on = "IsinIdx")
#print(data.shape)

In [88]:
### ADD NEW BOND FEATURES [V2]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat2_AK")

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", 'ymeansq', 'ymeancube', 'zmeansq', 'zmeancube',
#                             'years_tomaturity', 'count', 'ratingspread', 'ratingZspread']], 
#                  how = "left", on = "IsinIdx")
#print(data.shape)

In [89]:
### ADD BOND FEATURES [V3]

# load bond data
#bond_stat = pd.read_csv("../data/prepared/bondstat3_AK")
#bond_stat.head()

# merge bond data
#data = data.merge(bond_stat[["IsinIdx", 'predyield', 'predprice']], how = "left", on = "IsinIdx")
#print(data.shape)

In [90]:
### BOND-SPECIFIC FINANCIAL INDICATORS

#bond  = pd.read_csv("../data/raw/Isin.csv")

# create bond dummies
#bond_dummies = pd.concat([bond.IsinIdx,
#                          pd.get_dummies(bond.IndustrySector), 
#                          pd.get_dummies(bond.Region),
#                          pd.get_dummies(bond.MarketIssue), 
#                          pd.get_dummies(bond.CouponType)], axis = 1)

# merge bond dummies
#data = data.merge(bond_dummies, on = "IsinIdx", how = "left")
#print(data.shape)

In [91]:
### ADD NEW NIKITA FEATURES

# import data
#cust    = pd.read_csv("../data/prepared/cust_nik.csv",   compression = "gzip")
#bond    = pd.read_csv("../data/prepared/bond_nik.csv",   compression = "gzip")
#market  = pd.read_csv("../data/prepared/market_nik.csv", compression = "gzip")

#cust = cust[["CustomerIdx", "Region"]]
#bond = bond[["IsinIdx", "ActualMaturityDateKey", "IssueDateKey"]]

# merge data
#print(data.shape)
#data = data.merge(cust, on = "CustomerIdx", how = "left")
#print(data.shape)
#data = data.merge(bond, on = "IsinIdx", how = "left")
#print(data.shape)
#data = data.merge(market, on = ["IsinIdx", "Week"], how = "left")
#print(data.shape)

# compute day differences
#data["ActualMaturityDateKey"] = data["ActualMaturityDateKey"] - data["Week"]
#data["IssueDateKey"]          = data["IssueDateKey"] - data["Week"]

In [92]:
### ADD LAST MONTH SUMS

# merge month number
tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]
tmp = tmp.drop_duplicates()
tmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = '%Y%m%d')
tmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)
tmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)
tmp = tmp[["Week", "CumMonth"]]
tmp = tmp.drop_duplicates()
tmp = tmp.groupby("Week").CumMonth.min().reset_index()
data = data.merge(tmp, how = "left", on = "Week")
data["CumMonth"][data.Week == 121] = 28

# add last month mean (CUST)
tmp = data.sort_values(by = ["CumMonth"], ascending = True).groupby(["CustomerIdx", "CumMonth"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["CustomerIdx", "CumMonth", "LastMonthCustSum"]
tmp["CumMonth"] = tmp.CumMonth + 1
data = data.merge(tmp, how = "left", on = ["CustomerIdx", "CumMonth"])
print(data.shape)

# add last month mean (BOND)
tmp = data.sort_values(by = ["CumMonth"], ascending = True).groupby(["IsinIdx", "CumMonth"], as_index = True)
tmp = tmp.CustomerInterest.sum().reset_index()
tmp.columns = ["IsinIdx", "CumMonth", "LastMonthBondSum"]
tmp["CumMonth"] = tmp.CumMonth + 1
data = data.merge(tmp, how = "left", on = ["IsinIdx", "CumMonth"])
print(data.shape)

# drop month
del data["CumMonth"]

(2668912, 51)
(2668912, 52)


In [93]:
### ADD BOND MATURITY DATA

# import
bond = pd.read_csv("../data/prepared/data_bond_v1.csv", compression = "gzip")
bond = bond[['IsinIdx', 'MaturityWeek', 'IssueWeek']]

# merge
data = data.merge(bond, on = "IsinIdx", how = "left")

# compute week differences
data["MaturityWeek"]    = data.MaturityWeek - data.Week
data["IssueWeek"]       = data.Week - data.IssueWeek
data["MaturityPercent"] = (data.Week - data.IssueWeek) / (data.MaturityWeek - data.IssueWeek)
print(data.shape)

(2668912, 54)


In [94]:
### MERGE WEEKLY MACRO VARIABLES

# import
#fx1   = pd.read_csv("../data/prepared/data_fx1_v1.csv",  compression = "gzip")
#fx2   = pd.read_csv("../data/prepared/data_fx2_v1.csv",  compression = "gzip")
#ind1  = pd.read_csv("../data/prepared/data_ind1_v1.csv", compression = "gzip")
#ind2  = pd.read_csv("../data/prepared/data_ind2_v1.csv", compression = "gzip")

# merge currency
#bond = pd.read_csv("../data/raw/Isin.csv")
#data = data.merge(bond[["IsinIdx", "Currency"]], on = "IsinIdx")

# merge macro data
#data = data.merge(fx1,  how = 'left', on = ["Week",  "Currency"])
#print(data.shape)
#data = data.merge(fx2,  how = 'left', on = ["Month", "Currency"])
#print(data.shape)
#data = data.merge(ind1, how = 'left', on = "Week")
#print(data.shape)
#data = data.merge(ind2, how = 'left', on = "Month")
#print(data.shape)

# there are a few currencies for which we don't have info, e.g. CNH
#data[fx1.columns] = data[fx1.columns].fillna(0)
#data[fx2.columns] = data[fx2.columns].fillna(0)

# remove currency
#data.drop("Currency", axis = 1, inplace = True)
#print(data.shape)

In [95]:
### ADD NEW WEEKLY AND MONTHLY MARKET FEATURES

# merge month number
'''tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]
tmp = tmp.drop_duplicates()
tmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = '%Y%m%d')
tmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)
tmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)
tmp = tmp[["Week", "CumMonth"]]
tmp = tmp.drop_duplicates()
tmp = tmp.groupby("Week").CumMonth.min().reset_index()
data = data.merge(tmp, how = "left", on = "Week")
data["CumMonth"][data.Week == 121] = 28

# import data
market  = pd.read_csv("../data/prepared/data_market_v1.csv", compression = "gzip")
market = market.rename(columns = {"Month": "CumMonth"})
market.CumMonth = market.groupby(["IsinIdx", "Week"]).CumMonth.min()
keep = market[["IsinIdx", "Week", "CumMonth"]].drop_duplicates().index
market = market[market.index.isin(keep)]
data = data.merge(market, how = "left", on = ["IsinIdx", "Week", "CumMonth"])
del data["CumMonth"]
print(data.shape)'''

'tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]\ntmp = tmp.drop_duplicates()\ntmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = \'%Y%m%d\')\ntmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + (tmp.TradeDateKey.dt.week)\ntmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)\ntmp = tmp[["Week", "CumMonth"]]\ntmp = tmp.drop_duplicates()\ntmp = tmp.groupby("Week").CumMonth.min().reset_index()\ndata = data.merge(tmp, how = "left", on = "Week")\ndata["CumMonth"][data.Week == 121] = 28\n\n# import data\nmarket  = pd.read_csv("../data/prepared/data_market_v1.csv", compression = "gzip")\nmarket = market.rename(columns = {"Month": "CumMonth"})\nmarket.CumMonth = market.groupby(["IsinIdx", "Week"]).CumMonth.min()\nkeep = market[["IsinIdx", "Week", "CumMonth"]].drop_duplicates().index\nmarket = market[market.index.isin(keep)]\ndata = data.merge(market, how = "left", on = ["IsinIdx", "Week", "CumMonth"])\ndel data["CumMonth"]\nprin

In [96]:
### FAVORITE FEATURES

bond  = pd.read_csv("../data/raw/Isin.csv")
bond.loc[~bond.Seniority.isin(["GOV","SEC","SEN","SUB"]), "Seniority"] = "OTHER"
bond.loc[~bond.Activity.isin(["Asia","RETAIL","GBP SAS", "ARGENTINIA"]), "Activity"] = "OTHER"
bond.loc[bond.Currency.isin(
    (bond.groupby("Currency").size())[bond.groupby("Currency").size()<350].index.tolist()
), "Currency"] = "OTHER"

bond.loc[bond.CompositeRating.str.contains("C|D"), "CompositeRating"] = "LOWER"
# Risk captain
bond.loc[bond.RiskCaptain.isin(
    (bond.groupby("RiskCaptain").size())[bond.groupby("RiskCaptain").size()<200].index.tolist()
), "RiskCaptain"] = "OTHER"

bond = bond[[
       'IsinIdx', 
       'Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']]

favor = pd.read_csv("../data/prepared/favorite_bonds_v2_60.csv", compression = "gzip")

data = data.merge(bond,  how = "left")
data = data.merge(favor, how = "left")

data["BuySell_mode"][data.BuySell == data["BuySell_mode"]] = 1
data["BuySell_mode"][data.BuySell_mode != 1] = 0

data["IsinIdx_mode"][data.IsinIdx == data["IsinIdx_mode"]] = 1
data["IsinIdx_mode"][data.IsinIdx_mode != 1] = 0

data["Seniority_mode"][data.Seniority == data["Seniority_mode"]] = 1
data["Seniority_mode"][data.Seniority_mode != 1] = 0

data["ActivityGroup_mode"][data.ActivityGroup == data["ActivityGroup_mode"]] = 1
data["ActivityGroup_mode"][data.ActivityGroup_mode != 1] = 0

data["Region_mode"][data.Region == data["Region_mode"]] = 1
data["Region_mode"][data.Region_mode != 1] = 0

data["Activity_mode"][data.Activity == data["Activity_mode"]] = 1
data["Activity_mode"][data.Activity_mode != 1] = 0

data["RiskCaptain_mode"][data.RiskCaptain == data["RiskCaptain_mode"]] = 1
data["RiskCaptain_mode"][data.RiskCaptain_mode != 1] = 0

data["Owner_mode"][data.Owner == data["Owner_mode"]] = 1
data["Owner_mode"][data.Owner_mode != 1] = 0

data["CompositeRating_mode"][data.CompositeRating == data["CompositeRating_mode"]] = 1
data["CompositeRating_mode"][data.CompositeRating_mode != 1] = 0

data["Currency_mode"][data.Currency == data["Currency_mode"]] = 1
data["Currency_mode"][data.Currency_mode != 1] = 0

data["CouponType_mode"][data.CouponType == data["CouponType_mode"]] = 1
data["CouponType_mode"][data.CouponType_mode != 1] = 0

excluded_features = ['Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']
features = [var for var in data.columns if var not in excluded_features]
data = data[features]

print(data.shape)

(2668912, 65)


### CHECKS AND PREPARATIONS

In [97]:
# check data
print("Dimensions:", data.shape)
data.head()

Dimensions: (2668912, 65)


Unnamed: 0,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Week,Recency1,Recency2,Recency3,Recency4,Recency5,Recency6,Recency1isLowerRecency2,Recency2isLowerRecency4,Frequency1,Frequency2,Frequency3,Frequency4,Frequency5,Frequency6,Frequecny1isLowerFrequency2,Frequecny2isLowerFrequency4,Month,Buy,Asset Managers & Hedge Funds,Asset Owners,Banks and Intermediaries,Corporation,Official Institution - OI,Americas,Asia Pacific,"Europe, Middle East and Africa",FLOW G10,FLOW LOCAL MARKET,SAS & COVERED BONDS,NR,Currency_trend,MeanPrice,StdPrice,MeanYield,StdYield,MeanZScore,StdZScore,YieldMarktDelta,ZScoreMarktDelta,LastWeekCustSum,LastWeekBondSum,LastWeekCustPriceSum,LastWeekCustNotionalSum,LastMonthCustSum,LastMonthBondSum,MaturityWeek,IssueWeek,MaturityPercent,IsinIdx_mode,BuySell_mode,Seniority_mode,Currency_mode,ActivityGroup_mode,Region_mode,Activity_mode,RiskCaptain_mode,Owner_mode,CompositeRating_mode,CouponType_mode
0,,2554,12022,Sell,0.0,68,2.0,2.0,1.0,1.0,2.0,2.0,1,0,0.014925,0.029851,43.716418,80.791045,0.134328,0.208955,1,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1.0,117.319623,4.613998,4.468827,0.354249,2.207733,0.545221,1.4187,0.58415,27.0,0.0,5224.483,72245120.0,173.0,1.0,930,630,-1.873333,0.0,1,1,0,0,0,1,0,0,0,1
1,,1997,23143,Sell,0.0,112,71.0,71.0,1.0,1.0,1.0,1.0,1,0,0.009009,0.009009,4.333333,6.252252,0.990991,2.315315,1,1,3,0,0,1,0,0,0,0,0,1,1,0,0,0,-0.00752,102.526643,8.867004,2.470258,2.011795,2.34419,1.979572,-0.579869,0.720607,0.0,5.0,1000099.0,1400000.0,14.0,7.0,202,161,-1.195122,0.0,1,1,1,1,1,1,1,0,0,1
2,,2743,5570,Buy,0.0,99,88.0,88.0,1.0,1.0,6.0,4.0,1,0,0.020408,0.020408,62.642857,106.0,0.77551,1.387755,1,1,5,1,1,0,0,0,0,0,0,1,1,0,0,0,1.0,102.62727,2.228127,3.838438,0.347182,1.876163,0.581684,0.788311,0.25258,57.0,0.0,9977.553,136203100.0,441.0,2.0,359,161,-0.313131,0.0,1,0,1,1,1,1,0,0,0,1
3,,1307,23023,Buy,0.0,91,82.0,82.0,1.0,1.0,20.0,2.0,1,0,0.011111,0.011111,1.288889,2.233333,0.5,0.977778,1,1,7,1,0,0,1,0,0,0,0,1,1,0,0,0,-0.00294,101.01728,0.204407,-0.036528,0.193049,0.16106,0.150582,-3.086655,-1.462523,1.0,0.0,522.722,3768715.0,2.0,0.0,60,148,0.647727,0.0,1,1,1,1,1,1,0,0,0,1
4,,2107,5204,Sell,0.0,73,2.0,2.0,1.0,1.0,1.0,1.0,1,0,0.027778,0.027778,9.5,21.541667,0.319444,0.777778,1,1,12,0,1,0,0,0,0,1,0,0,0,1,0,0,1.0,110.170463,2.44328,5.958544,0.341985,3.63793,0.340257,2.908417,2.014346,3.0,4.0,5547.505,30714590.0,27.0,1.0,464,6,0.146288,0.0,0,0,1,0,1,1,0,0,0,1


In [98]:
# check NA
nas = data.isnull().sum()
nas[nas > 0]

PredictionIdx              2184154
CustomerInterest            484758
MeanPrice                   206495
StdPrice                    206601
MeanYield                   206495
StdYield                    206601
MeanZScore                  206495
StdZScore                   206601
YieldMarktDelta             206495
ZScoreMarktDelta            206495
LastWeekCustSum              53095
LastWeekBondSum             439429
LastWeekCustPriceSum        159194
LastWeekCustNotionalSum     159194
LastMonthCustSum             32568
LastMonthBondSum            122039
MaturityPercent                  3
dtype: int64

In [99]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [100]:
col_with_na = ['MeanPrice', 'StdPrice',
       'MeanYield', 'StdYield', 'MeanZScore', 'StdZScore', 'YieldMarktDelta',
       'ZScoreMarktDelta']
col_zero =  ['LastWeekCustSum', 'LastWeekBondSum',
       'LastWeekCustPriceSum', 'LastWeekCustNotionalSum', 'LastMonthCustSum',
       'LastMonthBondSum', 'MaturityPercent']

In [113]:
for col in col_with_na:
    data[col].fillna(np.mean(data[col]), inplace=True)
    
for col in col_zero:
    data[col].fillna(0, inplace=True)

In [114]:
# list of features
excluded_features = ["PredictionIdx", "CustomerIdx", "IsinIdx", "BuySell", "CustomerInterest",
                     "Frequecny1isLowerFrequency2", "Frequecny2isLowerFrequency4"]
features = [var for var in data.columns if var not in excluded_features]
len(features)

58

In [115]:
for var in features:
    
    mmin = data[var].mean() - 5*data[var].std()
    mmax = data[var].mean() + 5*data[var].std()
    
    data[var] = data[var].clip(mmin, mmax)

In [116]:
data[features].describe()

Unnamed: 0,Week,Recency1,Recency2,Recency3,Recency4,Recency5,Recency6,Recency1isLowerRecency2,Recency2isLowerRecency4,Frequency1,Frequency2,Frequency3,Frequency4,Frequency5,Frequency6,Month,Buy,Asset Managers & Hedge Funds,Asset Owners,Banks and Intermediaries,Corporation,Official Institution - OI,Americas,Asia Pacific,"Europe, Middle East and Africa",FLOW G10,FLOW LOCAL MARKET,SAS & COVERED BONDS,NR,Currency_trend,MeanPrice,StdPrice,MeanYield,StdYield,MeanZScore,StdZScore,YieldMarktDelta,ZScoreMarktDelta,LastWeekCustSum,LastWeekBondSum,LastWeekCustPriceSum,LastWeekCustNotionalSum,LastMonthCustSum,LastMonthBondSum,MaturityWeek,IssueWeek,MaturityPercent,IsinIdx_mode,BuySell_mode,Seniority_mode,Currency_mode,ActivityGroup_mode,Region_mode,Activity_mode,RiskCaptain_mode,Owner_mode,CompositeRating_mode,CouponType_mode
count,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0,2668912.0
mean,97.86913,37.18308,26.64048,1.358641,1.16038,5.60552,3.968954,0.7607321,0.08186632,0.02435919,0.03914357,43.9211,86.12,0.649803,1.268837,5.373084,0.5112323,0.6617363,0.039642,0.2784828,0.0001352629,0.008234574,0.4158837,0.05398042,0.5301359,0.7117781,0.2059041,0.08231781,0.1000265,0.5876176,103.8518,2.384195,3.523718,5.616164,17.48185,60.86593,0.4735906,15.85827,77.24105,1.07344,90707.8,278391400.0,318.2895,4.235358,356.4957,184.0519,0.2944751,0.0005867062,0.5570157,0.7513496,0.8313848,0.7615096,0.7417154,0.9989068,0.4105474,0.1729075,0.1689823,0.8835106
std,18.92735,34.02015,25.82139,1.54849,0.7964945,10.58059,8.446211,0.4266367,0.274161,0.03000516,0.05227209,71.73528,141.9835,0.6158205,1.189379,3.261264,0.4998739,0.4731189,0.1951167,0.4482524,0.002813165,0.06111244,0.4928738,0.2259791,0.4990911,0.452935,0.4043608,0.2748484,0.3000354,0.491885,7.217017,2.040716,16.49674,92.7988,402.0698,1453.63,16.49674,402.0698,142.5526,1.713911,1084139.0,752394800.0,595.9914,5.703844,385.5565,160.2801,3.049401,0.008457246,0.4967387,0.4322308,0.3744117,0.4261606,0.4376914,0.01348538,0.4919333,0.3781674,0.3747364,0.320811
min,61.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.008333333,0.0,0.008333333,0.0,0.008333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.12096,67.04698,0.0,-487.623,0.0,-99.36235,0.0,-490.6731,-100.9859,0.0,0.0,-6715030.0,0.0,0.0,0.0,-145.0,-42.0,-20.72693,0.0,0.0,0.0,0.0,0.0,0.0,0.8325525,0.0,0.0,0.0,0.0
25%,82.0,9.0,6.0,1.0,1.0,1.0,1.0,1.0,0.0,0.009433962,0.01075269,4.396396,8.794393,0.2253521,0.45,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00568,100.1982,1.048255,1.293219,0.2251318,0.4329083,0.1585271,-1.756908,-1.190675,3.0,0.0,100.359,13224920.0,14.0,0.0,123.0,69.0,-0.095,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,100.0,26.0,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.01351351,0.01904762,15.68919,30.69167,0.4642857,0.9117647,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,102.5447,1.960794,2.735028,0.3268757,1.140602,0.3090316,-0.3150988,-0.4829808,17.0,0.0,1641.726,55328650.0,75.0,2.0,246.0,144.0,0.1670823,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
75%,116.0,57.0,41.0,1.0,1.0,4.0,2.0,1.0,0.0,0.02631579,0.04166667,50.32812,99.96721,0.8627451,1.666667,8.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,105.5513,2.825024,4.33686,0.5777157,2.540766,0.6666935,1.286733,0.9171827,77.0,2.0,13048.88,203611200.0,304.0,6.0,420.0,246.0,0.7622951,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
max,121.0,120.0,119.0,12.4489,6.981423,60.65593,49.02589,1.0,1.0,0.1816904,0.3122087,398.6528,762.1928,3.743134,7.244242,12.0,1.0,1.0,1.0,1.0,0.05864277,0.461777,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,140.6918,12.68771,498.9012,2768.617,16003.21,56850.26,495.8511,16001.58,801.3837,10.08229,6944332.0,5694029000.0,3322.613,34.10643,2291.066,986.2174,21.31741,0.1224961,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [117]:
### DATA PARTITIONING

# training
X_train = data.loc[data.Week < 120]
y_train = data.loc[data.Week < 120].CustomerInterest

# validation
X_valid = data.loc[data.Week == 120]
y_valid = data.loc[data.Week == 120].CustomerInterest

# test set
test = data.loc[data.Week == 121]

In [118]:
# check NA in test
nas = test.isnull().sum()
nas[nas > 0]

CustomerInterest    484758
dtype: int64

In [119]:
nas = X_valid.isnull().sum()
nas[nas > 0]

PredictionIdx    40437
dtype: int64

# 4. MODELING - STAGE 1

In [120]:
X_train[features].describe()

Unnamed: 0,Week,Recency1,Recency2,Recency3,Recency4,Recency5,Recency6,Recency1isLowerRecency2,Recency2isLowerRecency4,Frequency1,Frequency2,Frequency3,Frequency4,Frequency5,Frequency6,Month,Buy,Asset Managers & Hedge Funds,Asset Owners,Banks and Intermediaries,Corporation,Official Institution - OI,Americas,Asia Pacific,"Europe, Middle East and Africa",FLOW G10,FLOW LOCAL MARKET,SAS & COVERED BONDS,NR,Currency_trend,MeanPrice,StdPrice,MeanYield,StdYield,MeanZScore,StdZScore,YieldMarktDelta,ZScoreMarktDelta,LastWeekCustSum,LastWeekBondSum,LastWeekCustPriceSum,LastWeekCustNotionalSum,LastMonthCustSum,LastMonthBondSum,MaturityWeek,IssueWeek,MaturityPercent,IsinIdx_mode,BuySell_mode,Seniority_mode,Currency_mode,ActivityGroup_mode,Region_mode,Activity_mode,RiskCaptain_mode,Owner_mode,CompositeRating_mode,CouponType_mode
count,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0,2143717.0
mean,92.22109,33.69933,29.82196,1.353249,1.165033,6.002391,4.377038,0.8174069,0.08424573,0.02681437,0.04186655,48.82405,95.53752,0.7102198,1.382632,5.766068,0.5140525,0.6645877,0.03719008,0.2779173,0.0001350001,0.008313299,0.4303045,0.05319219,0.5165033,0.7128194,0.206267,0.08091367,0.09893237,0.596674,103.9965,2.435175,3.646985,6.336522,20.13491,70.86485,0.5968582,18.51133,81.63099,1.087682,74328.01,304040600.0,336.7044,4.313704,356.3241,188.7799,0.2889688,0.0006779315,0.5713198,0.7522817,0.8375868,0.7660083,0.7475404,0.9992521,0.4162574,0.17739,0.1715586,0.8875724
std,16.84889,28.0621,27.04833,1.524659,0.8067271,11.14241,9.154716,0.3863327,0.277756,0.03121955,0.05496171,76.0155,150.5329,0.6387655,1.233831,3.507142,0.4998026,0.4721345,0.1892274,0.4479725,0.002810438,0.06139854,0.4951188,0.2244166,0.4997277,0.4524467,0.4046245,0.2727025,0.2985713,0.491185,7.245019,2.050197,17.64138,98.64885,431.0445,1559.436,17.64138,431.0445,150.3673,1.768565,953306.9,809476800.0,629.2594,5.958899,390.559,161.5228,3.097731,0.009087595,0.4948875,0.4316874,0.3688295,0.4233671,0.4344236,0.01116586,0.4929374,0.3819985,0.3769964,0.3158919
min,61.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.008474576,0.008474576,0.008474576,0.008474576,0.008474576,0.008474576,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.12096,67.04698,0.0,-487.623,0.0,-99.36235,0.0,-490.6731,-100.9859,0.0,0.0,-6715030.0,0.0,0.0,0.0,-145.0,-42.0,-20.72693,0.0,0.0,0.0,0.0,0.0,0.0,0.8325525,0.0,0.0,0.0,0.0
25%,77.0,9.0,7.0,1.0,1.0,1.0,1.0,1.0,0.0,0.01041667,0.01176471,5.791304,11.25773,0.266055,0.5245902,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0032,100.332,1.09593,1.312373,0.235624,0.4449067,0.1720668,-1.737754,-1.178676,3.0,0.0,147.0,12869690.0,14.0,0.0,117.0,74.0,-0.1176471,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,94.0,27.0,22.0,1.0,1.0,2.0,1.0,1.0,0.0,0.01428571,0.02083333,18.86607,36.1875,0.5238095,1.027273,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,102.8829,2.040098,2.779273,0.3333432,1.187664,0.3306096,-0.2708538,-0.4359196,17.0,0.0,1875.425,55481050.0,77.0,2.0,243.0,148.0,0.1395833,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
75%,108.0,53.0,48.0,1.0,1.0,5.0,3.0,1.0,0.0,0.02816901,0.04545455,56.66964,111.2358,0.9438202,1.817073,9.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,105.6609,2.869183,4.441995,0.6425043,2.644782,0.731985,1.391868,1.021199,80.0,2.0,13280.57,212979700.0,318.0,6.0,421.0,251.0,0.7779851,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
max,119.0,118.0,118.0,12.4489,6.981423,60.65593,49.02589,1.0,1.0,0.1816904,0.3122087,398.6528,762.1928,3.743134,7.244242,12.0,1.0,1.0,1.0,1.0,0.05864277,0.461777,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,140.6918,12.68771,498.9012,2768.617,16003.21,56850.26,495.8511,16001.58,801.3837,10.08229,6944332.0,5694029000.0,3322.613,34.10643,2291.066,986.2174,21.31741,0.1224961,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [121]:
from sklearn.linear_model import LogisticRegression

In [122]:
nb = LogisticRegression(C=1.0)

In [123]:
nb.fit(X_train[features], y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [124]:
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]]
pred_valid["TARGET"] = nb.predict_proba(X_valid[features])[:,1]
auc = roc_auc_score(y_valid, pred_valid.TARGET)
auc

0.3483107920240308

In [82]:
##### PREDICTION

### VALID DATA

# predict validation set
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]]
pred_valid["TARGET"] = nb.predict_proba(X_valid[features])[:,1]
auc = roc_auc_score(y_valid, pred_valid.TARGET)

# export CSV
pred_valid.to_csv("../pred_valid/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_nb.csv", 
                  index = False, float_format = "%.8f")


### TEST DATA

# predict test set
test["TARGET"] = nb.predict_proba(test[features])[:,1]

# export CSV
subm = test[["PredictionIdx", "TARGET"]]
subm.columns = ["PredictionIdx", "CustomerInterest"]
subm.to_csv("../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_nb_1stage.csv", 
            index = False, float_format = "%.8f")

In [83]:
auc

0.6255990207875272

# 5. MODELING - STAGE 2

In [84]:
# use full data as train
X_train = data.loc[data.Week <= 120]
y_train = data.loc[data.Week <= 120].CustomerInterest

In [85]:
nb = GaussianNB()
nb.fit(X_train[features], y_train)

GaussianNB(priors=None)

In [86]:
# export CSV
subm = test[["PredictionIdx", "TARGET"]]
subm.columns = ["PredictionIdx", "CustomerInterest"]
subm.to_csv("../submissions/auc" + str(round(auc, 6))[2:8] + "_" + str(data_name) + "_nb_2stage.csv", 
            index = False, float_format = "%.8f")