# 1. SETTINGS

In [None]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr
import random

In [None]:
# garbage collection
import gc

# switch the cyclic garbage collector on unless it is already running
if not gc.isenabled():
    gc.enable()

In [None]:
# warnings
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# from pandas / scikit-learn - consider narrowing it while debugging.
warnings.filterwarnings("ignore")

In [None]:
# pandas options: never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

# 2. IMPORT

In [None]:
# data name (used for saving files too)
# Base name (no extension) of the prepared data set; the import cell below
# turns it into "../data/prepared/<data_name>.csv".
data_name = "data_v4_0_60_under"

In [None]:
# import datasets: read the gzip-compressed prepared CSV selected by `data_name`
data = pd.read_csv("../data/prepared/{}.csv".format(data_name), compression="gzip")

In [None]:
# data name (used for saving files too)
# From here on `data_name` is only a tag: it is embedded in the names of the
# validation-prediction and submission CSVs written by the modeling cells.
data_name = "data_v4_0_60_under_wlp_lm_bm_mff"

In [None]:
# drop high recency from train: keep every week-121 row, and from the other
# weeks only rows whose Recency1 is strictly below the overall maximum
data = data.loc[data.Week.eq(121) | data.Recency1.lt(data.Recency1.max())]
data.shape

# 3. PREPARATIONS

### ADD MORE FEATURES

In [None]:
# compute BuySell dummy: 1 where BuySell == "Buy", 0 everywhere else.
# Built in a single vectorised assignment. The previous chained form
# `data["Buy"][mask] = 1` writes into an intermediate Series and is not
# guaranteed to reach `data` (it is a silent no-op under pandas copy-on-write).
data["Buy"] = (data.BuySell == "Buy").astype(int)
print(data.shape)

In [None]:
# add last week sums of CustomerInterest, first per customer (CUST), then per
# bond (BOND). Each weekly total is shifted forward by one week before the
# merge, so a row receives the total of the week preceding its own.
for key, feature in (("CustomerIdx", "LastWeekCustSum"),
                     ("IsinIdx", "LastWeekBondSum")):
    tmp = data.sort_values(by="Week", ascending=True)
    tmp = tmp.groupby([key, "Week"], as_index=True)["CustomerInterest"].sum().reset_index()
    tmp.columns = [key, "Week", feature]
    tmp["Week"] = tmp["Week"] + 1
    data = data.merge(tmp, how="left", on=[key, "Week"])
    print(data.shape)

In [None]:
### ADD PRICE AND NOTIONAL

tmp = pd.read_csv("../data/prepared/price_notional.csv", compression="gzip")

# add last week price sum and last week notional sum (both CUST): weekly totals
# per customer, shifted forward one week and left-joined onto `data`
for value_col, feature in (("Price", "LastWeekCustPriceSum"),
                           ("NotionalEUR", "LastWeekCustNotionalSum")):
    tmp2 = tmp.sort_values(by="Week", ascending=True)
    tmp2 = tmp2.groupby(["CustomerIdx", "Week"], as_index=True)[value_col].sum().reset_index()
    tmp2.columns = ["CustomerIdx", "Week", feature]
    tmp2["Week"] = tmp2["Week"] + 1
    data = data.merge(tmp2, how="left", on=["CustomerIdx", "Week"])
    print(data.shape)

In [None]:
### ADD LAST MONTH SUMS

# merge month number: build a Week -> CumMonth lookup from the distinct trade
# dates (CumMonth counts months from January 2016 = 1)
tmp = pd.read_csv("../data/raw/Trade.csv")[["TradeDateKey"]]
tmp = tmp.drop_duplicates()
tmp["TradeDateKey"] = pd.to_datetime(tmp["TradeDateKey"], format = '%Y%m%d')
# ISO week number taken from datetime.isocalendar(); `Series.dt.week` gave the
# same value but was deprecated in pandas 1.1 and removed in 2.0
iso_week = tmp.TradeDateKey.map(lambda day: day.isocalendar()[1])
tmp["Week"] = (tmp.TradeDateKey.dt.year - 2016) * 52 + iso_week
tmp["CumMonth"] = (tmp.TradeDateKey.dt.year - 2016) * 12 + (tmp.TradeDateKey.dt.month)
tmp = tmp[["Week", "CumMonth"]]
tmp = tmp.drop_duplicates()
# a week straddling two months is assigned to the earlier one
tmp = tmp.groupby("Week").CumMonth.min().reset_index()
data = data.merge(tmp, how = "left", on = "Week")
# week 121 gets month 28 by hand (presumably it has no rows in Trade.csv -
# TODO confirm). `.loc` writes into `data` itself, unlike the former chained
# `data["CumMonth"][mask] = 28`.
data.loc[data.Week == 121, "CumMonth"] = 28

# add last month sum (CUST) and last month sum (BOND): monthly totals of
# CustomerInterest, shifted forward one month and left-joined onto `data`
for key, feature in (("CustomerIdx", "LastMonthCustSum"),
                     ("IsinIdx", "LastMonthBondSum")):
    tmp = data.sort_values(by = ["CumMonth"], ascending = True).groupby([key, "CumMonth"], as_index = True)
    tmp = tmp.CustomerInterest.sum().reset_index()
    tmp.columns = [key, "CumMonth", feature]
    tmp["CumMonth"] = tmp.CumMonth + 1
    data = data.merge(tmp, how = "left", on = [key, "CumMonth"])
    print(data.shape)

# drop month (only needed as a join key)
del data["CumMonth"]

In [None]:
### ADD BOND MATURITY DATA

# import
bond = pd.read_csv("../data/prepared/data_bond_v1.csv", compression = "gzip")
bond = bond[['IsinIdx', 'MaturityWeek', 'IssueWeek']]

# merge
data = data.merge(bond, on = "IsinIdx", how = "left")

# Share of the issue-to-maturity span that has elapsed at the row's week.
# This has to be computed BEFORE the two columns are overwritten below:
# previously it ran last and divided already-differenced values, which does
# not give the elapsed share. A zero-length span yields +/-inf here.
data["MaturityPercent"] = (data.Week - data.IssueWeek) / (data.MaturityWeek - data.IssueWeek)

# compute week differences (replaces the merged week values in place)
data["MaturityWeek"]    = data.MaturityWeek - data.Week    # weeks left until maturity
data["IssueWeek"]       = data.Week - data.IssueWeek       # weeks since issue
print(data.shape)

In [None]:
### FAVORITE FEATURES

# bond master data, with sparse categories collapsed
bond  = pd.read_csv("../data/raw/Isin.csv")
bond.loc[~bond.Seniority.isin(["GOV","SEC","SEN","SUB"]), "Seniority"] = "OTHER"
bond.loc[~bond.Activity.isin(["Asia","RETAIL","GBP SAS", "ARGENTINIA"]), "Activity"] = "OTHER"

# currencies with fewer than 350 rows in Isin.csv are pooled
currency_counts = bond.groupby("Currency").size()
bond.loc[bond.Currency.isin(currency_counts[currency_counts < 350].index.tolist()), "Currency"] = "OTHER"

# every rating containing "C" or "D" becomes "LOWER"; na=False keeps missing
# ratings out of the mask instead of producing an NA-valued boolean indexer
bond.loc[bond.CompositeRating.str.contains("C|D", na = False), "CompositeRating"] = "LOWER"

# Risk captain: values with fewer than 200 rows are pooled
captain_counts = bond.groupby("RiskCaptain").size()
bond.loc[bond.RiskCaptain.isin(captain_counts[captain_counts < 200].index.tolist()), "RiskCaptain"] = "OTHER"

bond = bond[[
       'IsinIdx', "TickerIdx",
       'Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']]

# one "<attribute>_mode" column per attribute; presumably each customer's most
# frequent value of that attribute - TODO confirm against the file's producer
favor = pd.read_csv("../data/prepared/favorite_bonds_v3_60.csv", compression = "gzip")

data = data.merge(bond,  how = "left")
data = data.merge(favor, how = "left")

# Turn every "<attribute>_mode" column into a 0/1 flag: 1 when the row's own
# attribute equals the value stored in the mode column.
# A single comparison per attribute replaces the former two-step chained
# assignment, which
#   * tested the raw attribute instead of the flag for TickerIdx,
#     IndustrySector, IndustrySubgroup and MarketIssue, zeroing genuine matches;
#   * could not tell a flag of 1 from a favourite whose real value is 1.
# Missing values on either side compare unequal and therefore give 0.
favorite_attributes = ["BuySell", "TickerIdx", "IsinIdx",
                       "Seniority", "Currency", "ActivityGroup", "Region", "Activity",
                       "RiskCaptain", "Owner", "CompositeRating", "IndustrySector",
                       "IndustrySubgroup", "MarketIssue", "CouponType"]
for var in favorite_attributes:
    data[var + "_mode"] = (data[var] == data[var + "_mode"]).astype(int)

# the raw bond attributes were only needed for the comparison
excluded_features = ['Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType', "TickerIdx"]
features = [var for var in data.columns if var not in excluded_features]
data = data[features]

# make sure every "_mode" column is integer-typed
modes = list(data.filter(like = "_mode").columns)
for var in modes:
    data[var] = data[var].astype(int)

# number of attributes on which the row matches the stored favourite
data["NumFavorites"] = data.filter(like = "_mode").sum(axis = 1)

print(data.shape)

### CHECKS AND PREPARATIONS

In [None]:
# check data: report (rows, columns), then preview the first rows
dimensions = data.shape
print("Dimensions:", dimensions)
data.head()

In [None]:
# check NA: per-column count of missing values, showing only affected columns
nas = data.isna().sum()
nas.loc[nas.gt(0)]

In [None]:
# treat +inf / -inf as missing so the imputation below also covers them
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# columns whose missing values get imputed by the next cell (first group)
col_with_na = [
    'MeanPrice', 'StdPrice',
    'MeanYield', 'StdYield',
    'MeanZScore', 'StdZScore',
    'YieldMarktDelta', 'ZScoreMarktDelta',
]

# engineered lag / maturity features imputed by the next cell (second group)
col_zero = [
    'LastWeekCustSum', 'LastWeekBondSum',
    'LastWeekCustPriceSum', 'LastWeekCustNotionalSum',
    'LastMonthCustSum', 'LastMonthBondSum',
    'MaturityPercent',
]

In [None]:
# Impute missing values with the column mean (NaN-skipping) and assign the
# result back to the frame. The former `data[col].fillna(..., inplace=True)`
# acted on an intermediate Series and may leave `data` untouched (it does so
# under pandas copy-on-write).
# NOTE(review): the name `col_zero` suggests a zero fill, but the original code
# used the mean for both groups; that behaviour is kept - confirm the intent.
for col in col_with_na + col_zero:
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# list of features: every column of `data`, in frame order, except the ones
# named in `excluded_features`
excluded_features = [
    "PredictionIdx", "CustomerIdx", "IsinIdx", "BuySell", "CustomerInterest",
    "Frequecny1isLowerFrequency2", "Frequecny2isLowerFrequency4",
]
features = data.columns[~data.columns.isin(excluded_features)].tolist()
len(features)

In [None]:
### DATA PARTITIONING

# training: all weeks before 120
X_train = data[data.Week < 120]
y_train = X_train.CustomerInterest

# validation: week 120
X_valid = data[data.Week == 120]
y_valid = X_valid.CustomerInterest

# test set: week 121
test = data[data.Week == 121]

In [None]:
### UPDATE FAVORITE FEATURES FOR TEST DATA

# discard the favourite flags inherited from `data`; they are rebuilt below
# from the rows with positive interest in the weeks before 121
test = test.drop(list(test.filter(like = "_mode").columns), axis = 1)
del test["NumFavorites"]

# bond master data, with sparse categories collapsed
bond  = pd.read_csv("../data/raw/Isin.csv")
bond.loc[~bond.Seniority.isin(["GOV","SEC","SEN","SUB"]), "Seniority"] = "OTHER"
bond.loc[~bond.Activity.isin(["Asia","RETAIL","GBP SAS", "ARGENTINIA"]), "Activity"] = "OTHER"

# currencies with fewer than 350 rows in Isin.csv are pooled
currency_counts = bond.groupby("Currency").size()
bond.loc[bond.Currency.isin(currency_counts[currency_counts < 350].index.tolist()), "Currency"] = "OTHER"

# every rating containing "C" or "D" becomes "LOWER"; na=False keeps missing
# ratings out of the mask instead of producing an NA-valued boolean indexer
bond.loc[bond.CompositeRating.str.contains("C|D", na = False), "CompositeRating"] = "LOWER"

# Risk captain: values with fewer than 200 rows are pooled
captain_counts = bond.groupby("RiskCaptain").size()
bond.loc[bond.RiskCaptain.isin(captain_counts[captain_counts < 200].index.tolist()), "RiskCaptain"] = "OTHER"

bond = bond[[
       'IsinIdx', "TickerIdx",
       'Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType']]

# positive-interest rows before the test week, enriched with bond attributes
tmp = data.loc[data.CustomerInterest == 1]
tmp = tmp.loc[tmp.Week < 121]
full = tmp.merge(bond, on = ["IsinIdx"], how = "left")

# Missing values become the literal "Other". The result is assigned back
# because the former per-column `full[var].fillna(..., inplace=True)` works on
# an intermediate Series and may leave `full` unchanged.
full = full.fillna("Other")

import scipy.stats  # not needed for the mode below any more; kept in case later cells use it

# Per customer, the most frequent value of each attribute. Series.mode()
# returns the modes in sorted order, so `.iloc[0]` picks the smallest on ties,
# matching what scipy.stats.mode did. It also works on string columns, which
# recent SciPy releases reject.
favorite_attributes = ["IsinIdx", "TickerIdx", "BuySell",
                       'Seniority', 'Currency', 'ActivityGroup', 'Region',
                       'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
                       'IndustrySubgroup', 'MarketIssue', 'CouponType']
favor = full[["CustomerIdx"] + favorite_attributes].groupby("CustomerIdx").agg(lambda x: x.mode().iloc[0])
favor.columns = favor.columns + "_mode"
favor = favor.reset_index()

### FAVORITE FEATURES

test = test.merge(bond,  how = "left")
test = test.merge(favor, how = "left")

# Turn every "<attribute>_mode" column into a 0/1 flag: 1 when the row's own
# attribute equals the customer's favourite.
# A single comparison per attribute replaces the former two-step chained
# assignment, which
#   * tested the raw attribute instead of the flag for TickerIdx,
#     IndustrySector, IndustrySubgroup and MarketIssue, zeroing genuine matches;
#   * could not tell a flag of 1 from a favourite whose real value is 1.
# Customers without a favourite (NaN after the left join) get 0.
for var in favorite_attributes:
    test[var + "_mode"] = (test[var] == test[var + "_mode"]).astype(int)

# the raw bond attributes were only needed for the comparison
excluded_features = ['Seniority', 'Currency', 'ActivityGroup', 'Region', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'CouponType', "TickerIdx"]
feats = [var for var in test.columns if var not in excluded_features]
test = test[feats]

# make sure every "_mode" column is integer-typed
modes = list(test.filter(like = "_mode").columns)
for var in modes:
    test[var] = test[var].astype(int)

# number of attributes on which the row matches the customer's favourite
test["NumFavorites"] = test.filter(like = "_mode").sum(axis = 1)

print(test.shape)

In [None]:
# check NA in test: per-column missing counts, showing only affected columns
nas = test.isna().sum()
nas.loc[nas.gt(0)]

In [None]:
# same NA check for the validation week
nas = X_valid.isna().sum()
nas.loc[nas.gt(0)]

# 4. MODELING - STAGE 1

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# number of trees passed as n_estimators to both ExtraTrees models
num_iters = 3000
# value passed as n_jobs to both ExtraTrees models
nj = 20

In [None]:
# stage-1 model: only the tree count, job count and verbosity are set;
# every other hyper-parameter stays at the library default
rf = ExtraTreesClassifier(
    n_estimators=num_iters,
    n_jobs=nj,
    verbose=1,
)

In [None]:
# fit the stage-1 model on the training weeks, using only the columns listed
# in `features`; week 120 stays out so it can serve as validation
rf.fit(X_train[features], y_train)

In [None]:
##### PREDICTION

### VALID DATA

# predict validation set: key columns plus the predicted probability of class 1
valid_scores = rf.predict_proba(X_valid[features])[:, 1]
pred_valid = X_valid[["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]].assign(TARGET=valid_scores)
auc = roc_auc_score(y_valid, pred_valid.TARGET)

# export CSV; the file name carries the AUC digits that follow "0."
auc_tag = str(round(auc, 6))[2:8]
pred_valid.to_csv("../pred_valid/auc{}_{}_exdt.csv".format(auc_tag, data_name),
                  index=False, float_format="%.8f")


### TEST DATA

# predict test set
test["TARGET"] = rf.predict_proba(test[features])[:, 1]

# export CSV in submission layout (PredictionIdx, CustomerInterest)
subm = test[["PredictionIdx", "TARGET"]].rename(columns={"TARGET": "CustomerInterest"})
subm.to_csv("../submissions/auc{}_{}_exdt_1stage.csv".format(auc_tag, data_name),
            index=False, float_format="%.8f")

# 5. MODELING - STAGE 2

In [None]:
# use full data as train: every labelled week, now including validation week 120
X_train = data[data.Week <= 120]
y_train = X_train.CustomerInterest

In [None]:
# stage-2 model, refitted on weeks <= 120.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" itself was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = ExtraTreesClassifier(n_estimators=num_iters, max_depth=None, min_samples_leaf=2,
                          max_features='sqrt', n_jobs=nj, verbose=1)
rf.fit(X_train[features], y_train)

In [None]:
# test set
# Reuse the `test` frame built in the "UPDATE FAVORITE FEATURES FOR TEST DATA"
# cell instead of re-slicing `data`: re-slicing threw away the favourite
# features refreshed for week 121 and fell back to the ones loaded from the
# prepared favourites file, so stage 2 scored different test features than
# stage 1. Reusing the frame also avoids assigning a column to a fresh slice.
# NOTE(review): if stage 2 was meant to use the un-refreshed features, restore
# `test = data.loc[data.Week == 121].copy()` here.
# predict test set (overwrites the stage-1 TARGET column)
test["TARGET"] = rf.predict_proba(test[features])[:, 1]

In [None]:
# export CSV in submission layout; the file name reuses the stage-1 validation
# AUC because stage 2 has no hold-out week of its own
subm = test[["PredictionIdx", "TARGET"]].rename(columns={"TARGET": "CustomerInterest"})
subm.to_csv("../submissions/auc{}_{}_exdt_2stage.csv".format(str(round(auc, 6))[2:8], data_name),
            index=False, float_format="%.8f")