# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats

In [2]:
# pandas options
# show every column when a DataFrame is displayed (None removes the column cap)
pd.options.display.max_columns = None

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection
# make sure the automatic garbage collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

# 2. NAIVE BENCHMARK

In [5]:
# import data
# train: historical trades; test: rows to be scored for the challenge date
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/Trade.csv",
        "../data/raw/Challenge_20180423.csv",
    )
)

In [6]:
# check all datasets
# for each frame: show the first three rows, then its shape;
# a dashed rule is printed between the two previews
for position, (label, frame) in enumerate([("Test data:", test), ("Train data:", train)]):
    if position:
        print("------------------------------")
    display(frame.head(3))
    print(label, frame.shape)

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,
1,c2cc6cc2a8,20180423,1856,9230,Buy,
2,a8e94f6344,20180423,1780,9157,Buy,


Test data: (484758, 6)
------------------------------


Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0


Train data: (6762021, 8)


In [7]:
# create target variable
# CustomerInterest is 1 for every trade row except those whose TradeStatus is
# "Holding", which get 0. The column is written in one vectorised assignment:
# the earlier chained form `train[col][mask] = 0` may assign into a temporary
# copy, so the zeros are not guaranteed to reach `train`.
train["CustomerInterest"] = (train["TradeStatus"] != "Holding").astype("int64")

In [13]:
# compute historical target ratio (ALL DATA)
# mean of CustomerInterest per (customer, bond, direction) over every trade row
cust_int = (
    train[["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
    .groupby(by=["CustomerIdx", "IsinIdx", "BuySell"], as_index=False)
    .mean()
)
cust_int.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,0,16471,Buy,1.0
1,0,16499,Buy,1.0
2,0,20601,Buy,1.0
3,0,20737,Buy,1.0
4,0,20909,Sell,1.0


In [12]:
# compute historical target ratio (2018 ONLY)
# TradeDateKey appears to be a YYYYMMDD integer (e.g. 20161207), so an
# inclusive lower bound of 20180101 keeps trades dated 1 January 2018 as well;
# a strict `>` comparison would silently drop that first day of the year.
cust_int_2018 = train[train["TradeDateKey"] >= 20180101]
cust_int_2018 = cust_int_2018[["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
# mean of CustomerInterest per (customer, bond, direction) within 2018
cust_int_2018 = cust_int_2018.groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False).mean()
cust_int_2018.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,0,24944,Sell,1.0
1,0,25992,Buy,1.0
2,0,26726,Sell,1.0
3,0,26793,Sell,1.0
4,0,27045,Buy,1.0


In [19]:
# average two ratios
# left-join the 2018 ratio onto the all-data ratio; the overlapping column gets
# the suffixes _x (all data) and _y (2018)
cust_int = cust_int.merge(
    cust_int_2018,
    on=["CustomerIdx", "IsinIdx", "BuySell"],
    how="left",
)
# a missing ratio counts as 0, so a triple with no 2018 rows gets half its all-data ratio
cust_int["CustomerInterest"] = (
    cust_int[["CustomerInterest_x", "CustomerInterest_y"]].fillna(0).sum(axis=1) / 2
)
cust_int.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,CustomerInterest_x,CustomerInterest_y,CustomerInterest
0,0,16471,Buy,1.0,,0.5
1,0,16499,Buy,1.0,,0.5
2,0,20601,Buy,1.0,,0.5
3,0,20737,Buy,1.0,,0.5
4,0,20909,Sell,1.0,,0.5
5,0,20910,Buy,1.0,,0.5
6,0,21852,Buy,1.0,,0.5
7,0,21856,Buy,1.0,,0.5
8,0,22294,Sell,1.0,,0.5
9,0,22331,Buy,1.0,,0.5


In [20]:
# merge to test
# Drop the empty placeholder target, plus any ratio columns left behind by a
# previous run of this cell, before joining. With errors="ignore" the cell can
# be re-executed without raising on a missing column and without accumulating
# re-suffixed duplicates of CustomerInterest_x / CustomerInterest_y.
test = test.drop(
    columns=["CustomerInterest", "CustomerInterest_x", "CustomerInterest_y"],
    errors="ignore",
)
# left join keeps every test row; triples never seen in the trade history get NaN
test = test.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
test.head()

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest_x,CustomerInterest_y,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,,,
1,c2cc6cc2a8,20180423,1856,9230,Buy,0.416667,1.0,0.708333
2,a8e94f6344,20180423,1780,9157,Buy,,,
3,758bae1e35,20180423,2129,9131,Buy,,,
4,02ab378ee8,20180423,1758,7151,Buy,0.0,,0.0


In [22]:
# fill NA (no customer-bond interactions)
# Assign the filled column back explicitly. Calling fillna(..., inplace=True)
# on `test.CustomerInterest` operates on an intermediate Series and, under
# pandas Copy-on-Write, leaves `test` unchanged.
test["CustomerInterest"] = test["CustomerInterest"].fillna(0)

In [23]:
# export CSV
# the submission holds only the prediction id and the predicted interest,
# written without the index and with 8 decimal places
subm = test.loc[:, ["PredictionIdx", "CustomerInterest"]]
subm.to_csv(
    "../submissions/naive_mean.csv",
    float_format="%.8f",
    index=False,
)