# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats

In [2]:
# pandas options
pd.set_option("display.max_columns", None)

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection
import gc
gc.enable()

# 2. DATA PARTITIONING

In [5]:
# import data
data = pd.read_csv("../data/prepared/train.csv")
test = pd.read_csv("../data/prepared/test.csv")

In [6]:
# check all datasets
print("Test data:", test.shape)
display(test.head(3))
print("------------------------------")
print("Train data:", data.shape)
display(data.head(3))

Test data: (484758, 7)


Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Week
0,a1e0d80784,20180423,1856,13323,Buy,,122.0
1,c2cc6cc2a8,20180423,1856,9230,Buy,,122.0
2,a8e94f6344,20180423,1780,9157,Buy,,122.0


------------------------------
Train data: (247575317, 5)


Unnamed: 0,CustomerIdx,IsinIdx,BuySell,Week,CustomerInterest
0,0,16471,Buy,1.0,0
1,0,16471,Buy,2.0,0
2,0,16471,Buy,3.0,0


In [7]:
# data partitioning
train = data[(data["Week"] >= 1)   & (data["Week"] <= 110)]
valid = data[(data["Week"] >= 111) & (data["Week"] <= 121)]
del data

# 3. COMPUTING NAIVE RATIOS

In [None]:
# compute target ratio (1)
cust_int1 = train[["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
cust_int1 = cust_int1.groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False).mean()
cust_int1.columns = ["CustomerIdx", "IsinIdx", "BuySell", "ratio1"]

In [None]:
# compute target ratio (2)
cust_int2 = train[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int2 = cust_int2.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int2.columns = ["CustomerIdx", "IsinIdx", "ratio2"]

In [None]:
# compute target ratio (1, last 10 weeks)
cust_int3 = train[train["TradeDateKey"] > 100]
cust_int3 = cust_int3[["CustomerIdx", "CustomerInterest", "IsinIdx", "BuySell"]]
cust_int3 = cust_int3.groupby(["CustomerIdx", "IsinIdx", "BuySell"], as_index = False).mean()
cust_int3.columns = ["CustomerIdx", "IsinIdx", "BuySell", "ratio3"]

In [None]:
# compute target ratio (2, last 10 weeks)
cust_int4 = train[train["TradeDateKey"] > 100
cust_int4 = cust_int4[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int4 = cust_int4.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int4.columns = ["CustomerIdx", "IsinIdx", "ratio4"]

In [None]:
# average ratios
cust_int = cust_int1.merge(cust_int2, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int3,  how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int = cust_int.merge(cust_int4,  how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
cust_int["CustomerInterest"] = (cust_int["ratio1"].fillna(0) + 
                                cust_int["ratio2"].fillna(0) +
                                cust_int["ratio3"].fillna(0) + 
                                cust_int["ratio4"]) / 4
cust_int.head()

In [13]:
# merge to test
del test["CustomerInterest"]
test = test.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx", "BuySell"])
test.head()

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest_x,CustomerInterest_y,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,,,
1,c2cc6cc2a8,20180423,1856,9230,Buy,0.416667,1.0,0.805556
2,a8e94f6344,20180423,1780,9157,Buy,,,
3,758bae1e35,20180423,2129,9131,Buy,,,
4,02ab378ee8,20180423,1758,7151,Buy,0.0,,0.0


In [14]:
# fill NA (no customer-bond interactions)
test.CustomerInterest.fillna(0, inplace = True)

In [15]:
# export CSV
subm = test[["PredictionIdx", "CustomerInterest"]]
subm.to_csv("../submissions/naive_4mean.csv", index = False, float_format = "%.8f")