# 1. SETTINGS

In [3]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import roc_auc_score
import scipy.stats

In [4]:
# pandas options
pd.set_option("display.max_columns", None)

# 2. DATA PARTITIONING

In [27]:
# import data
data = pd.read_csv("../data/prepared/train.csv")

In [28]:
# check all datasets
print("Test data:", test.shape)
display(test.head(3))
print("------------------------------")
print("Train data:", data.shape)
display(data.head(3))

Test data: (484758, 7)


Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Week
0,a1e0d80784,20180423,1856,13323,Buy,,122.0
1,c2cc6cc2a8,20180423,1856,9230,Buy,,122.0
2,a8e94f6344,20180423,1780,9157,Buy,,122.0


------------------------------
Train data: (247575317, 5)


Unnamed: 0,CustomerIdx,IsinIdx,BuySell,Week,CustomerInterest
0,0,16471,Buy,1.0,0
1,0,16471,Buy,2.0,0
2,0,16471,Buy,3.0,0


In [29]:
# data partitioning
train = data[(data["Week"] >= 1)   & (data["Week"] <= 110)]
valid = data[(data["Week"] >= 111) & (data["Week"] <= 121)]
del data

# 3. COMPUTING NAIVE RATIOS (TRAIN)

In [31]:
# compute target ratio (last 64 weeks)
cust_int0 = train[train["Week"] >= (110-64)]
cust_int0 = cust_int0[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int0 = cust_int0.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int0.columns = ["CustomerIdx", "IsinIdx", "ratio0"]

In [32]:
# compute target ratio (last 32 weeks)
cust_int1 = train[train["Week"] >= (110-32)]
cust_int1 = cust_int1[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int1 = cust_int1.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int1.columns = ["CustomerIdx", "IsinIdx", "ratio1"]

In [33]:
# compute target ratio (last 16 weeks)
cust_int2 = train[train["Week"] >= (110-16)]
cust_int2 = cust_int2[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int2 = cust_int2.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int2.columns = ["CustomerIdx", "IsinIdx", "ratio2"]

In [34]:
# compute target ratio (last 8 weeks)
cust_int3 = train[train["Week"] >= (110-8)]
cust_int3 = cust_int3[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int3 = cust_int3.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int3.columns = ["CustomerIdx", "IsinIdx", "ratio3"]

In [35]:
# compute target ratio (last 4 weeks)
cust_int4 = train[train["Week"] >= (110-4)]
cust_int4 = cust_int4[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int4 = cust_int4.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int4.columns = ["CustomerIdx", "IsinIdx", "ratio4"]

In [36]:
# compute customer target ratio (last 32 weeks)
cust_int5 = train[train["Week"] >= (110-32)]
cust_int5 = cust_int5[["CustomerIdx", "CustomerInterest"]]
cust_int5 = cust_int5.groupby(["CustomerIdx"], as_index = False).mean()
cust_int5.columns = ["CustomerIdx", "ratio5"]

In [38]:
# compute bond target ratio (last 32 weeks)
cust_int6 = train[train["Week"] >= (110-32)]
cust_int6 = cust_int6[["IsinIdx", "CustomerInterest"]]
cust_int6 = cust_int6.groupby(["IsinIdx"], as_index = False).mean()
cust_int6.columns = ["IsinIdx", "ratio6"]

In [46]:
# merge and average all ratios
cust_int = cust_int0.merge(cust_int1, how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int2,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int3,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int4,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int5,  how = "left", on = ["CustomerIdx"])
cust_int = cust_int.merge(cust_int6,  how = "left", on = ["IsinIdx"])
cust_int["RatioMean"] = (cust_int["ratio0"] + cust_int["ratio1"] + cust_int["ratio2"] + 
                         cust_int["ratio3"] + cust_int["ratio4"] + cust_int["ratio5"] +
                         cust_int["ratio6"]) / 7
cust_int.head()

Unnamed: 0,CustomerIdx,IsinIdx,ratio0,ratio1,ratio2,ratio3,ratio4,ratio5,ratio6,RatioMean
0,0,16471,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247
1,0,16499,0.015385,0.0,0.0,0.0,0.0,0.007346,0.000253,0.003283
2,0,20601,0.015385,0.0,0.0,0.0,0.0,0.007346,0.001263,0.003428
3,0,20737,0.015385,0.030303,0.0,0.0,0.0,0.007346,0.010331,0.009052
4,0,20909,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247


In [47]:
# merge to valid
valid = valid.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx"])
valid.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,Week,CustomerInterest,ratio0,ratio1,ratio2,ratio3,ratio4,ratio5,ratio6,RatioMean
0,0,16471,Buy,111.0,0,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247
1,0,16471,Buy,112.0,0,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247
2,0,16471,Buy,113.0,0,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247
3,0,16471,Buy,114.0,0,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247
4,0,16471,Buy,115.0,0,0.015385,0.0,0.0,0.0,0.0,0.007346,0.0,0.003247


In [None]:
# compute AUC ROC
roc_auc_score(valid.CustomerInterest, valid.ratio5)

# 4. COMPUTING NAIVE RATIOS (FULL)

In [5]:
# import data
train = pd.read_csv("../data/prepared/train.csv")
test  = pd.read_csv("../data/prepared/test.csv")

In [6]:
# compute target ratio (last 64 weeks)
cust_int0 = train[train["Week"] >= (121-64)]
cust_int0 = cust_int0[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int0 = cust_int0.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int0.columns = ["CustomerIdx", "IsinIdx", "ratio0"]

In [7]:
# compute target ratio (last 32 weeks)
cust_int1 = train[train["Week"] >= (121-32)]
cust_int1 = cust_int1[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int1 = cust_int1.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int1.columns = ["CustomerIdx", "IsinIdx", "ratio1"]

In [8]:
# compute target ratio (last 16 weeks)
cust_int2 = train[train["Week"] >= (121-16)]
cust_int2 = cust_int2[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int2 = cust_int2.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int2.columns = ["CustomerIdx", "IsinIdx", "ratio2"]

In [9]:
# compute target ratio (last 8 weeks)
cust_int3 = train[train["Week"] >= (121-8)]
cust_int3 = cust_int3[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int3 = cust_int3.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int3.columns = ["CustomerIdx", "IsinIdx", "ratio3"]

In [10]:
# compute target ratio (last 4 weeks)
cust_int4 = train[train["Week"] >= (121-4)]
cust_int4 = cust_int4[["CustomerIdx", "CustomerInterest", "IsinIdx"]]
cust_int4 = cust_int4.groupby(["CustomerIdx", "IsinIdx"], as_index = False).mean()
cust_int4.columns = ["CustomerIdx", "IsinIdx", "ratio4"]

In [11]:
# compute customer target ratio (last 32 weeks)
cust_int5 = train[train["Week"] >= (121-32)]
cust_int5 = cust_int5[["CustomerIdx", "CustomerInterest"]]
cust_int5 = cust_int5.groupby(["CustomerIdx"], as_index = False).mean()
cust_int5.columns = ["CustomerIdx", "ratio5"]

In [12]:
# compute bond target ratio (last 32 weeks)
cust_int6 = train[train["Week"] >= (121-32)]
cust_int6 = cust_int6[["IsinIdx", "CustomerInterest"]]
cust_int6 = cust_int6.groupby(["IsinIdx"], as_index = False).mean()
cust_int6.columns = ["IsinIdx", "ratio6"]

In [13]:
# merge and average all ratios
cust_int = cust_int0.merge(cust_int1, how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int2,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int3,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int4,  how = "left", on = ["CustomerIdx", "IsinIdx"])
cust_int = cust_int.merge(cust_int5,  how = "left", on = ["CustomerIdx"])
cust_int = cust_int.merge(cust_int6,  how = "left", on = ["IsinIdx"])
cust_int["RatioMean"] = (cust_int["ratio0"] + cust_int["ratio1"] + cust_int["ratio2"] + 
                         cust_int["ratio3"] + cust_int["ratio4"] + cust_int["ratio5"] +
                         cust_int["ratio6"]) / 7
cust_int.head()

Unnamed: 0,CustomerIdx,IsinIdx,ratio0,ratio1,ratio2,ratio3,ratio4,ratio5,ratio6,RatioMean
0,0,16471,0.015385,0.0,0.0,0.0,0.0,0.012856,0.0,0.004034
1,0,16499,0.015385,0.0,0.0,0.0,0.0,0.012856,0.0,0.004034
2,0,20601,0.015385,0.0,0.0,0.0,0.0,0.012856,0.0,0.004034
3,0,20737,0.015385,0.0,0.0,0.0,0.0,0.012856,0.006887,0.005018
4,0,20909,0.0,0.0,0.0,0.0,0.0,0.012856,0.0,0.001837


In [16]:
# merge to test
del test["CustomerInterest"]
test = test.merge(cust_int, how = "left", on = ["CustomerIdx", "IsinIdx"])
test.head()

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,Week,ratio0,ratio1,ratio2,ratio3,ratio4,ratio5,ratio6,RatioMean
0,a1e0d80784,20180423,1856,13323,Buy,122.0,0.015385,0.030303,0.0,0.0,0.0,0.021351,0.008207,0.010749
1,c2cc6cc2a8,20180423,1856,9230,Buy,122.0,0.038462,0.060606,0.088235,0.111111,0.2,0.021351,0.00713,0.075271
2,a8e94f6344,20180423,1780,9157,Buy,122.0,0.030769,0.060606,0.0,0.0,0.0,0.010444,0.005682,0.015357
3,758bae1e35,20180423,2129,9131,Buy,122.0,0.015385,0.030303,0.0,0.0,0.0,0.001908,0.014971,0.008938
4,02ab378ee8,20180423,1758,7151,Buy,122.0,0.007692,0.015152,0.0,0.0,0.0,0.009125,0.00404,0.005144


In [19]:
# compute prediction
test["CustomerInterest"] = test["ratio6"]
test.CustomerInterest.mean()

0.012468138431356518

In [20]:
# export CSV
subm = test[["PredictionIdx", "CustomerInterest"]]
subm.to_csv("../submissions/naive_bond_32_ratio.csv", index = False, float_format = "%.8f")