# 1. SETTINGS

In [104]:
# libraries
import numpy as np
import pandas as pd
from datetime import date

In [105]:
# warnings
import warnings
warnings.filterwarnings("ignore")

In [106]:
# pandas options: no upper limit on the number of columns shown for a frame
pd.options.display.max_columns = None

# 2. IMPORT

In [107]:
# import datasets: the trade history and the challenge file, both read from
# the raw-data folder relative to the notebook location
trade = pd.read_csv(filepath_or_buffer = "../data/raw/Trade.csv")
test = pd.read_csv(filepath_or_buffer = "../data/raw/Challenge_20180423.csv")

In [108]:
# check trade: print the dimensions, then display the leading rows
print("Trade data:", trade.shape)
trade.head(n = 5)

Trade data: (6762021, 8)


Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0
3,20170310,2574,9885,Sell,708082.0,0.0,Unknown,1.0
4,20161116,2574,8885,Buy,1147709.0,0.0,Unknown,1.0


In [109]:
# check test: print the dimensions, then display the leading rows
print("Test data:", test.shape)
test.head(n = 5)

Test data: (484758, 6)


Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,
1,c2cc6cc2a8,20180423,1856,9230,Buy,
2,a8e94f6344,20180423,1780,9157,Buy,
3,758bae1e35,20180423,2129,9131,Buy,
4,02ab378ee8,20180423,1758,7151,Buy,


# 3. PREPROCESSING

In [110]:
# create target variable: 1 for every trade, 0 for rows whose status is "Holding"
trade["CustomerInterest"] = 1
# single .loc assignment instead of chained indexing (frame[col][mask] = ...):
# the chained form writes to a temporary object and does not update `trade`
# when pandas Copy-on-Write is active
trade.loc[trade["TradeStatus"] == "Holding", "CustomerInterest"] = 0

In [111]:
# deleting holding cases: keep every row whose status is not "Holding"
trade = trade.loc[~(trade["TradeStatus"] == "Holding")]

In [112]:
# convert dates: parse the YYYYMMDD trade-date keys into datetime values
trade["TradeDateKey"] = pd.to_datetime(
    trade.TradeDateKey,
    format = "%Y%m%d",
)

In [113]:
### FUNCTION FOR COMPUTING WEEK INDEX
def week_idx(date, end_date):
    """Return how many weeks each date lies before ``end_date``, rounded up.

    Parameters
    ----------
    date : pandas Series of datetimes
        Dates to index; the result of ``end_date - date`` must support
        the ``.dt`` accessor.
    end_date : timestamp-like
        Reference date the distance is measured from.

    Returns
    -------
    pandas Series of int
        0 for a date equal to ``end_date``, 1 for dates 1-7 days earlier,
        2 for dates 8-14 days earlier, and so on.
    """
    days_before_end = (end_date - date).dt.days
    # adding 0.4 before rounding turns the rounding into a ceiling of days / 7:
    # exact multiples of 7 stay put, any remainder of 1-6 days is pushed up
    shifted_weeks = days_before_end / 7 + 0.4
    return shifted_weeks.round().astype(int)

# add week index: first count weeks backwards from the challenge date ...
trade["Week"] = week_idx(trade["TradeDateKey"], pd.Timestamp('2018-04-23 00:00:00'))
# ... then flip the scale so that larger numbers mean more recent weeks
trade["Week"] = 1 + trade["Week"].max() - trade["Week"]
# the test rows get the week right after the last week seen in trade
test["Week"]  = 1 + trade["Week"].max()

In [114]:
# aggregate weekly data: one row per customer / week / ISIN / side;
# target = 1 if there is at least single 1 during week, price and notional are averaged
trade = (
    trade
    .groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"])
    .agg({"CustomerInterest": "max", "Price": "mean", "NotionalEUR": "mean"})
    .reset_index()
)

# 4. SUBSETTING

In [115]:
### 1: Keep only customers that are also in test data
print(trade.shape)
trade = trade[trade["CustomerIdx"].isin(test["CustomerIdx"].unique())]
print(trade.shape)

(1772720, 7)
(1747439, 7)


In [116]:
### 2: Drop the first 70 weeks (only rows with Week above 70 remain)
print(trade.shape)
trade = trade.loc[trade["Week"] > 70]
print(trade.shape)

(1747439, 7)
(776272, 7)


In [117]:
### 3. Delete PRICE and NOTIONALEUR (for now)
print(trade.shape)
trade = trade.drop(columns = ["Price", "NotionalEUR"])
print(trade.shape)

(776272, 7)
(776272, 5)


# 5. ADD ZEROES

In [118]:
# add missing zeroes: build one row per (customer, ISIN, side) for every week
print(trade.shape)
# .max() keeps the target as a plain scalar per group; .unique() would store
# one-element NumPy arrays in an object column, which is memory-hungry and makes
# the later integer conversion depend on deprecated array-to-scalar casting.
# Pivoting Week into columns and stacking it back with dropna = False keeps the
# week slots that had no trade, as NaN rows.
trade = trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"]).CustomerInterest.max().unstack("Week").stack("Week", dropna = False)
trade = trade.reset_index()
# after the unstack/stack round trip Week is the last index level, hence this order
trade.columns = ["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]
print(trade.shape)

(776272, 5)
(24012750, 5)


In [119]:
# fill imputed cases with 0 in target
# explicit assignment instead of `trade.CustomerInterest.fillna(..., inplace = True)`:
# the in-place call acts on an intermediate Series and does not update `trade`
# when pandas Copy-on-Write is active
trade["CustomerInterest"] = trade["CustomerInterest"].fillna(0).astype(int)
print("Mean target:", trade.CustomerInterest.mean())

Mean target: 0.032327492686177135


# 6. EXPORT

In [120]:
# check dimensions: trade shape on the first line, test shape on the second
print(trade.shape, test.shape, sep = "\n")

(24012750, 5)
(484758, 7)


In [122]:
# concatenate train and test
# trade has no prediction ids, so give it an empty PredictionIdx column
trade["PredictionIdx"] = None
# DateKey exists only in test and is dropped before stacking
test = test.drop(columns = "DateKey")
# put the trade columns in the test column order, then stack trade above test
data = pd.concat([trade.reindex(columns = test.columns), test])
# release the two source frames; only the combined frame is needed from here on
del trade, test
print(data.shape)

(24497508, 6)


In [123]:
# export CSV: write the combined frame without the row index
data.to_csv(path_or_buf = "../data/prepared/data_basic.csv", index = False)