# 1. SETTINGS

In [None]:
# libraries
import numpy as np
import pandas as pd
from datetime import date

In [None]:
# warnings: install a global "ignore" filter for the whole session.
# NOTE: this hides every warning category, including deprecation and
# chained-assignment warnings raised by pandas.
import warnings
warnings.simplefilter("ignore")

In [None]:
# pandas options: never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

# 2. IMPORT

In [None]:
# import datasets: the trade table and the challenge file used as test set
trade = pd.read_csv(filepath_or_buffer="../data/raw/Trade.csv")
test = pd.read_csv(filepath_or_buffer="../data/raw/Challenge_20180423.csv")

In [None]:
# check trade: print the dimensions, then display the first five rows
print("Trade data:", trade.shape)
trade.head(n=5)

In [None]:
# check test: print the dimensions, then display the first five rows
print("Test data:", test.shape)
test.head(n=5)

# 3. PREPROCESSING

In [None]:
# create target variable: 0 for rows whose TradeStatus is "Holding", 1 otherwise.
# Built in one vectorised assignment: the chained form `trade[col][mask] = 0`
# writes to a temporary object under pandas copy-on-write (and triggers
# SettingWithCopy otherwise), so the zeros may never reach the frame.
trade["CustomerInterest"] = (trade["TradeStatus"] != "Holding").astype(int)

In [None]:
# deleting holding cases: keep only rows whose TradeStatus differs from "Holding"
trade = trade.loc[trade["TradeStatus"].ne("Holding")]

In [None]:
# convert dates: parse the YYYYMMDD keys in TradeDateKey into pandas datetimes
trade["TradeDateKey"] = pd.to_datetime(arg=trade["TradeDateKey"], format="%Y%m%d")

In [None]:
### FUNCTION FOR COMPUTING WEEK INDEX
def week_idx(date, end_date):
    """Return how many weeks each entry of *date* lies before *end_date*.

    Parameters
    ----------
    date : datetime-like pandas Series (the difference is read through ``.dt``)
    end_date : reference timestamp the dates are measured against

    Returns
    -------
    Integer Series with one week count per entry of *date*.

    For whole-day differences the 0.4 offset makes the rounding behave like
    a ceiling: 0 days -> 0, 1-7 days -> 1, 8-14 days -> 2, and so on.
    """
    elapsed_weeks = (end_date - date).dt.days / 7
    return (elapsed_weeks + 0.4).round().astype(int)

# add week index: first count weeks back from the reference date, then flip the
# numbering so the earliest week becomes 1 and later weeks get larger numbers;
# test rows receive the week right after the largest one found in trade
trade["Week"] = trade["TradeDateKey"].pipe(
    week_idx, end_date=pd.Timestamp("2018-04-23 00:00:00")
)
trade["Week"] = (trade["Week"].max() + 1) - trade["Week"]
test["Week"] = 1 + trade["Week"].max()

In [None]:
# aggregate weekly data: one row per (CustomerIdx, Week, IsinIdx, BuySell);
# the target is the weekly maximum (1 if at least one row of the week is 1),
# Price and NotionalEUR are averaged
trade = trade.groupby(
    by=["CustomerIdx", "Week", "IsinIdx", "BuySell"],
    as_index=False,
).agg(
    {
        "CustomerInterest": "max",
        "Price": "mean",
        "NotionalEUR": "mean",
    }
)

# 4. SUBSETTING

In [None]:
### 1: Keep only customers that also appear in the test data
print(trade.shape)
trade = trade[trade["CustomerIdx"].isin(test["CustomerIdx"])]
print(trade.shape)

In [None]:
### 2: Drop the first 100 weeks (for now)
print(trade.shape)
trade = trade.loc[trade["Week"].gt(100)]
print(trade.shape)

In [None]:
### 3: Delete Price and NotionalEUR (for now)
print(trade.shape)
trade = trade.drop(columns=["Price", "NotionalEUR"])
print(trade.shape)

# 5. ADD ZEROES

In [None]:
# add missing zeroes: give every (CustomerIdx, IsinIdx, BuySell) combination a
# row for every week present in the data; weeks without a record come out as
# NaN in the target column
print(trade.shape)
trade = (
    trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"])["CustomerInterest"]
    # max() keeps the target a plain scalar ("1 if at least one 1"); unique()
    # stored one-element arrays in an object column, which makes a later integer
    # cast depend on NumPy's deprecated array-to-scalar conversion and breaks
    # outright when a group holds two distinct values
    .max()
    # pivot weeks to columns and back, keeping the NaN cells, to materialise
    # the missing (combination, week) pairs
    .unstack("Week")
    .stack("Week", dropna=False)
)
trade = trade.reset_index()
trade.columns = ["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]
print(trade.shape)

In [None]:
# fill imputed cases with 0 in target and store the target as integers.
# The result is assigned back explicitly: an in-place fillna called on
# `trade.CustomerInterest` acts on a temporary Series under pandas
# copy-on-write and would leave the NaNs in the frame.
trade["CustomerInterest"] = trade["CustomerInterest"].fillna(0).astype(int)
print("Mean target:", trade.CustomerInterest.mean())

In [None]:
### 4. Undersample the 0 class
#from imblearn.under_sampling import RandomUnderSampler
#sampler = RandomUnderSampler(0.5, return_indices=True, random_state=1337)
#_,_,sampleIdx = sampler.fit_sample(trade[["CustomerIdx","IsinIdx"]], trade.CustomerInterest)
#trade = trade.iloc[sampleIdx]

# 6. EXPORT

In [None]:
# check dimensions: trade shape on the first line, test shape on the second
print(trade.shape, test.shape, sep="\n")

In [None]:
# prepare train and test for concatenation: trade rows get an empty
# PredictionIdx column, and the DateKey column is removed from test
trade = trade.assign(PredictionIdx=None)
test = test.drop(columns=["DateKey"])

In [None]:
# align trade to the column layout of test, append the test rows below the
# trade rows, then release the two input frames
data = pd.concat([trade.reindex(columns=test.columns), test])
del trade, test
print(data.shape)

In [None]:
# export CSV without the index column.
# NOTE: the content is gzip-compressed although the file name keeps the plain
# ".csv" extension, so readers have to be told the compression explicitly.
data.to_csv(
    path_or_buf="../data/prepared/data_basic.csv",
    index=False,
    compression="gzip",
)