# 1. SETTINGS

In [None]:
# libraries
import gc
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from IPython.display import display

In [None]:
# pandas options: never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# garbage collection: make sure automatic collection is switched on
gc.enable()

# 2. FUNCTIONS

In [None]:
### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    """Summarise the missing values of a DataFrame column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected for null values.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null value, with
        "Total" (number of nulls) and "Percent" (nulls as a percentage of
        the row count), ordered by descending number of nulls.
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100

    summary = pd.concat(
        [
            n_missing.sort_values(ascending=False),
            pct_missing.sort_values(ascending=False),
        ],
        axis=1,
        keys=["Total", "Percent"],
    )
    # keep only the columns that actually contain missing values
    return summary[summary["Total"] > 0]

# 3. DATA IMPORT

In [None]:
# import datasets: read the six raw CSV files, one frame per file (in listed order)
test, cust, bond, markt, macro, trade = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/Challenge_20180423.csv",
        "../data/raw/Customer.csv",
        "../data/raw/Isin.csv",
        "../data/raw/Market.csv",
        "../data/raw/MarketData_Macro.csv",
        "../data/raw/Trade.csv",
    )
)

In [None]:
# check all datasets: preview the first rows and report the shape of each frame
datasets = [
    ("Test data:", test),
    ("Customer data:", cust),
    ("Bonds data:", bond),
    ("Market data:", markt),
    ("Macroeconomic data:", macro),
    ("Trade data:", trade),
]
for position, (label, frame) in enumerate(datasets):
    if position > 0:
        # divider between consecutive previews (none after the last one)
        print("------------------------------")
    display(frame.head(3))
    print(label, frame.shape)

# 4. PREPROCESSING

## 4.1. TRADE DATA

In [None]:
# check missings: per-column count and percentage of null values in the trade data
count_missings(trade)

In [None]:
# number of missing values in each column of the trade data
trade.isna().sum()

In [None]:
# summary statistics of the trade data
trade.describe()

In [None]:
# create target variable: 1 for every trade record, 0 where TradeStatus is "Holding"
trade["CustomerInterest"] = 1
# Assign through a single .loc call: chained indexing (df[col][mask] = value) may
# write to a temporary copy and leaves the frame unchanged under pandas copy-on-write.
trade.loc[trade["TradeStatus"] == "Holding", "CustomerInterest"] = 0

In [None]:
# convert dates: parse TradeDateKey from its year-month-day digit form (%Y%m%d)
# into pandas datetimes, overwriting the column in place
trade["TradeDateKey"] = pd.to_datetime(trade["TradeDateKey"], format = '%Y%m%d')

In [None]:
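### ADDING MISSING COMBINATIONS (currently disabled: every statement in this cell is commented out)
# NOTE(review): the cartesian product of all dates x customers x bonds x sides is presumably
# too large to materialise in memory - confirm before re-enabling.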

# extract all unique values
#l1 = list(trade.TradeDateKey.unique())
#l2 = list(trade.CustomerIdx.unique())
#l3 = list(trade.IsinIdx.unique())
#l4 = list(trade.BuySell.unique())

# create lists with combinations
#lp1, lp2, lp3, lp4 = pd.core.reshape.util.cartesian_product([l1, l2, l3, l4])

# convert to dataframe
#combs = pd.DataFrame(dict(TradeDateKey = lp1, CustomerIdx = lp2, IsinIdx = lp3, BuySell = lp4))

# merge missing combinations
#print(trade.shape)
#trade = trade.merge(combs, how = "right")
#print(trade.shape)

## 4.2. CUSTOMER DATA

There are 3471 unique customers in the training data, of which 2495 also appear in the test set. The test set contains no unknown customers.

In [None]:
# number of distinct CustomerIdx values in the customer table
cust.CustomerIdx.nunique()

In [None]:
# Share of the unique test-set customers that also appear in the customer table
# (1.0 means every test customer is known). np.isin replaces the deprecated np.in1d.
np.sum(np.isin(test.CustomerIdx.unique(), cust.CustomerIdx.unique())) / test.CustomerIdx.nunique()

Some customers appear only once in the trade data (30% have 5 or fewer trades), while others trade heavily (40% have more than 136 trades). We have to be careful with this imbalance when working at the customer level.

In [None]:
# 0th to 90th percentile (in steps of 10) of the number of trades per customer
np.percentile(
    trade["CustomerIdx"].value_counts(),
    np.arange(0, 100, 10),
)

## 4.3. MACROECONOMIC DATA

## 4.4. BONDS DATA

## 4.5. MARKET DATA

# 5. DATA EXPORT

In [None]:
# export CSV: write the prepared frames without the index, floats with 8 decimals
# NOTE(review): `train` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh Restart & Run All unless it is created elsewhere.
# Presumably the preprocessed `trade` frame is meant - confirm before exporting.
train.to_csv("../data/prepared/train.csv", index = False, float_format = "%.8f")
test.to_csv("../data/prepared/test.csv",   index = False, float_format = "%.8f")