# 1. SETTINGS

In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import date
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats

In [2]:
# pandas options
pd.set_option("display.max_columns", None)

# 2. FUNCTIONS

In [3]:
### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum() / data.isnull().count() * 100).sort_values(ascending = False)
    table = pd.concat([total, percent], axis = 1, keys = ["Total", "Percent"])
    table = table[table["Total"] > 0]
    return table

In [4]:
### FUNCTION FOR COMPUTING WEEK INDEX
def week_idx(date, end_date):
    return round((end_date - date).dt.days / 7 + 0.4)

# 3. DATA IMPORT

In [5]:
# import datasets
test  = pd.read_csv("../data/raw/Challenge_20180423.csv")
#cust  = pd.read_csv("../data/raw/Customer.csv")
#bond  = pd.read_csv("../data/raw/Isin.csv")
#markt = pd.read_csv("../data/raw/Market.csv")
#macro = pd.read_csv("../data/raw/MarketData_Macro.csv")
trade = pd.read_csv("../data/raw/Trade.csv")

# check all datasets
print("Test data:", test.shape)
display(test.head(3))
print("------------------------------")
print("Customer data:", cust.shape)
display(cust.head(3))
print("------------------------------")
print("Bonds data:", bond.shape)
display(bond.head(3))
print("------------------------------")
print("Market data:", markt.shape)
display(markt.head(3))
print("------------------------------")
print("Macroeconomic data:", macro.shape)
display(macro.head(3))
print("------------------------------")
print("Trade data:", trade.shape)
display(trade.head(3))

# 4. PREPROCESSING

## 4.1. TRADE & TEST DATA

#### SOME CHECKS

In [7]:
# check missings
count_missings(trade)

Unnamed: 0,Total,Percent
Price,4617933,68.292201


In [8]:
# descriptive stats
trade.describe()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,NotionalEUR,Price,CustomerInterest
count,6762021.0,6762021.0,6762021.0,6762021.0,2144088.0,6762021.0
mean,20167500.0,1922.354,14603.69,6313228.0,177900.8,0.3271106
std,6768.082,857.9293,7963.809,271240800.0,1318503.0,0.4691581
min,20160100.0,0.0,0.0,-148554700.0,-999999.0,0.0
25%,20160630.0,1288.0,7392.0,341098.0,93.1378,0.0
50%,20170230.0,2090.0,15229.0,881212.0,101.0,0.0
75%,20170930.0,2574.0,22119.0,2136842.0,107.5,1.0
max,20180420.0,3470.0,27394.0,200000000000.0,9999999.0,1.0


The number of observations and bonds per customer is different in trade and test data. Trade data only contains bonds that a given customer has actually traded. In test data, for each customer, the set of bonds is only a subset of the bonds that he actually traded in the past (but not the whole set, which leads to a smaller number of observations per customer in the test data). Also, the test set is biased towards very active traders.

In [9]:
# check if bonds in test are a subset of bonds in trade
# print "No" if bonds in test are new for that customer
for i in test.CustomerIdx.unique():
    A = trade[trade.CustomerIdx == i].IsinIdx.unique()
    B = test[test.CustomerIdx == i].IsinIdx.unique()
    C = set(B).issubset(set(A))
    if C == False: 
        print("No!")
print("Finished")

Finished


In [10]:
# check number of bonds per customer
display(trade.groupby("CustomerIdx").IsinIdx.nunique().describe())
print("------------------------------")
display(test.groupby("CustomerIdx").IsinIdx.nunique().describe())

count     3439.000000
mean       405.832800
std       1145.875089
min          1.000000
25%          5.000000
50%         33.000000
75%        233.500000
max      14132.000000
Name: IsinIdx, dtype: float64

------------------------------


count    2495.000000
mean       97.145892
std       301.375388
min         1.000000
25%         3.000000
50%        12.000000
75%        58.000000
max      5044.000000
Name: IsinIdx, dtype: float64

#### PREPROCESSING

Target equals 0 if TradeStatus = "Holding" and 1 in all other cases.

In [11]:
# create target variable
trade["CustomerInterest"] = 1
trade["CustomerInterest"][trade["TradeStatus"] == "Holding"] = 0
trade.CustomerInterest.mean()
trade = trade[trade["TradeStatus"] != "Holding"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Test data does not contain features "Price", "NotionalEUR", so it is not possible to use them as predictors. 

In [12]:
# delete features
del trade["NotionalEUR"]
del trade["Price"]

In [13]:
# convert dates
trade["TradeDateKey"] = pd.to_datetime(trade["TradeDateKey"], format = '%Y%m%d')

In [14]:
# add week index
trade["Week"] = week_idx(trade["TradeDateKey"], pd.Timestamp('2018-04-23 00:00:00'))
trade["Week"] = trade["Week"].max() + 1 - trade["Week"]
test["Week"]  = trade["Week"].max() + 1

In the test data, each observation cover one week, whereas the training data is on a daily basis. We can aggregate the training data to a week level to have the same granularity. The target variable is computed as max over a week, whereas for different features we can compute different stats describing behavior during that week (e.g. mean, sd, range, etc).

In [15]:
# aggregate weekly data: target = 1 if there is at least single 1 during week
trade = trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"], as_index = False).CustomerInterest.max()

The training data contains 0 only if a customer has explicitly stated that she holds a bond. However, there are a lot of 0 missing from the table for the cases when a customer does not interact with a bond at all. Hence, it is necessary to impute these missing observations. For each customer, we can look at the set of bonds that he ever traded and add all missing weeks for each of that bonds as new rows with target = 0 (the rows when customer did not interact with a specific bond). That significantly increases the sample size but also makes data much closer to the test set.

Current implementation increases the sample size from 6m to 247m rows (6.2 Gb).

In [16]:
# add missing weeks
print(trade.shape)
trade = trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"]).CustomerInterest.unique().unstack("Week").stack("Week", dropna = False)
trade = trade.reset_index()
trade.columns = ["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]
print(trade.shape)

(1772720, 5)
(111293880, 5)


Mean CustomerInterest should be around 1.5% according to the organizers.

In [17]:
# fill new cases with 0 in target
trade.CustomerInterest.fillna(0, inplace = True)
trade["CustomerInterest"] = trade.CustomerInterest.astype(int)
print(trade.CustomerInterest.mean())

0.015928279254888048


## 4.2. CUSTOMER DATA

There are 3471 unique customers in the training data of which 2495 show up in the test set. There are no unknown customers in the test set.

In [18]:
cust.CustomerIdx.nunique()

3471

In [19]:
# Compare number of test customers showing up in cust set to number of unique customers in test set
np.sum(np.in1d(test.CustomerIdx.unique(), cust.CustomerIdx.unique()))/test.CustomerIdx.nunique()

1.0

Some customers show up as rarely as once (30% are at or below 5), while some create a lot of trades (40% are above 136 trades). We have to be careful with the imbalance when working on the customer level.

In [20]:
np.percentile(trade.groupby("CustomerIdx").size(), range(0,100,10))

array([  1.21000000e+02,   1.21000000e+02,   4.84000000e+02,
         9.68000000e+02,   2.05700000e+03,   4.35600000e+03,
         9.77680000e+03,   2.20220000e+04,   5.60714000e+04,
         1.76442200e+05])

## 4.3. MACROECONOMIC DATA

## 4.4. BONDS DATA

## 4.5. MARKET DATA

# 5. DATA EXPORT

In [18]:
# export CSV
trade.to_csv("../data/prepared/train_no_holding.csv", index = False, float_format = "%.4f")
test.to_csv("../data/prepared/test_no_holding.csv",   index = False, float_format = "%.4f")