# 1. SETTINGS

In [86]:
# libraries
import numpy as np
import pandas as pd
from datetime import date
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats

In [87]:
# pandas options
pd.set_option("display.max_columns", None)

# 2. FUNCTIONS

### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum() / data.isnull().count() * 100).sort_values(ascending = False)
    table = pd.concat([total, percent], axis = 1, keys = ["Total", "Percent"])
    table = table[table["Total"] > 0]
    return table

In [88]:
### FUNCTION FOR COMPUTING WEEK INDEX
def week_idx(date, end_date):
    # TODO: Hows does this work?
    return round((end_date - date).dt.days / 7 + 0.4)

# 3. DATA IMPORT

In [89]:
# import datasets
test  = pd.read_csv("../data/raw/Challenge_20180423.csv")
#cust  = pd.read_csv("../data/raw/Customer.csv")
#bond  = pd.read_csv("../data/raw/Isin.csv")
#markt = pd.read_csv("../data/raw/Market.csv")
#macro = pd.read_csv("../data/raw/MarketData_Macro.csv")
trade = pd.read_csv("../data/raw/Trade.csv")

# check all datasets
print("Test data:", test.shape)
display(test.head(3))
print("------------------------------")
print("Customer data:", cust.shape)
display(cust.head(3))
print("------------------------------")
print("Bonds data:", bond.shape)
display(bond.head(3))
print("------------------------------")
print("Market data:", markt.shape)
display(markt.head(3))
print("------------------------------")
print("Macroeconomic data:", macro.shape)
display(macro.head(3))
print("------------------------------")
print("Trade data:", trade.shape)
display(trade.head(3))

# 4. PREPROCESSING

## 4.1. TRADE & TEST DATA

#### SOME CHECKS

In [19]:
# check missings
#count_missings(trade)

In [20]:
# descriptive stats
#trade.describe()

The number of observations and bonds per customer is different in trade and test data. Trade data only contains bonds that a given customer has actually traded. In test data, for each customer, the set of bonds is only a subset of the bonds that he actually traded in the past (but not the whole set, which leads to a smaller number of observations per customer in the test data). Also, the test set is biased towards very active traders.

# check if bonds in test are a subset of bonds in trade
# print "No" if bonds in test are new for that customer
for i in test.CustomerIdx.unique():
    A = trade[trade.CustomerIdx == i].IsinIdx.unique()
    B = test[test.CustomerIdx == i].IsinIdx.unique()
    C = set(B).issubset(set(A))
    if C == False: 
        print("No!")
print("Finished")

# check number of bonds per customer
display(trade.groupby("CustomerIdx").IsinIdx.nunique().describe())
print("------------------------------")
display(test.groupby("CustomerIdx").IsinIdx.nunique().describe())

#### PREPROCESSING

Target equals 0 if TradeStatus = "Holding" and 1 in all other cases. The holding operations are artificial and do not actually mean a customer-bond interaction on the considered market, so we drop these observations from the dataset.

In [90]:
# create target variable
trade["CustomerInterest"] = 1
trade["CustomerInterest"][trade["TradeStatus"] == "Holding"] = 0
trade.CustomerInterest.mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


0.32711063748545

In [91]:
# deleting holding cases
trade = trade[trade["TradeStatus"] != "Holding"]

Test data does not contain features "Price", "NotionalEUR", so it is not possible to use them as predictors. 

# delete features
del trade["NotionalEUR"]
del trade["Price"]

In [92]:
# convert dates
trade["TradeDateKey"] = pd.to_datetime(trade["TradeDateKey"], format = '%Y%m%d')

In [93]:
# add week index
trade["Week"] = week_idx(trade["TradeDateKey"], pd.Timestamp('2018-04-23 00:00:00'))
trade["Week"] = trade["Week"].max() + 1 - trade["Week"]
test["Week"]  = trade["Week"].max() + 1

In [94]:
# only customers that are also in test
trade = trade.loc[trade['CustomerIdx'].isin(set(test.CustomerIdx.unique()))]

In the test data, each observation covers one week, whereas the training data is on a daily basis. We can aggregate the training data to a week level to have the same granularity. The target variable is computed as max over a week, whereas for different features we can compute different stats describing behavior during that week (e.g. mean, sd, range, etc).

In [95]:
# aggregate weekly data: target = 1 if there is at least single 1 during week
trade = trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"], as_index = False).CustomerInterest.max()

The training data contains 0 only if a customer has explicitly stated that she holds a bond. However, there are a lot of 0 missing from the table for the cases when a customer does not interact with a bond at all. Hence, it is necessary to impute these missing observations. For each customer, we can look at the set of bonds that he ever traded and add all missing weeks for each of that bonds as new rows with target = 0 (the rows when customer did not interact with a specific bond). That significantly increases the sample size but also makes data much closer to the test set.

Current implementation increases the sample size from 1.7m to 111m rows (~3Gb). This takes a few minutes.

In [96]:
trade_weeks_min = trade.groupby('CustomerIdx',as_index=False)['Week'].min().rename(columns={'Week':'FirstWeekCustomer'})
trade_weeks_max = trade.groupby('CustomerIdx',as_index=False)['Week'].max().rename(columns={'Week':'LastWeekCustomer'})

In [97]:
trade = trade[trade.Week>=68]

In [100]:
# add missing weeks
print(trade.shape)
trade = trade.groupby(["CustomerIdx", "Week", "IsinIdx", "BuySell"]).CustomerInterest.unique().unstack("Week").stack("Week", dropna = False)
trade = trade.reset_index()
trade.columns = ["CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]
print(trade.shape)

(819549, 5)
(26543142, 5)


In [101]:
trade = trade.merge(trade_weeks_min, how='left', on='CustomerIdx')
trade = trade.merge(trade_weeks_max, how='left', on='CustomerIdx')

In [104]:
#trade = trade.loc[trade.Week>=trade.FirstWeekCustomer,:]

(72623, 7)

In [105]:
trade.shape

(26543142, 7)

Mean "CustomerInterest" should be around 1.5% according to the organizers.

In [106]:
# fill new cases with 0 in target
trade.CustomerInterest.fillna(0, inplace = True)
trade["CustomerInterest"] = trade.CustomerInterest.astype(int)
trade.CustomerInterest.mean()

0.03087611104970165

## 4.2. CUSTOMER DATA

There are 3471 unique customers in the training data of which 2495 show up in the test set. There are no unknown customers in the test set.

# unique customers
cust.CustomerIdx.nunique()

# Compare number of test customers showing up in cust set to number of unique customers in test set
np.sum(np.in1d(test.CustomerIdx.unique(), cust.CustomerIdx.unique()))/test.CustomerIdx.nunique()

trade.columns

More than 20% of customers are interested only once in the training period, more than 50% are interested less than 15 times total -> There are a few customers that are interested a lot, maybe we should focus on them

# check percentiles
np.percentile(trade.loc[trade.CustomerInterest==1].groupby("CustomerIdx").size(), range(0,100,10))

Customer information:
- 5 different sectors
- 41 subsectors
- 3 regions
- 99 countries

We can include sectors and regions directly, but may want to reduce the dimension of subsectors and countries

# check columns
cust.columns

# create dummies for customers
cust_dummies = pd.concat([cust.CustomerIdx, pd.get_dummies(cust.Sector), pd.get_dummies(cust.Region)], axis = 1)

Most customers are asset managers or banks/intermediaries (possibly high volume groups?). Distribution over Americas, Europe/Africa, Asia is 53/25/22

# merge customer dummies
trade = trade.merge(cust_dummies, on = "CustomerIdx", how = "left")
test  = test.merge(cust_dummies,  on = "CustomerIdx", how = "left")

Asset Managers/Hedge funds and Banks/Intermediaries show clearly more interest than privat and corporate investors

# distribution of customer types
trade.loc[trade.CustomerInterest==1,['Asset Managers & Hedge Funds', 'Asset Owners',
       'Banks and Intermediaries', 'Corporation', 'Official Institution - OI']].mean()

## 4.4. BONDS DATA

Check that bonds after maturity are not tradeable. If this is correct, the easiest solution would be to manually correct these to 0 in model predictions

# descriptive stats
bond.describe(include = "all")

Create bond dummies for model training and merging into trade data

# create bond dumiies
bond_dummies = pd.concat([bond.IsinIdx,
                          bond.Currency, # Used later to merge currency data
                          pd.get_dummies(bond.ActivityGroup), 
                          pd.get_dummies(bond.CompositeRating)], axis = 1)

# merge bond dummies
trade = trade.merge(bond_dummies, on = "IsinIdx", how = "left")
test  = test.merge(bond_dummies,  on = "IsinIdx", how = "left")

## MACROECONOMIC DATA

We have >100 variables here:
- Stock indices (DAX, FTSE100, ...)
- Volatility indices (VSTOXX, VIX, )
- Currency exchange rates (USD <> EUR/CNY/...)
- Inter-bank money lending rate (Money Market) 3-month for each currency 
- Mid- to long-term swaps (2-10 years). TODO: Unsure of the effect on bond trades

# descriptive stats
macro.columns.tolist()[0:15]

Heuristically fill missing values with the previous value or 2xprevious value. If still missing, fill values with the following or 2xfollowing value

macro = macro.fillna(macro.shift(1)).fillna(macro.shift(2)).fillna(macro.shift(-1)).fillna(macro.shift(-2))

# convert dates
macro["DateKey"] = pd.to_datetime(macro["DateKey"], format = '%Y%m%d')

# add week index
macro["Week"] = week_idx(macro["DateKey"], pd.Timestamp('2018-04-23 00:00:00'))
macro["Week"] = macro["Week"].max() + 1 - macro["Week"]

Aggregate the macro values by week. 

TODO: We could also take the lag first and then aggregate, not sure what makes more sense (JH).

macro = macro.groupby(["Week"]).agg("mean")

We are interested in the change in the macro variable compared to the previous date, I think, to check if e.g. the currency value went up or down

# Replace missing lag for first week with 0
macro_diff1 = (macro - macro.shift(1)).fillna(0)

TODO: I think it makes sense to create a common variable e.g. "currency trend" that relates to the specific currency of the bond and/or holder.

fx_diff1 = macro_diff1.filter(like="FX",axis=1)

fx_diff1["USD"] = 1

fx_diff1 = fx_diff1.reset_index().melt(id_vars="Week", var_name="Currency", value_name="Currency_trend1w")

fx_diff1.Currency = fx_diff1.Currency.str[-3:]

The currency data 'fx' can be merged into the bond data

fx_diff1.Currency.unique()

trade = trade.merge(fx_diff1, how='left', on=["Week","Currency"])
test = test.merge(fx_diff1, how='left', on=["Week","Currency"])

There are a few weird currencies (or typos?) for which we don't have information, e.g. CNH

trade.Currency_trend1w.fillna(1, inplace=True)
test.Currency_trend1w.fillna(1, inplace=True)

## 4.5. MARKET DATA

# 5. DATA EXPORT

# check dimensions
print(trade.shape)
print(test.shape)

In [107]:
trade.CustomerIdx.max()

3470

In [108]:
trade.IsinIdx.nunique()

22913

In [109]:
trade.dtypes

CustomerIdx            int64
IsinIdx                int64
BuySell               object
Week                 float64
CustomerInterest       int32
FirstWeekCustomer    float64
LastWeekCustomer     float64
dtype: object

types = {'CustomerIdx':'int32', 'IsinIdx':'int32', 'BuySell':'object', 'Week':'int32', 'CustomerInterest':'bool',
       'FirstWeekCustomer':'int32', 'Asset Managers & Hedge Funds':'bool', 'Asset Owners':'bool',
       'Banks and Intermediaries':'bool', 'Corporation':'bool', 'Official Institution - OI':'bool',
       'Americas':'bool', 'Asia Pacific':'bool', 'Europe, Middle East and Africa':'bool',
       'Currency':'bool', 'FLOW G10':'bool', 'FLOW LOCAL MARKET':'bool', 'SAS & COVERED BONDS':'bool', 'A':'bool',
       'A+':'bool', 'A-':'bool', 'AA':'bool', 'AA+':'bool', 'AA-':'bool', 'AAA':'bool', 'B':'bool', 'B+':'bool',
    'B-':'bool', 'BB':'bool', 'BB+':'bool',
       'BB-':'bool', 'BBB':'bool', 'BBB+':'bool', 'BBB-':'bool', 'C':'bool', 'C+':'bool', 'CC':'bool', 'CC+':'bool',
 'CC-':'bool', 'CCC':'bool',
       'CCC+':'bool', 'CCC-':'bool', 'D':'bool', 'DD+':'bool', 'DDD':'bool', 'DDD+':'bool', 'NR':'bool', 'Currency_trend1w':'bool'}

trade.A.unique

for column in trade.columns:
    print(column)
    trade[column] = trade[column].astype(types[column])

In [112]:
test = test.merge(trade_weeks_min, how='left', on='CustomerIdx')
test = test.merge(trade_weeks_max, how='left', on='CustomerIdx')

In [116]:
trade["PredictionIdx"] = None
del test["DateKey"]
trade = trade.reindex(test.columns, axis = 1)
data = pd.concat([trade, test])

In [118]:
data.to_csv("../data/prepared/datafull_last_year.csv", index=False)