# 1. SETTINGS

In [101]:
# libraries
import numpy as np
import pandas as pd
from datetime import date

In [102]:
# warnings
# NOTE: with no category filter this silences every warning raised from here on,
# including pandas deprecation and chained-assignment warnings.
import warnings
warnings.filterwarnings("ignore")

In [103]:
# pandas options
# None removes the column limit, so displayed DataFrames show every column
pd.set_option("display.max_columns", None)

# 2. IMPORT

In [104]:
# import datasets
# the path is relative to the current working directory
data = pd.read_csv("../data/prepared/data_basic.csv")

In [105]:
# check data
# print (rows, columns), then preview the first rows of the table
print("Dimensions:", data.shape)
data.head()

Dimensions: (5232538, 6)


Unnamed: 0,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Week
0,,0,21856,Buy,0.0,101
1,,0,21856,Buy,0.0,102
2,,0,21856,Buy,1.0,103
3,,0,21856,Buy,0.0,104
4,,0,21856,Buy,0.0,105


# 3. FUNCTIONS TO CREATE FEATURES

### RECENCY (TIME SINCE LAST TRADE)

In [106]:
##### FUNCTION TO COMPUTE 4 RECENCY-BASED FEATURES 
# 1) Time since last customer trade with that bond with the same BuySell direction
# 2) Time since last customer trade with that bond with any direction
# 3) Time since last customer trade with any bond with any direction
# 4) Time since last trade with that bond by any of the customers

def compute_recency(data):
    """Return a copy of ``data`` with four recency features, measured in weeks.

    Added columns:
        Recency1: weeks since the previous trade (``CustomerInterest == 1``)
            of the same customer, bond and BuySell direction.
        Recency2: minimum of Recency1 over rows sharing CustomerIdx, IsinIdx
            and Week (i.e. any direction).
        Recency3: minimum of Recency1 over rows sharing CustomerIdx and Week
            (i.e. any bond, any direction).
        Recency4: minimum of Recency1 over rows sharing IsinIdx and Week
            (i.e. any customer).

    A feature is NaN when no earlier trade exists for the respective group.

    Args:
        data: DataFrame with at least the columns CustomerIdx, IsinIdx,
            BuySell, CustomerInterest and Week. The forward/backward fills
            below follow row order, so rows are assumed to be sorted by Week
            within each (CustomerIdx, IsinIdx, BuySell) group.

    Returns:
        A new DataFrame (the input is not modified) with the four columns
        appended; row order follows the input.
    """
    keys = ["CustomerIdx", "IsinIdx", "BuySell"]

    # Work on an explicit copy: assigning into a filtered slice is unreliable
    # and does nothing to the parent frame under pandas Copy-on-Write.
    trades = data.loc[data["CustomerInterest"] == 1].copy()
    trades["PrevWeek"] = (
        trades.sort_values(by=["Week"], ascending=True)
        .groupby(keys)["Week"]
        .shift(1)
    )
    trades["CurrWeek"] = trades["Week"]

    # Left merge on all shared columns: only trade rows receive
    # PrevWeek / CurrWeek, every other row gets NaN.
    data = data.merge(trades, how="left")
    by_series = data.groupby(keys)

    # Week of the latest trade at or before each row.
    last_trade = by_series["CurrWeek"].ffill()
    # PrevWeek of the next trade row is the latest trade before this row;
    # rows after the final trade fall back to that final trade's week.
    prev_trade = by_series["PrevWeek"].bfill().fillna(last_trade)
    # Keep only trades that happened strictly before the row's own week.
    prev_trade = prev_trade.where(prev_trade < data["Week"])

    data = data.drop(columns=["PrevWeek", "CurrWeek"])
    recency = data["Week"] - prev_trade
    data["Recency1"] = recency.where(recency > 0)

    # Broadcast group minima back onto the rows (NaN when a group has no value).
    data["Recency2"] = data.groupby(["CustomerIdx", "IsinIdx", "Week"])["Recency1"].transform("min")
    data["Recency3"] = data.groupby(["CustomerIdx", "Week"])["Recency1"].transform("min")
    data["Recency4"] = data.groupby(["IsinIdx", "Week"])["Recency1"].transform("min")

    print("Computed 4 recency features...")

    return data

### FREQUENCY (TOTAL NUMBER OF TRADES)

In [107]:
##### FUNCTION TO COMPUTE 4 FREQUENCY-BASED FEATURES 
# 1) Number of previous customer trades with that bond with the same BuySell direction
# 2) Number of previous customer trades with that bond with any direction
# 3) Number of previous customer trades with any bond with any direction
# 4) Number of previous trades with that bond by any of the customers
# all frequencies are divided by the number of weeks during which they were observed

def compute_frequency(data):
    """Return a copy of ``data`` with four frequency features.

    Added columns (each divided by the number of weeks elapsed since the
    earliest Week in ``data``):
        Frequency1: number of earlier trades (``CustomerInterest == 1``) of
            the same customer, bond and BuySell direction.
        Frequency2: sum of the Frequency1 counts over rows sharing
            CustomerIdx, IsinIdx and Week (i.e. any direction).
        Frequency3: sum over rows sharing CustomerIdx and Week
            (i.e. any bond, any direction).
        Frequency4: sum over rows sharing IsinIdx and Week
            (i.e. any customer).

    Args:
        data: DataFrame with at least the columns CustomerIdx, IsinIdx,
            BuySell, CustomerInterest and Week. The forward fill below follows
            row order, so rows are assumed to be sorted by Week within each
            (CustomerIdx, IsinIdx, BuySell) group.

    Returns:
        A new DataFrame (the input is not modified) with the four columns
        appended; row order follows the input.
    """
    keys = ["CustomerIdx", "IsinIdx", "BuySell"]

    # Work on an explicit copy: assigning into a filtered slice is unreliable
    # and does nothing to the parent frame under pandas Copy-on-Write.
    trades = data.loc[data["CustomerInterest"] == 1].copy()
    trades["Frequency1"] = (
        trades.sort_values(by=["Week"], ascending=True)
        .groupby(keys)["CustomerInterest"]
        .cumsum()
    )

    # Left merge on all shared columns: only trade rows receive the running
    # trade count, every other row gets NaN.
    data = data.merge(trades, how="left")

    is_trade = data["CustomerInterest"] == 1
    running = data.groupby(keys)["Frequency1"].ffill()
    # On a trade row the running count includes the trade itself; remove it so
    # the feature only counts earlier trades. Rows before the first trade
    # have no count yet and become 0.
    data["Frequency1"] = running.where(~is_trade, running - 1).fillna(0)

    # Broadcast group totals back onto the rows.
    data["Frequency2"] = data.groupby(["CustomerIdx", "IsinIdx", "Week"])["Frequency1"].transform("sum")
    data["Frequency3"] = data.groupby(["CustomerIdx", "Week"])["Frequency1"].transform("sum")
    data["Frequency4"] = data.groupby(["IsinIdx", "Week"])["Frequency1"].transform("sum")

    # Weeks elapsed since the first observed week; 0 is replaced by 1 to avoid
    # dividing by zero in the very first week.
    weeks_observed = (data["Week"] - data["Week"].min()).replace(0, 1)
    for col in ["Frequency1", "Frequency2", "Frequency3", "Frequency4"]:
        data[col] = data[col] / weeks_observed

    print("Computed 4 frequency features...")

    return data

### MONTH AND WEEK ID

# 4. COMPUTING FEATURES

In [108]:
# compute all features
# each call returns a new DataFrame with four extra columns,
# so the frequency step runs on the output of the recency step
data = compute_recency(data)
data = compute_frequency(data)

# 5. EXPORT

In [109]:
# check dimensions
# expected: the same number of rows as after import, plus the 8 feature columns
print(data.shape)

(5232538, 14)


In [110]:
# check NA
# number of missing values in each column
data.isnull().sum()

PredictionIdx       4747780
CustomerIdx               0
IsinIdx                   0
BuySell                   0
CustomerInterest     484758
Week                      0
Recency1            2706471
Recency2            2392883
Recency3             290669
Recency4             660922
Frequency1                0
Frequency2                0
Frequency3                0
Frequency4                0
dtype: int64

In [None]:
# export CSV
# index = False keeps the DataFrame's row index out of the written file
data.to_csv("../data/prepared/data_v1.csv", index = False)