# 1. SETTINGS

In [None]:
# libraries
import numpy as np
import pandas as pd
from datetime import date

In [None]:
# warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# plots
import matplotlib as plt
%matplotlib inline

In [None]:
# pandas options: remove the cap on the number of columns shown when a frame is displayed
pd.options.display.max_columns = None

# 2. IMPORT

In [None]:
# load the prepared base table; the file is gzip-compressed despite its .csv extension
data = pd.read_csv(
    filepath_or_buffer = "../data/prepared/data_basic.csv",
    compression = "gzip",
)

In [None]:
# sanity check: report the (rows, columns) shape and preview the first five rows
print("Dimensions: {}".format(data.shape))
data.head(5)

# 3. FUNCTIONS TO CREATE FEATURES

In [None]:
### FUNCTION FOR COMPUTING WEEK INDEX
def week_idx(date, end_date):
    """Return, per element of ``date``, the number of weeks left until ``end_date``.

    ``date`` is a datetime Series and ``end_date`` a timestamp. The day
    difference is divided by 7, shifted by 0.4 and rounded, which for
    whole-day differences is the same as ``ceil(days / 7)``: days 1-7 map to
    week 1, days 8-14 to week 2, and a zero difference maps to week 0.
    """
    days_left = (end_date - date).dt.days
    shifted_weeks = days_left / 7 + 0.4
    return shifted_weeks.round().astype(int)

### RECENCY (TIME SINCE LAST TRADE)

In [None]:
##### FUNCTION TO COMPUTE 6 RECENCY-BASED FEATURES (PLUS 2 COMPARISON FLAGS: Recency1isLowerRecency2, Recency2isLowerRecency4)
# 1) Time since last customer trade with that bond with the same BuySell direction
# 2) Time since last customer trade with that bond with any direction
# 3) Time since last customer trade with any bond with the same BuySell direction
# 4) Time since last customer trade with any bond with any direction
# 5) Time since last trade with that bond by any of the customers with the same BuySell direction
# 6) Time since last trade with that bond by any of the customers

def compute_recency(data):
    """Add six recency columns and two comparison flags to ``data``.

    ``Recency1`` is, for every row, the number of weeks between the row's
    ``Week`` and the most recent strictly earlier week in which the same
    (CustomerIdx, IsinIdx, BuySell) combination has ``CustomerInterest == 1``.
    ``Recency2`` .. ``Recency6`` are the minimum of ``Recency1`` over the rows
    of the same week that share, respectively:

    * Recency2: CustomerIdx, IsinIdx
    * Recency3: CustomerIdx, BuySell
    * Recency4: CustomerIdx
    * Recency5: IsinIdx, BuySell
    * Recency6: IsinIdx

    Rows without any earlier trade receive the column maximum. The flags
    ``Recency1isLowerRecency2`` and ``Recency2isLowerRecency4`` are 1 where the
    first recency is lower than or equal to the second (after that filling).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``CustomerIdx``, ``IsinIdx``, ``BuySell``, ``Week`` and
        ``CustomerInterest``. The forward/backward fills below follow the row
        order of the frame, so rows of each combination are assumed to arrive
        in ascending ``Week`` order -- TODO confirm against the data
        preparation step.

    Returns
    -------
    pandas.DataFrame
        A new frame with the eight added columns; the input is not modified.
    """
    pair_keys = ["CustomerIdx", "IsinIdx", "BuySell"]

    # Work on an explicit copy: adding columns to a filtered view is a silent
    # no-op (or a warning) depending on the pandas version.
    trades = data[data.CustomerInterest == 1].copy()
    trades["PrevWeek"] = (
        trades.sort_values(by=["Week"], ascending=True)
        .groupby(pair_keys)
        .Week.shift(1)
    )
    trades["CurrWeek"] = trades.Week

    # Natural join on every shared column: only trade rows pick up values.
    data = data.merge(trades, how="left")

    # Last trade week seen so far (forward fill) and the trade preceding the
    # next trade (backward fill); after the last trade fall back to CurrWeek.
    data["CurrWeek"] = data.groupby(pair_keys).CurrWeek.ffill()
    data["PrevWeek"] = data.groupby(pair_keys).PrevWeek.bfill()
    data["PrevWeek"] = data["PrevWeek"].fillna(data["CurrWeek"])
    # A reference week that is not strictly in the past carries no information.
    data["PrevWeek"] = data["PrevWeek"].where(data["PrevWeek"] < data["Week"])

    weeks_since = data["Week"] - data["PrevWeek"]
    data["Recency1"] = weeks_since.where(weeks_since > 0)
    data = data.drop(columns=["PrevWeek", "CurrWeek"])

    # Coarser recencies: per-group minimum of Recency1, broadcast back to rows.
    aggregation_keys = (
        ("Recency2", ["CustomerIdx", "IsinIdx", "Week"]),
        ("Recency3", ["CustomerIdx", "BuySell", "Week"]),
        ("Recency4", ["CustomerIdx", "Week"]),
        ("Recency5", ["IsinIdx", "BuySell", "Week"]),
        ("Recency6", ["IsinIdx", "Week"]),
    )
    for column, keys in aggregation_keys:
        data[column] = data.groupby(keys).Recency1.transform("min")

    # "Never traded before" is encoded as the largest observed recency.
    recency_columns = ["Recency1"] + [column for column, _ in aggregation_keys]
    for column in recency_columns:
        data[column] = data[column].fillna(data[column].max())

    data["Recency1isLowerRecency2"] = (data.Recency1 <= data.Recency2).astype("int64")
    data["Recency2isLowerRecency4"] = (data.Recency2 <= data.Recency4).astype("int64")

    print("Computed 8 recency features...")

    return data

### FREQUENCY (TOTAL NUMBER OF TRADES)

In [None]:
##### FUNCTION TO COMPUTE 6 FREQUENCY-BASED FEATURES (PLUS 2 COMPARISON FLAGS COMPUTED ON THE RAW COUNTS)
# 1) Number of previous customer trades with that bond with the same BuySell direction
# 2) Number of previous customer trades with that bond with any direction
# 3) Number of previous customer trades with any bond with the same BuySell direction
# 4) Number of previous customer trades with any bond with any direction
# 5) Number of previous trades with that bond by any of the customers with the same BuySell direction
# 6) Number of previous trades with that bond by any of the customers
# all frequencies are divided by the number of weeks during which they were observed

def compute_frequency(data):
    """Add six frequency columns and two comparison flags to ``data``.

    ``Frequency1`` counts, for every row, the rows with
    ``CustomerInterest == 1`` of the same (CustomerIdx, IsinIdx, BuySell)
    combination that precede it. ``Frequency2`` .. ``Frequency6`` are the sum
    of ``Frequency1`` over the rows of the same week that share, respectively:

    * Frequency2: CustomerIdx, IsinIdx
    * Frequency3: CustomerIdx, BuySell
    * Frequency4: CustomerIdx
    * Frequency5: IsinIdx, BuySell
    * Frequency6: IsinIdx

    The two flag columns compare the raw counts; afterwards all six
    frequencies are divided by ``Week - min(Week)`` (with 0 replaced by 1),
    where the minimum is taken over the frame passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``CustomerIdx``, ``IsinIdx``, ``BuySell``, ``Week`` and
        ``CustomerInterest``. The forward fill follows the row order of the
        frame, so rows of each combination are assumed to arrive in ascending
        ``Week`` order -- TODO confirm against the data preparation step.

    Returns
    -------
    pandas.DataFrame
        A new frame with the eight added columns; the input is not modified.
    """
    pair_keys = ["CustomerIdx", "IsinIdx", "BuySell"]

    # Work on an explicit copy: adding columns to a filtered view is a silent
    # no-op (or a warning) depending on the pandas version.
    trades = data[data.CustomerInterest == 1].copy()
    trades["Frequency1"] = (
        trades.sort_values(by=["Week"], ascending=True)
        .groupby(pair_keys)
        .CustomerInterest.cumsum()
    )

    # Natural join on every shared column: only trade rows pick up a count.
    data = data.merge(trades, how="left")
    data["Frequency1"] = data.groupby(pair_keys).Frequency1.ffill()

    # The running count includes the current trade; features must only see
    # *previous* trades, so trade rows are reduced by one.
    is_trade = data["CustomerInterest"] == 1
    data.loc[is_trade, "Frequency1"] = data.loc[is_trade, "Frequency1"] - 1
    data["Frequency1"] = data["Frequency1"].fillna(0)

    # Coarser frequencies: per-group sum of Frequency1, broadcast back to rows.
    aggregation_keys = (
        ("Frequency2", ["CustomerIdx", "IsinIdx", "Week"]),
        ("Frequency3", ["CustomerIdx", "BuySell", "Week"]),
        ("Frequency4", ["CustomerIdx", "Week"]),
        ("Frequency5", ["IsinIdx", "Week", "BuySell"]),
        ("Frequency6", ["IsinIdx", "Week"]),
    )
    for column, keys in aggregation_keys:
        data[column] = data.groupby(keys).Frequency1.transform("sum")

    # NOTE: the misspelled column names are kept on purpose -- they are part of
    # the exported schema and may be referenced by downstream code.
    data["Frequecny1isLowerFrequency2"] = (data.Frequency1 < data.Frequency2).astype("int64")
    data["Frequecny2isLowerFrequency4"] = (data.Frequency2 < data.Frequency4).astype("int64")

    # Normalise counts by the number of weeks elapsed since the first week in
    # the frame; the first week itself would divide by zero, so use 1 there.
    weeks_observed = data.Week - data.Week.min()
    weeks_observed = weeks_observed.mask(weeks_observed == 0, 1)
    frequency_columns = ["Frequency1"] + [column for column, _ in aggregation_keys]
    for column in frequency_columns:
        data[column] = data[column] / weeks_observed

    print("Computed 8 frequency features...")

    return data

### MONTH ID

In [None]:
##### FUNCTION TO COMPUTE MONTH
# 1) Month of each trade

def compute_month(data, trade_path="../data/raw/Trade.csv"):
    """Attach a ``Month`` column to ``data`` by looking up each ``Week``.

    The week-to-month mapping is derived from the ``TradeDateKey`` column
    (``YYYYMMDD``) of the raw trade file: every trade date is converted to its
    week index relative to 2018-04-23 and to its calendar month, and the
    distinct (Week, Month) pairs are left-joined onto ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Week`` column.
    trade_path : str, optional
        Location of the raw trade CSV; defaults to the project's
        ``../data/raw/Trade.csv``.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the additional ``Month`` column (object dtype).

    Notes
    -----
    NOTE(review): a week whose trade dates fall in two calendar months yields
    two (Week, Month) pairs, so the join returns every row of that week twice
    -- confirm whether that duplication is intended.
    NOTE(review): months taken from the trade file are integers, whereas week
    121 is assigned the string "4"; presumably that week has no trades in the
    file (e.g. the prediction week) -- confirm.
    """
    # Local copy of the week-index formula so the function is self-contained.
    def week_idx(date, end_date):
        return ((end_date - date).dt.days / 7 + 0.4).round().astype(int)

    # Only the trade date is needed, so skip parsing the other columns.
    trades = pd.read_csv(trade_path, usecols=["TradeDateKey"])
    trades["TradeDateKey"] = pd.to_datetime(trades["TradeDateKey"], format="%Y%m%d")
    trades["Week"] = week_idx(trades["TradeDateKey"], pd.Timestamp("2018-04-23 00:00:00"))
    trades["Month"] = trades["TradeDateKey"].dt.month.astype("object")
    week_month = trades.drop(columns=["TradeDateKey"]).drop_duplicates()

    data = data.merge(week_month, how="left", on="Week")
    # Single-step .loc assignment: the chained form data["Month"][mask] = ...
    # writes to a temporary and is a no-op under pandas Copy-on-Write.
    data.loc[data.Week == 121, "Month"] = "4"

    print("Computed 1 month feature...")

    return data

# 4. COMPUTING FEATURES

In [None]:
# compute past features: recency columns first, then frequency columns
data = data.pipe(compute_recency).pipe(compute_frequency)

In [None]:
# compute months: join the calendar month onto every row via its Week
data = data.pipe(compute_month)

# 5. CORRECT AND EXPORT

In [None]:
# check recency distribution: histogram of the Recency1 column
data["Recency1"].hist()

In [None]:
# clear memory
# None of these intermediate objects is created in the cells above, and a bare
# `del` of an undefined name raises NameError and aborts a top-to-bottom run.
# Drop whichever of them happen to exist and ignore the rest.
for _stale in ("bond_stat", "macro_diff1", "fx_diff1", "bond", "bond_dummies", "cust_dummies"):
    globals().pop(_stale, None)
del _stale

In [None]:
# remove first weeks: keep only rows whose Week index is above 60
data = data.loc[data["Week"] > 60]

In [None]:
# check dimensions: print the (rows, columns) shape of the current frame
print(data.shape)

In [None]:
# check NA: count missing values per column and display only the affected columns
nas = data.isna().sum()
nas.loc[nas > 0]

In [None]:
# export CSV: write the feature table gzip-compressed, without the index column
data.to_csv(
    path_or_buf = "../data/prepared/data_v4_0_60_under.csv",
    compression = "gzip",
    index = False,
)