# 1. SETTINGS

In [2]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import scipy.stats

In [3]:
# pandas options
pd.set_option("display.max_columns", None)

In [4]:
# random seed
seed = 42

# 2. DATA PARTITIONING

In [5]:
# import data
data = pd.read_csv("../data/prepared/train_new.csv")
print("Train data:", data.shape)

Train data: (111293880, 45)


In [6]:
# check data
data.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,Week,CustomerInterest,Asset Managers & Hedge Funds,Asset Owners,Banks and Intermediaries,Corporation,Official Institution - OI,Americas,Asia Pacific,"Europe, Middle East and Africa",FLOW G10,FLOW LOCAL MARKET,SAS & COVERED BONDS,A,A+,A-,AA,AA+,AA-,AAA,B,B+,B-,BB,BB+,BB-,BBB,BBB+,BBB-,C,C+,CC,CC+,CC-,CCC,CCC+,CCC-,D,DD+,DDD,DDD+,NR
0,0,16471,Buy,1.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,16471,Buy,2.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,16471,Buy,3.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,16471,Buy,4.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,16471,Buy,5.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Data is currently partitioned into three folds:

1) STATS - used to compute historical ratios and merge them to train.

2) TRAIN - used to train the algorithms; compute historical ratios and merge them to valid.

3) VALID - used to validate the models.

In [7]:
# data partitioning
stats = data[(data["Week"] >= 1)   & (data["Week"] <= 50)]
train = data[(data["Week"] >= 51)  & (data["Week"] <= 110)]
valid = data[(data["Week"] >= 111) & (data["Week"] <= 121)]
del data

# 3. FEATURE ENGINEERING

In [8]:
##### FUNCTION FOR COMPUTING NAIVE RATIOS

# EXPLANATION
# 1) Computes ratios of target based on
# - training data
# - last t weeks
# - by groupby variable
# 2) Mergres ratios to testing data

# ARGUMENTS:
# train and test = data subsets
# groupby = by what variable to compute target ratios
# t = how many weeks to go in the past

# RETURNS:
# test data with merged ratios

def compute_ratio(train, test, groupby, target = "CustomerInterest", t = 50):
    tmp   = train[train["Week"] >= (train.Week.max() - t)]
    feats = [f for f in tmp.columns if (f in groupby) | (f == target)]
    tmp   = tmp[feats]
    tmp   = tmp.groupby(groupby, as_index = False).mean()
    tmp.rename(columns = {'CustomerInterest': "ratio_" + "".join(groupby) + "_" + str(t)}, inplace = True)
    test = test.merge(tmp, how = "left", on = groupby)
    return test

In [9]:
##### FUNCTION FOR COMPUTING ALL RATIOS

# ARGUMENTS:
# train and test = data subsets

# RETURNS:
# test data with all merged ratios

def compute_all_ratios(train, test):
    
    test = compute_ratio(train, test, groupby = ["CustomerIdx"],            t = 50)
    test = compute_ratio(train, test, groupby = ["IsinIdx"],                t = 50)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "IsinIdx"], t = 50)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "BuySell"], t = 50)
    test = compute_ratio(train, test, groupby = ["IsinIdx", "BuySell"],     t = 50)

    test = compute_ratio(train, test, groupby = ["CustomerIdx"],            t = 30)
    test = compute_ratio(train, test, groupby = ["IsinIdx"],                t = 30)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "IsinIdx"], t = 30)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "BuySell"], t = 30)
    test = compute_ratio(train, test, groupby = ["IsinIdx", "BuySell"],     t = 30)

    test = compute_ratio(train, test, groupby = ["CustomerIdx"],            t = 10)
    test = compute_ratio(train, test, groupby = ["IsinIdx"],                t = 10)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "IsinIdx"], t = 10)
    test = compute_ratio(train, test, groupby = ["CustomerIdx", "BuySell"], t = 10)
    test = compute_ratio(train, test, groupby = ["IsinIdx", "BuySell"],     t = 10)

    return test

In [10]:
# computing ratios
train = compute_all_ratios(stats, train)
valid = compute_all_ratios(train, valid)

In [16]:
# check shapes
print(train.shape)
print(valid.shape)

(55646940, 55)
(9274490, 55)


# 4. MODELING

In [12]:
# extract target
y_train = train.CustomerInterest
y_valid = valid.CustomerInterest

In [13]:
# list of features
exclude  = ["Week", "CustomerIdx", "IsinIdx", "BuySell", "CustomerInterest"]
features = [f for f in train.columns if f not in exclude]

In [14]:
# subset data
train = train[features]
valid = valid[features]

In [15]:
# check shapes
print(train.shape)
print(valid.shape)

(55646940, 55)
(9274490, 55)


In [23]:
### PARAMETERS

# parallel settings
cores = 16

# learner settings
metric   = "auc"
verbose  = 100
stopping = 300

# lightGBM
gbm = lgb.LGBMClassifier(n_estimators     = 10000,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = seed,
                         num_threads      = cores)

In [None]:
# train lightGBM
gbm = gbm.fit(train, y_train, 
              eval_set = [(train, y_train), (valid, y_valid)], 
              eval_metric = metric, verbose = verbose, 
              early_stopping_rounds = stopping)
    
# save number of iterations
num_iters = gbm.best_iteration_

Training until validation scores don't improve for 300 rounds.


In [None]:
##### VARIABLE IMPORTANCE

# load variable importance
importance = pd.DataFrame()
importance["feature"] = features
importance["importance"] = gbm.feature_importances_

# plot variable importance
plt.figure(figsize = (10, 10))
sns.barplot(x = "importance", y = "feature", data = best_features.sort_values(by = "importance", ascending = False))
plt.title('LGBM Feature Importance')
plt.tight_layout()

# 5. PREDICTIONS

In [None]:
# reload data
del train, valid
train = pd.read_csv("../data/prepared/train_new.csv")
test  = pd.read_csv("../data/prepared/test_new.csv")

In [None]:
# computing ratios
test = compute_all_ratios(train, test)

In [None]:
# extract target
y_train = train.CustomerInterest

In [None]:
# list of features
exclude  = ["Week", "CustomerIdx", "IsinIdx", "BuySell", "CustomerInterest"]
features = [f for f in train.columns if f not in exclude]

In [None]:
# subset data
train = train[features]

In [None]:
# check shapes
print(train.shape)
print(test.shape)

In [29]:
# train lightGBM
gbm = gbm.fit(train, y_train, 
              eval_set = [(train, y_train)], 
              eval_metric = metric, verbose = verbose, 
              early_stopping_rounds = stopping)
    
# predict valid and test
test["CustomerInterest"] = gbm.predict_proba(test[features], num_iteration = num_iters)[:, 1]

KeyError: "['ratio1' 'ratio2' 'ratio3' 'ratio4' 'ratio5' 'ratio6' 'RatioMean'] not in index"

# 7. SUBMISSION

In [51]:
# export CSV
subm = test[["PredictionIdx", "CustomerInterest"]]
subm.CustomerInterest = probs 
subm.to_csv("../../submissions/lgb_ratios_dummies.csv", index = False, float_format = "%.8f")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
