# 1. SETTINGS

In [None]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [None]:
# warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# pandas options: no limit on the number of columns shown when displaying a frame
pd.options.display.max_columns = None

# 2. IMPORT

In [None]:
# import datasets
# the file carries a .csv extension but is read as gzip, so the codec is given explicitly
data_path = "../data/prepared/data_v2_0_100.csv"
data = pd.read_csv(data_path, compression = "gzip")

In [None]:
# check data: print (rows, columns), then preview the first rows as the cell output
data_dims = data.shape
print("Dimensions:", data_dims)
data.head()

# 3. PREPARATIONS

In [None]:
# check NA
#data.isnull().sum()

In [None]:
# list of features
# columns kept out of the model: row/customer/bond ids, BuySell, the Week used
# for splitting and the CustomerInterest target
excluded_features = ["PredictionIdx", "CustomerIdx", "IsinIdx", "BuySell", "Week", "CustomerInterest"]

# every remaining column, in its original order, is used as a predictor
is_feature = ~data.columns.isin(excluded_features)
features = list(data.columns[is_feature])
len(features)

In [None]:
### drop high recency

# keep every row of the test week (Week 121); in all other weeks keep only rows
# whose Recency1 is strictly below the maximum of the column
# NOTE(review): the filter also applies to Week 120 (validation), not only to the
# training weeks - confirm this is intended
# NOTE(review): `data` is overwritten, so re-running this cell may drop further rows
# if the maximum of Recency1 changed after the first pass
max_recency = data.Recency1.max()
is_test_week = data.Week == 121
below_max_recency = data.Recency1 < max_recency
data = data[is_test_week | below_max_recency]

In [None]:
### data partitioning

# time-based split on the Week column: < 120 train, == 120 validation, == 121 test

# training
train_rows = data.Week < 120
X_train = data.loc[train_rows, features]
y_train = data.loc[train_rows, "CustomerInterest"]

# validation
valid_rows = data.Week == 120
X_valid = data.loc[valid_rows, features]
y_valid = data.loc[valid_rows, "CustomerInterest"]

# test set
# explicit copy: a prediction column is assigned to `test` later in the notebook,
# and writing into a plain slice of `data` is chained assignment
# (SettingWithCopyWarning in pandas; hidden here by the global warning filter)
test = data.loc[data.Week == 121].copy()

# 4. MODELING - STAGE 1

## LOGISTIC REGRESSION

In [None]:
# preparations
#losses = []
#costs = [1, 2, 3]

In [None]:
# modeling loop
#for c in costs:
#    print(f'Training for c = {c}...')
#    lr = LogisticRegression(C = c)
#    lr = lr.fit(X_train, y_train)
#    probs = lr.predict_proba(X_valid)[:,1]
#    loss = roc_auc_score(y_valid, probs)
#    print(f'- AUC for c = {c} = {loss}')
#    losses.append(loss)

In [None]:
# extract the best C
#cost = costs[np.argmax(losses)]
#print("Best C =", cost)

## LIGHT GBM

In [None]:
### PARAMETERS

# parallel settings
cores = 16

# learner settings
metric   = "auc"   # passed to fit() as eval_metric
verbose  = 10      # passed to fit() as verbose
stopping = 30      # passed to fit() as early_stopping_rounds

# lightGBM
# fix: the L2 penalty was misspelled as `reg_alambda`. LGBMClassifier accepts
# arbitrary **kwargs, so the typo did not raise and `reg_lambda` silently stayed
# at its default instead of 0.1.
# NOTE(review): scores logged in this notebook before the fix were obtained
# without the intended L2 penalty - re-validate after this change.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0; confirm whether row subsampling was meant to be active.
gbm = lgb.LGBMClassifier(n_estimators     = 500,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = 42,
                         num_threads      = cores)

In [None]:
# train lightGBM
# both the training weeks and the validation week are evaluated while fitting;
# early stopping is requested with a patience of `stopping` rounds
eval_sets = [(X_train, y_train), (X_valid, y_valid)]
gbm = gbm.fit(X_train, y_train,
              eval_set = eval_sets,
              eval_metric = metric,
              verbose = verbose,
              early_stopping_rounds = stopping)

# save number of iterations (reused as n_estimators when the model is retrained in stage 2)
num_iters = gbm.best_iteration_

### RESULTS
# k = 8,  train = last 20 weeks, drop 80 weeks, only recency & frequency:           0.780024 (0.75749 LB)
# k = 57, train = last 20 weeks, drop 80 weeks, added dummies & bond data:          0.782063 (0.76356 LB)
# k = 59, train = last 20 weeks, drop 80 weeks, additional recency and frequency:   0.781909 ()
# k = 57, train = last 20 weeks, drop 30 weeks, added 50 weeks for RF computation:  0.815764 ()
# k = 59, train = last 20 weeks, drop 30 weeks, more weeks and 6 RF features:       0.817509 (0.77096 LB)
# k = 59, train = last 20 weeks, drop 0 weeks, no new features:                     0.832469 ()

In [None]:
##### VARIABLE IMPORTANCE

# load variable importance: one row per feature, paired with the fitted model's importances
importance = pd.DataFrame({"feature": features,
                           "importance": gbm.feature_importances_})

# plot variable importance as horizontal bars, rows ordered by decreasing importance
importance_sorted = importance.sort_values(by = "importance", ascending = False)
plt.figure(figsize = (10, 10))
sns.barplot(data = importance_sorted, x = "importance", y = "feature")
plt.title('LGBM Feature Importance')
plt.tight_layout()
plt.savefig("../var_importance.pdf")

# 5. MODELING - STAGE 2

In [None]:
# use full data as train: every week up to and including week 120,
# i.e. the stage-1 training weeks plus the former validation week
full_train_rows = data.Week <= 120
X_train = data.loc[full_train_rows, features]
y_train = data.loc[full_train_rows, "CustomerInterest"]

## LOGISTIC REGRESSION

In [None]:
# retrain model with the best C
#print(f'Training for c = {cost}...')
#lr = LogisticRegression(C = cost)
#lr = lr.fit(X_train, y_train)

In [None]:
# predict test set
#test["CustomerInterestLOG"] = lr.predict_proba(test[features])[:,1]

## LIGHT GBM

In [None]:
### retrain model with the best iters

# lightGBM
# the number of trees is fixed to `num_iters`, so no early stopping is used in this fit
# fix: the L2 penalty was misspelled as `reg_alambda`. LGBMClassifier accepts
# arbitrary **kwargs, so the typo did not raise and `reg_lambda` silently stayed
# at its default instead of 0.1.
gbm = lgb.LGBMClassifier(n_estimators     = num_iters,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = 42,
                         num_threads      = cores)

# train lightGBM
# the training data is the only evaluation set here
gbm = gbm.fit(X_train, y_train, 
              eval_set = [(X_train, y_train)], 
              eval_metric = metric, verbose = verbose)

In [None]:
# predict test set
# predict_proba returns one column per class; the second column is stored as the score
# (presumably the probability of CustomerInterest == 1 - confirm against gbm.classes_)
test_probs = gbm.predict_proba(test[features], num_iteration = num_iters)
test["CustomerInterestLGB"] = test_probs[:, 1]

# 6. SUBMISSION

In [None]:
# check rank correlation with the best submission
# NOTE(review): spearmanr compares the two inputs position by position, so this
# assumes the earlier submission file lists the same rows in the same order as `test`
from scipy.stats import spearmanr
best = pd.read_csv("../submissions/data_v2_30_lgb_val08175.csv")
new_scores = test["CustomerInterestLGB"]
best_scores = best["CustomerInterest"]
spearmanr(new_scores, best_scores)

In [None]:
# export CSV
# keep the row id and the LightGBM score, exposing the score under the name "CustomerInterest"
subm = test[["PredictionIdx", "CustomerInterestLGB"]].rename(
    columns = {"CustomerInterestLGB": "CustomerInterest"})
subm.to_csv("../submissions/data_v2_0_100_lgb_val0832469.csv",
            index = False,
            float_format = "%.8f")