In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer, fbeta_score
from statsmodels.tsa.vector_ar.var_model import VAR

import datetime

In [85]:
def date_to_float(v):
    """Convert ``"%Y-%m-%d"`` date strings into fractional-year floats.

    Each date is mapped to ``year + month / 12``; the day of the month is
    parsed but does not contribute to the result.

    Parameters
    ----------
    v : sized sequence of str
        Date strings in ``YYYY-MM-DD`` format.

    Returns
    -------
    numpy.ndarray
        Float array with one value per input string, in input order.

    Raises
    ------
    ValueError
        If a string does not match the ``"%Y-%m-%d"`` format.
    """
    parsed_dates = (datetime.datetime.strptime(text, "%Y-%m-%d") for text in v)
    fractional_years = (d.year + d.month / 12 for d in parsed_dates)
    # count=len(v) pre-sizes the output, one slot per input element.
    return np.fromiter(fractional_years, dtype=float, count=len(v))

def shift_date_columns(df, date_cols, horizon=12, up=True):
    """Extend the date-like columns of ``df`` with a shifted copy of their tail.

    The last ``horizon`` rows of ``date_cols`` are copied, moved by one unit
    (``+1`` when ``up`` is true, ``-1`` otherwise) and appended below the
    existing rows. ``df`` itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing every column named in ``date_cols``.
    date_cols : list of str
        Numeric, time-like columns to extend.
    horizon : int, default 12
        Number of trailing rows to copy and shift.
    up : bool, default True
        Shift direction: ``True`` adds 1, ``False`` subtracts 1.

    Returns
    -------
    pandas.DataFrame
        The ``date_cols`` columns of ``df`` followed by the shifted rows,
        with a fresh ``0..n-1`` index.

    Notes
    -----
    The shift is a fixed 1 regardless of ``horizon``.
    NOTE(review): presumably this is one year for 12 monthly rows — confirm
    before using another horizon.
    """
    date_df = df[date_cols].copy()
    step = 1 if up else -1

    # Shift all columns at once; adding a scalar yields a new frame, so neither
    # `df` nor `date_df` is mutated.
    shifted_tail = date_df.tail(horizon) + step

    # pd.concat returns a new frame (it does not modify `date_df` in place);
    # ignore_index avoids duplicating the index labels of the copied rows.
    return pd.concat([date_df, shifted_tail], ignore_index=True)

In [78]:
# Name of the target column.
Y_COL = 'prepaid_amount'

# Time-like columns. Recorded adjustment per column:
#   date: + 1/12, years_to_matur: - 1/12, age_owner_years: + 1/12
TIME_COLS = ['date', 'years_to_matur', 'age_owner_years']

# Columns that would need rolling forward; hard to implement, so they may have to be left out.
COLS_TO_ROLL = ['outstanding_volume']

KNOWN_FUTURE_COLS = ['volume_schedule', 'planned_installments']

STATIC_COLS = ['id', 'client_rate', 'original_volume', 'type', 'original_matur_years']

APROXIMATELY_STATIC = [
    'EDUCATION', 'NUMBER_OF_FAMILY_MEMBERS', 'RESIDENTAL_STATUS',
    'MARITAL_STATUS', 'FIXED_MONTHLY_EXPENSES', 'Flat_House',
]

MACRO_COLS = [
    'avg_monthly_product_client_rate_cln', 'avg_monthly_product_client_mtg',
    'avg_empl_enterprise', 'register_unemployed',
    'unemployment_rate', 'avg_monthly_salary_enterprise_val',
    'avg_monthly_salary_enterprise_index', 'wheat_purchase_price_index',
    'milk_purchase_price_index', 'production_price_energy_index',
    'production_price_water_supply_index', 'inflation',
    'inflation_apartment_usage', 'new_flats',
    'economy_index', 'economy_index_real_estate',
]

# 'dpd' is random and cannot be forecast.
OTHER_COLS = ['dpd']

# Columns left out, with the reason for each.
REDUNDANT_COLS = [
    'Unnamed: 0',
    'date_str_x',
    'date_str_y',
    # collinear with "years_to_matur"
    'age_load_years',
    # too many unique values
    'RES_ADDR_CITY',
    # collinear with "date", "years_to_matur" and "original_matur_days"
    'OPEN_DATE',
    # missing data -> unable to forecast
    'avg_monthly_product_client_rate_mtg_grn',
]

In [82]:
# Raw inputs: training and validation tables from CSV, macro indicators from
# the 'macro' sheet of the Excel workbook.
df_train = pd.read_csv("../data/data_train.csv")
df_macro = pd.read_excel(
    "../data/Task_data.xlsx",
    sheet_name='macro',
    engine='openpyxl',
)
df_val = pd.read_csv("../data/data_val.csv")

# Replace the training date strings with fractional-year floats.
df_train = df_train.assign(date=date_to_float(df_train["date"].values))

# Non-macro training features (in the future add COLS_TO_ROLL).
df_train_non_macro = df_train.loc[
    :, [*TIME_COLS, *KNOWN_FUTURE_COLS, *STATIC_COLS, *APROXIMATELY_STATIC]
].copy()

# Macro training frame: the macro columns without the last 12 rows of the sheet.
df_train_macro = df_macro.loc[:, MACRO_COLS].iloc[:(len(df_macro) - 12)].copy()

In [84]:
# Fit a vector autoregression on the macro training frame with a constant
# trend term ('c'). maxlags=None leaves the lag order to statsmodels' default.
# NOTE(review): confirm that the default lag order is what is wanted here.
macro_var_model_fitted = VAR(endog=df_train_macro).fit(maxlags=None, trend='c')

# Forecast 12 steps ahead, using the training macro values as the initial observations.
var_model_forecast = macro_var_model_fitted.forecast(steps=12, y=df_train_macro.to_numpy())
# Bare expression so the notebook displays the forecast array as the cell output.
var_model_forecast

array([[5.58447820e-02, 3.29118833e-02, 6.46080486e+03, 8.90554219e+02,
        5.18324077e+00, 5.36847395e+03, 1.03360061e+02, 8.52350150e+01,
        9.91294933e+01, 1.03471179e+02, 1.05050041e+02, 1.04861823e+02,
        1.06880665e+02, 1.78530850e+04, 2.03258984e+00, 1.18692183e+01],
       [5.62880208e-02, 3.35300673e-02, 6.48584246e+03, 8.58864816e+02,
        4.98037349e+00, 5.23608745e+03, 1.03948091e+02, 8.98139705e+01,
        9.48565172e+01, 1.03695948e+02, 1.03888811e+02, 1.04430089e+02,
        1.06233970e+02, 1.77752395e+04, 4.99246975e+00, 1.19045667e+01],
       [5.72507933e-02, 3.33602208e-02, 6.48100139e+03, 8.30455631e+02,
        4.85435749e+00, 5.28121108e+03, 1.04421115e+02, 9.40007500e+01,
        9.14890163e+01, 1.03645032e+02, 1.02868086e+02, 1.03778936e+02,
        1.05269494e+02, 1.77744194e+04, 5.36377793e+00, 1.21988766e+01],
       [5.74335187e-02, 3.32382101e-02, 6.48576947e+03, 8.17179431e+02,
        4.79098062e+00, 5.32544483e+03, 1.05106547e+02, 9.547

In [None]:
def adjusted_rmse(y_true, y_pred, weights):
    """Weighted root-mean-squared error.

    Computes ``sqrt(sum(weights * (y_true - y_pred)**2) / sum(weights))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; they must support element-wise
        subtraction (e.g. numpy arrays of the same shape).
    weights : array-like
        Per-observation weights, broadcastable against ``y_true - y_pred``.

    Returns
    -------
    float
        The weighted RMSE. Previously the value was computed but never
        returned, so every caller received ``None``.
    """
    weighted_squared_error = np.sum(weights * (y_true - y_pred) ** 2)
    return np.sqrt(weighted_squared_error / np.sum(weights))

def hyperopt_train_test(params):
    """Cross-validate a LightGBM regressor and return its mean weighted RMSE.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor``.

    Returns
    -------
    float
        Mean over the cross-validation folds of ``adjusted_rmse``
        (lower is better).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``. The weights come
    from the ``volumes`` attribute of each validation fold, not from the full
    ``X_train``. Passing the full-length weights made them mismatch the
    fold-sized targets and predictions.
    NOTE(review): assumes ``X_train`` is a pandas DataFrame, so each fold
    still exposes ``volumes`` — confirm.
    """
    clf = lgb.LGBMRegressor(**params)

    def fold_weighted_rmse(estimator, X_fold, y_fold):
        # scikit-learn calls a scorer callable as scorer(estimator, X, y) with
        # the validation fold, so all three arrays below have the fold's length.
        return adjusted_rmse(
            np.ravel(y_fold),
            estimator.predict(X_fold),
            np.ravel(X_fold.volumes),
        )

    loss = cross_val_score(clf, X_train, y_train, scoring=fold_weighted_rmse).mean()
    return loss

# Hyperopt search space plus the fixed settings; each sampled dict is passed
# as keyword arguments to lgb.LGBMRegressor.
lgb_class_params = {
    # --- sampled hyperparameters ---
    'learning_rate':    hp.uniform('learning_rate', 0.001, 0.2),
    'max_depth':        scope.int(hp.quniform('max_depth', 4, 15, 1)),
    # 'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 20, 1)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.8),
    'subsample':        hp.choice('subsample', (0.7, 0.8, 0.9)),
    'n_estimators':     hp.choice('n_estimators', (100, 200, 400, 800)),
    # --- fixed settings ---
    # LightGBM only applies `subsample` when the bagging frequency is non-zero
    # (the default of 0 disables bagging), so without this entry tuning
    # `subsample` has no effect. 1 = re-sample the rows at every iteration.
    'subsample_freq': 1,
    'n_jobs': -1,
    'device' : 'gpu',
    'gpu_platform_id' : 0,
    'gpu_device_id' : 0,
    'random_state': 69
}

def f(params):
    """Hyperopt objective: evaluate one hyperparameter sample.

    Parameters
    ----------
    params : dict
        One sample drawn from the search space.

    Returns
    -------
    dict
        ``{'loss': <mean cross-validated weighted RMSE>, 'status': STATUS_OK}``.

    Notes
    -----
    ``fmin`` minimises ``'loss'`` and the value from ``hyperopt_train_test``
    is an error (lower is better), so it is passed through unchanged.
    Negating it, as before, made the search favour the worst models.
    """
    loss = hyperopt_train_test(params)
    return {'loss': loss, 'status': STATUS_OK}


# Container that records every evaluation made by the search.
trials = Trials()

# Run the TPE search over the space with objective `f`; max_evals=1 performs a single evaluation.
best = fmin(
    fn=f,
    space=lgb_class_params,
    algo=tpe.suggest,
    max_evals=1,
    trials=trials,
)

In [None]:
# Hyperparameters to plot against the objective value. Only names that are
# sampled in the search space have an entry in each trial's 'vals'.
# 'min_child_weight' is commented out of the space, so looking it up raised
# KeyError and it is not listed here.
parameters = ['learning_rate', 'max_depth']
cols = len(parameters)
# Bind the figure to `fig`, not `f`, so the objective function `f` is not
# overwritten. squeeze=False keeps `axes` two-dimensional for any column count.
fig, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5), squeeze=False)

cmap = plt.cm.jet
for i, val in enumerate(parameters):
    ax = axes[0, i]
    xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()
    # The stored loss is an RMSE up to sign and an RMSE is non-negative, so its
    # magnitude is the error whichever sign convention the objective used.
    ys = [abs(t['result']['loss']) for t in trials.trials]
    # `color=` takes one RGBA tuple for all points; `c=` would treat a 4-tuple ambiguously.
    ax.scatter(xs, ys, s=20, linewidth=0.01, alpha=0.25, color=cmap(float(i) / len(parameters)))
    ax.set_title(val)
    # NOTE(review): fixed y-range; points outside [0.1, 1.0] are not shown —
    # confirm this range matches the scale of the error being plotted.
    ax.set_ylim([0.1, 1.0])