In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer, fbeta_score

import datetime

In [27]:
# Name of the regression target column, followed by the feature columns that
# are kept from the CSV exports. Entries that are commented out are excluded.
Y_COL = 'prepaid_amount'
X_COLS = [
    'date',
    'id',
    'years_to_matur',
    'age_owner_years',
    'original_matur_years',
    'client_rate',
    'original_volume',
    'age_loan_years',
    'outstanding_volume',
    'planned_installments',
    # 'type',
    # 'RES_ADDR_CITY',
    'EDUCATION',
    'NUMBER_OF_FAMILY_MEMBERS',
    'RESIDENTAL_STATUS',
    'MARITAL_STATUS',
    'FIXED_MONTHLY_EXPENSES',
    'Flat_House',
    'OPEN_DATE',
    'INCOME_houshold',
    'dpd',
    'date_str_y',
    'avg_monthly_product_client_rate_cln',
    'avg_monthly_product_client_mtg',
    'avg_monthly_product_client_rate_mtg_grn',
    'avg_empl_enterprise',
    'register_unemployed',
    'unemployment_rate',
    'avg_monthly_salary_enterprise_val',
    'avg_monthly_salary_enterprise_index',
    'wheat_purchase_price_index',
    'milk_purchase_price_index',
    'production_price_energy_index',
    'production_price_water_supply_index',
    'inflation',
    'inflation_apartment_usage',
    'new_flats',
    'economy_index',
    'economy_index_real_estate'
]

# Load the training split first, then the validation split. Each frame keeps
# only the target followed by the features, in exactly that column order; a
# column missing from the CSV raises KeyError.
df_train = pd.read_csv("../data/data_train.csv")[[Y_COL, *X_COLS]]
df_val = pd.read_csv("../data/data_val.csv")[[Y_COL, *X_COLS]]

def factorize_df(df):
    """Return a copy of ``df`` with every object-dtype column integer-encoded.

    Each column whose dtype is ``object`` is replaced by the integer codes
    produced by ``pd.factorize``. Missing values are not mapped to the ``-1``
    sentinel; they receive a regular code of their own. Columns of any other
    dtype are left untouched, and the input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    df_new = df.copy()
    for col in df_new.columns:
        if df_new[col].dtype == 'object':
            try:
                # Current spelling of "give NaN its own code".
                codes, _ = pd.factorize(df_new[col], use_na_sentinel=False)
            except TypeError:
                # pandas < 1.5 only accepts the older spelling of the same
                # option (which newer pandas no longer supports).
                codes, _ = pd.factorize(df_new[col], na_sentinel=None)
            df_new[col] = codes
    # The encoded copy must be handed back, otherwise callers receive None.
    return df_new

# y_train = df_train[Y_COL].to_numpy()
# df_train.drop(labels=Y_COL, axis=1, inplace=True)
# X_train = df_train.to_numpy()
#
# y_val = df_val[Y_COL].to_numpy()
# df_val.drop(labels=Y_COL, axis=1, inplace=True)
# X_val = df_val.to_numpy()

In [25]:
# isinstance(df_train.date.values[0], datetime.date)
# type(df_train.dtypes)
# Scratch check of pd.factorize on a small list of strings: it returns a
# (codes, uniques) pair, with equal strings sharing the same integer code.
pd.factorize(["ab", "xd", "ab"])

(array([0, 1, 0]), array(['ab', 'xd'], dtype=object))

In [None]:
def adjusted_rmse(y_true, y_pred, weights):
    """Weighted root-mean-squared error.

    Computes ``sqrt(sum(w * (y_true - y_pred)**2) / sum(w))``.

    Parameters
    ----------
    y_true, y_pred, weights : array-like
        Targets, predictions and per-sample weights. They are converted to
        NumPy arrays first so that pandas objects are compared by position
        rather than aligned on their index.

    Returns
    -------
    float
        The weighted RMSE (NaN/inf if the weights sum to zero).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)
    squared_error = weights * (y_true - y_pred) ** 2
    return np.sqrt(np.sum(squared_error) / np.sum(weights))


def _neg_weighted_rmse_scorer(estimator, X, y):
    """Scorer with scikit-learn's ``(estimator, X, y)`` signature.

    Returns the *negated* weighted RMSE so that, as scikit-learn expects of a
    scorer, a higher value means a better model. The weights are read from the
    ``volumes`` column of the fold that is being scored, which keeps them the
    same length as that fold's targets and predictions.
    """
    # NOTE(review): assumes X is a DataFrame exposing a ``volumes`` column, as
    # the original ``X_train.volumes`` expression implied - confirm, since no
    # column of that name is selected from the CSV files.
    fold_weights = np.asarray(X.volumes)
    return -adjusted_rmse(y, estimator.predict(X), fold_weights)


def hyperopt_train_test(params):
    """Cross-validate a LightGBM regressor built from ``params``.

    Reads the notebook globals ``X_train`` and ``y_train``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor``.

    Returns
    -------
    float
        Mean cross-validation score, i.e. the mean *negated* weighted RMSE
        (higher is better). A caller that negates this value obtains the mean
        weighted RMSE, which is the quantity to minimise.
    """
    clf = lgb.LGBMRegressor(**params)
    # A plain callable is used instead of make_scorer(..., weights=...): the
    # latter would hand the full-length weight vector to every fold, where it
    # no longer matches the fold-sized targets and predictions.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring=_neg_weighted_rmse_scorer)
    return fold_scores.mean()

# hyperopt search space for the LGBMRegressor constructor arguments. Entries
# built with ``hp.*`` are sampled per trial; plain values are passed through
# unchanged on every trial.
lgb_class_params = {
    'learning_rate':    hp.uniform('learning_rate', 0.001, 0.2),
    # quniform yields floats; scope.int casts them to the int LightGBM expects.
    'max_depth':        scope.int(hp.quniform('max_depth', 4, 15, 1)),
    # 'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 20, 1)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.8),
    'subsample':        hp.choice('subsample', (0.7, 0.8, 0.9)),
    # LightGBM only applies ``subsample`` (bagging_fraction) when the bagging
    # frequency is non-zero; with the default of 0 the sampled subsample value
    # would have no effect on the model.
    'subsample_freq':   1,
    'n_estimators':     hp.choice('n_estimators', (100, 200, 400, 800)),
    'n_jobs': -1,
    # Fixed device settings: platform 0 / device 0 of LightGBM's 'gpu' device type.
    'device' : 'gpu',
    'gpu_platform_id' : 0,
    'gpu_device_id' : 0,
    'random_state': 69
}

def f(params):
    """Objective passed to hyperopt for one sampled parameter set.

    Evaluates ``params`` with ``hyperopt_train_test`` and reports the negation
    of the returned value as the trial's ``loss``, together with an OK status.
    """
    return dict(loss=-hyperopt_train_test(params), status=STATUS_OK)


# Container that records every evaluated trial (sampled values and result).
trials = Trials()

# Run the TPE search over the LightGBM search space.
# NOTE(review): max_evals=1 evaluates a single configuration - presumably a
# smoke-test setting; confirm before relying on `best`.
best = fmin(
    fn=f,
    space=lgb_class_params,
    algo=tpe.suggest,
    max_evals=1,
    trials=trials,
)

In [None]:
# Scatter the sampled value of each hyperparameter against the negated trial
# loss, one panel per hyperparameter.
#
# Only hyperparameters that were actually recorded by the trials are plotted:
# indexing a name that is absent from the search space (e.g. a commented-out
# 'min_child_weight') would otherwise raise KeyError.
parameters = [
    name
    for name in ('learning_rate', 'max_depth', 'min_child_weight')
    if any(name in t['misc']['vals'] for t in trials.trials)
]
cols = len(parameters)

# The figure is bound to `fig`, not `f`, so the objective function `f` is not
# overwritten. squeeze=False keeps `axes` indexable even with a single panel.
fig, axes = plt.subplots(nrows=1, ncols=max(cols, 1), figsize=(20, 5), squeeze=False)
axes = axes.ravel()

cmap = plt.cm.jet
for i, val in enumerate(parameters):
    # Pair every sampled value with the negated loss of the trial it came
    # from, skipping trials that did not report a loss.
    points = [
        (x, -t['result']['loss'])
        for t in trials.trials
        if 'loss' in t['result']
        for x in t['misc']['vals'].get(val, [])
    ]
    xs = np.array([x for x, _ in points])
    ys = [y for _, y in points]
    # `color=` is used because a single RGBA tuple passed as `c=` is ambiguous
    # with an array of values to be colour-mapped.
    axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.25, color=cmap(float(i) / len(parameters)))
    axes[i].set_title(val)
    axes[i].set_xlabel(val)
    # No fixed y-limits: the negated loss is not confined to [0.1, 1.0], and a
    # hard-coded range can hide every point.
    axes[i].set_ylabel('-loss')