In [1]:
import copy
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Explicitly switch automatic garbage collection on (CPython enables it by
# default, so this only acts as a safeguard).
gc.enable()

In [4]:
# Load the pre-built feature matrices and targets from the HDF5 feature store.
# mode='r' opens the file read-only; the default mode ('a') takes a write
# handle and silently creates an empty file when the path does not exist.
with pd.HDFStore('../data/feat/data.h5', mode='r') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [6]:
# List the distinct date_block_num values present in the training frame.
X_train.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype=int64)

In [37]:
# Clamp both targets to the closed interval [0, 20].
y_train, y_cv = (target.clip(lower=0, upper=20) for target in (y_train, y_cv))

In [13]:
# Dimensions (rows, columns) of the training frame before the date-block
# reduction further down.
X_train.shape

(10675632, 54)

In [9]:
def downcast_dtypes(df):
    '''
        Narrow the 64-bit numeric columns of `df`:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched.  The frame is
        modified in place and the very same object is returned.  Values are
        not range-checked before the integer conversion.
    '''

    # (wide dtype name, narrow replacement) pairs; floats are handled first.
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide_name, narrow_type in conversions:
        selected = [name for name in df.columns if df[name].dtype == wide_name]
        df[selected] = df[selected].astype(narrow_type)

    return df

In [10]:
# Narrow the numeric columns of all three feature frames.
X_train, X_cv, X_test = map(downcast_dtypes, (X_train, X_cv, X_test))

In [11]:
# Store both targets as 32-bit floats.
y_train, y_cv = (target.astype(np.float32) for target in (y_train, y_cv))

In [12]:
# Run a full garbage-collection pass (placed right after the dtype downcasts).
gc.collect()

49

In [22]:
# Replace missing feature values with 0, modifying X_train in place.
X_train.fillna(0,inplace=True)

In [23]:
# Replace missing feature values with 0, modifying X_cv in place.
# NOTE(review): X_test is not filled anywhere in the visible cells — confirm
# whether it needs the same treatment before it is used for prediction.
X_cv.fillna(0,inplace=True)

# Train/test reduction
The training data is cut down to address:
- memory usage
- training time

In [40]:
# Keep only the rows whose date block is 12 or later.
train_dates = X_train['date_block_num']
mask = train_dates.ge(12)
X_train, y_train = X_train[mask], y_train[mask]

In [41]:
# Keep the date vector aligned with the filtered frame.
# (It is derived again from X_train at the start of "First Level Training".)
train_dates = train_dates[mask]

In [43]:
# Dimensions (rows, columns) of the training frame after the reduction.
X_train.shape

(6186922, 54)

In [44]:
# Run a full garbage-collection pass (placed right after the row filtering).
gc.collect()

221

# First Level Training

In [45]:
# Date block of every remaining training row.  The evaluation helpers defined
# below read this module-level Series to build their train/validation masks.
train_dates = X_train.date_block_num

In [46]:
# Date blocks held out for validation / second-level data.
# A six-block alternative, [27, 28, 29, 30, 31, 32], was used at some point;
# NOTE(review): the ridge grid-search outputs further down print six scores
# per setting, which suggests they were produced with that longer list —
# re-run them if the single-block setting below is the intended one.
level2_date_block = [32]
level2_mask = train_dates.isin(level2_date_block)
# Dates and targets of the held-out rows.
train_dates_level2 = train_dates[level2_mask]
train_y_level2 = y_train[level2_mask]

In [57]:
import copy
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

def customized_grid_search_cv_evaluate(clf, X_train, y_train, param_grid,
                                       date_blocks=None, dates=None):
    """
    Grid-search `param_grid` for `clf` with an expanding-window time split.

    For every parameter combination and every block ``b`` in `date_blocks`,
    a deep copy of `clf` is fitted on the rows dated ``< b`` and scored on the
    rows dated ``== b``.  The score is the RMSE of the predictions clipped to
    [0, 20].  Combinations are ranked by mean score, then by its standard
    deviation (both ascending).

    Parameters
    ----------
    clf : estimator exposing ``set_params``, ``fit`` and ``predict``.
    X_train : pandas.DataFrame of features.
    y_train : pandas object aligned with `X_train` (boolean-mask indexable).
    param_grid : dict or list of dicts, forwarded to ``ParameterGrid``.
    date_blocks : iterable of validation blocks, optional.  Defaults to the
        module-level ``level2_date_block``.
    dates : per-row date blocks aligned with `X_train`, optional.  Defaults
        to the module-level ``train_dates``.

    Returns
    -------
    dict
        The best-ranked parameter combination.

    Raises
    ------
    ValueError
        If `date_blocks` is empty or `param_grid` yields no combination.
    """
    # Fall back to the notebook-level state the function historically read.
    if date_blocks is None:
        date_blocks = level2_date_block
    if dates is None:
        dates = train_dates

    date_blocks = list(date_blocks)
    if not date_blocks:
        raise ValueError('date_blocks must contain at least one validation block')

    params_list = list(ParameterGrid(param_grid))
    if not params_list:
        raise ValueError('param_grid produced no parameter combination')

    # The fold masks do not depend on the hyper-parameters: build them once.
    folds = [(dates < block, dates == block) for block in date_blocks]

    mean_scores, std_scores = [], []
    for params in params_list:
        scores = []
        print('Fitting: ', params, '...', end='\n\tscores= ')
        for fit_mask, val_mask in folds:
            copy_clf = copy.deepcopy(clf)
            # Only the overrides are set; every other parameter keeps the
            # value carried over by the deep copy.
            copy_clf.set_params(**params)

            # The arrays are sliced afresh for every fit because an estimator
            # may modify the array it is handed.
            copy_clf.fit(X_train[fit_mask].values, y_train[fit_mask])
            pred_y = np.clip(copy_clf.predict(X_train[val_mask].values), 0., 20.)
            score = mean_squared_error(y_train[val_mask], pred_y) ** .5
            print('{:.5f} '.format(score), end='')
            scores.append(score)
            del copy_clf; gc.collect()

        print('')
        mean_scores.append(np.mean(scores))
        std_scores.append(np.std(scores))

    print('Fitting finished')
    # Build the table column by column so the dict-valued 'params' column is
    # never written into a float-typed column.
    res_df = pd.DataFrame({
        'params': params_list,
        'mean_test_score': mean_scores,
        'std_test_score': std_scores,
    })
    res_df = res_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']

    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}, std: {:.4f}'.format(res_df.loc[0, 'mean_test_score'], res_df.loc[0, 'std_test_score']))
    return best_params

In [34]:
def customized_grid_search_simple_holdout_evaluate(clf, X_train, y_train, param_grid, level2_date_block=[32],
                                                   dates=None):
    """
    Grid-search `param_grid` for `clf` on a single time-based hold-out split.

    Rows dated before ``level2_date_block[0]`` form the training part; rows
    dated within ``[level2_date_block[0], level2_date_block[-1]]`` form the
    validation part.  Each parameter combination is fitted on a deep copy of
    `clf` and scored by the RMSE of its predictions clipped to [0, 20].

    Parameters
    ----------
    clf : estimator exposing ``set_params``, ``fit`` and ``predict``.
    X_train : pandas.DataFrame of features.
    y_train : pandas object aligned with `X_train` (boolean-mask indexable).
    param_grid : dict or list of dicts, forwarded to ``ParameterGrid``.
    level2_date_block : sequence of validation blocks; only its first and
        last entries are used.  The default list is never mutated.
    dates : per-row date blocks aligned with `X_train`, optional.  Defaults
        to the module-level ``train_dates``.

    Returns
    -------
    dict
        The parameter combination with the lowest validation RMSE.

    Raises
    ------
    ValueError
        If `level2_date_block` is empty or `param_grid` yields no combination.
    """
    # Fall back to the notebook-level state the function historically read.
    if dates is None:
        dates = train_dates
    if len(level2_date_block) == 0:
        raise ValueError('level2_date_block must contain at least one block')

    params_list = list(ParameterGrid(param_grid))
    if not params_list:
        raise ValueError('param_grid produced no parameter combination')

    first_block, last_block = level2_date_block[0], level2_date_block[-1]
    train_mask = dates < first_block
    validation_mask = (dates >= first_block) & (dates <= last_block)

    val_scores = []
    for params in params_list:
        print('Fitting: ', params, '...', end='\n\tscore= ')

        copy_clf = copy.deepcopy(clf)
        # Only the overrides are set; every other parameter keeps the value
        # carried over by the deep copy.
        copy_clf.set_params(**params)

        # The arrays are sliced afresh for every fit because an estimator may
        # modify the array it is handed.
        copy_clf.fit(X_train[train_mask].values, y_train[train_mask])
        pred_y = np.clip(copy_clf.predict(X_train[validation_mask].values), 0., 20.)
        score = mean_squared_error(y_train[validation_mask], pred_y) ** .5
        print('{:.5f} '.format(score), end='\n')
        del copy_clf; gc.collect()

        val_scores.append(score)

    print('Fitting finished')
    # Build the table column by column so the dict-valued 'params' column is
    # never written into a float-typed column.
    res_df = pd.DataFrame({'params': params_list, 'val_score': val_scores})
    # A stable sort keeps grid order among equal scores.
    res_df = res_df.sort_values(by=['val_score'], ascending=True, kind='mergesort').reset_index(drop=True)
    best_params = res_df.loc[0, 'params']

    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}'.format(res_df.loc[0, 'val_score']))
    return best_params

In [54]:
# Hyperopt search space for a tree-boosting model.
params_grid = {
    # hp.quniform draws floats (e.g. 7.0); scope.int casts each draw so the
    # integer-valued tree parameters receive real ints.
    'max_depth': scope.int(hp.quniform("max_depth", 4, 16, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 8, 128, 2)),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    # NOTE(review): 'gamma' does not appear among the LGBMRegressor parameters
    # printed in the LightGBM section of this notebook — confirm which model
    # this space is meant for.
    'gamma': hp.uniform('gamma', 0.1, 0.5),
}


# Hyperparameter optimization test (hyperopt)

In [60]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
def customized_opt_simple_holdout_evaluate(clf, X_train, y_train, params, level2_date_block=[32]):
    """
    Placeholder for a hyperopt-driven search of the `best` params for clf.

    TODO: not implemented yet — the body consists of this docstring only, so
    calling the function does nothing and returns None.
    """
    
    

In [81]:
def objective(params):
    """
    Hyperopt objective: hold-out RMSE of one sampled configuration.

    `params` carries the base estimator under ``'clf'`` and the sampled
    hyper-parameters under ``'params'``.  A deep copy of the estimator is
    configured with them, fitted on the rows dated before the first entry of
    the module-level ``level2_date_block`` and scored on the rows dated
    between its first and last entries (inclusive).  ``X_train``, ``y_train``
    and ``train_dates`` are read from module scope.  Predictions are clipped
    to [0, 20] before scoring.

    Returns ``{'loss': rmse, 'status': STATUS_OK}``.
    """
    base_model = params.get('clf')
    sampled = params.get('params')

    # Layer the sampled values over the copy's current parameters.
    model = copy.deepcopy(base_model)
    merged = model.get_params()
    merged.update(sampled)
    model.set_params(**merged)

    first_block = level2_date_block[0]
    last_block = level2_date_block[-1]
    fit_rows = train_dates < first_block
    holdout_rows = (train_dates >= first_block) & (train_dates <= last_block)

    model.fit(X_train[fit_rows].values, y_train[fit_rows])

    holdout_pred = np.clip(model.predict(X_train[holdout_rows].values), 0., 20.)
    rmse = mean_squared_error(y_train[holdout_rows], holdout_pred) ** .5
    print('rmse: {:.5f} '.format(rmse), end='\n')

    return {'loss': rmse, 'status': STATUS_OK}

In [84]:
# Tune the ridge penalty with hyperopt's TPE sampler (10 evaluations).
# NOTE(review): no `rstate` is passed to fmin, so the sampled alphas are
# presumably not reproducible from run to run — confirm and seed if needed.
trials = Trials()
# Scaler + ridge pipeline (the same definition is repeated in the
# "Linear model" section).
rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])
# `objective` reads the estimator from 'clf' and the sampled values from 'params'.
clf_params = {
    'clf':rg_clf,
    'params': {'rg__alpha': hp.uniform('rg__alpha', 0.01, 1.0)},
}

best = fmin(fn = objective,
            space = clf_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 1.06686 
rmse: 1.06106 
rmse: 1.06349 
rmse: 1.05911 
rmse: 1.06687 
rmse: 1.06715 
rmse: 1.05252 
rmse: 1.06831 
rmse: 1.06796 
rmse: 1.06450 


In [85]:
# Parameter assignment returned by fmin for the search above.
best

{'rg__alpha': 0.012834387887666377}

## Linear model 
- Ridge

In [32]:
from sklearn.linear_model import Ridge, Lasso #featuring L2/L1 regularized linear models
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
import copy

In [58]:
# Coarse search over the ridge penalty, scored block by block.
rg_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0)),
])
rg_params = dict(rg__alpha=[0.1, 1., 2., 4.])
customized_grid_search_cv_evaluate(rg_clf, X_train, y_train, param_grid=rg_params)

Fitting:  {'rg__alpha': 0.1} ...
	scores= 4.47786 3.66831 2.06927 1.88897 1.97926 6.70534 
Fitting:  {'rg__alpha': 1.0} ...
	scores= 4.47854 3.66718 2.06686 1.89577 1.98347 6.72269 
Fitting:  {'rg__alpha': 2.0} ...
	scores= 4.47944 3.66787 2.06959 1.89999 1.99009 6.73109 
Fitting:  {'rg__alpha': 4.0} ...
	scores= 4.48113 3.67167 2.07809 1.90774 2.00280 6.74004 
Fitting finished
Selected hyper-params: {'rg__alpha': 0.1}
cv score: 3.4648, std: 1.7420


In [62]:
# Finer search over small ridge penalties, scored block by block.
rg_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0)),
])
rg_params = dict(rg__alpha=[0.01, 0.05, 0.1, 0.25, 0.5])
customized_grid_search_cv_evaluate(rg_clf, X_train, y_train, param_grid=rg_params)

Fitting:  {'rg__alpha': 0.01} ...
	scores= 4.47895 3.66958 2.07293 1.88877 1.98052 6.70177 
Fitting:  {'rg__alpha': 0.05} ...
	scores= 4.47828 3.66887 2.07093 1.88875 1.97977 6.70344 
Fitting:  {'rg__alpha': 0.1} ...
	scores= 4.47786 3.66831 2.06927 1.88897 1.97926 6.70534 
Fitting:  {'rg__alpha': 0.25} ...
	scores= 4.47758 3.66745 2.06701 1.89016 1.97912 6.70990 
Fitting:  {'rg__alpha': 0.5} ...
	scores= 4.47787 3.66703 2.06623 1.89239 1.98027 6.71513 
Fitting finished
Selected hyper-params: {'rg__alpha': 0.1}
cv score: 3.4648, std: 1.7420


In [64]:
# Show Ridge's default parameters for reference (the pipelines in this
# notebook override `normalize`, `max_iter` and `random_state`).
Ridge().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [39]:
# Same fine grid, scored on a single hold-out block (32).
rg_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0)),
])
rg_params = dict(rg__alpha=[0.01, 0.05, 0.1, 0.25, 0.5])
customized_grid_search_simple_holdout_evaluate(
    rg_clf, X_train, y_train, rg_params, level2_date_block=[32]
)

Fitting:  {'rg__alpha': 0.01} ...
	score= 1.04800 
Fitting:  {'rg__alpha': 0.05} ...
	score= 1.04902 
Fitting:  {'rg__alpha': 0.1} ...
	score= 1.05023 
Fitting:  {'rg__alpha': 0.25} ...
	score= 1.05349 
Fitting:  {'rg__alpha': 0.5} ...
	score= 1.05799 
Fitting finished
Selected hyper-params: {'rg__alpha': 0.01}
cv score: 1.0480


## Gradient boosting with LightGBM

In [86]:
import lightgbm as lgb

In [87]:
# Display the default LGBMRegressor configuration for reference.
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [88]:
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=4)

# Compare the 'gbdt' and 'dart' boosting types on the hold-out split, with
# every other setting left as configured above.
# NOTE(review): the comment previously here described finding n_estimators
# for a larger learning rate, which this grid does not do.
lgb_params = {
    'boosting_type': ['gbdt', 'dart']
}
customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

Fitting:  {'boosting_type': 'gbdt'} ...
	score= 3.86143 
Fitting:  {'boosting_type': 'dart'} ...
	score= 3.86417 
Fitting finished
Selected hyper-params: {'boosting_type': 'gbdt'}
cv score: 3.8614


In [38]:
# Fit one LightGBM regressor on the training rows and report the l2 metric on
# the X_cv / y_cv set after every boosting round (verbose=True).
# NOTE(review): 'task' is forwarded as an extra keyword (it shows up in the
# estimator repr below) — confirm the sklearn wrapper actually uses it.
# NOTE(review): 'metric' is a set holding both 'l2' and 'mse'; the log below
# only reports l2, so these are presumably aliases — confirm.
# NOTE(review): no random_state is given although bagging is switched on, so
# results are presumably not reproducible — confirm.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'mse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
gbm = lgb.LGBMRegressor(**params)
gbm.fit(X_train,y_train,eval_metric='l2',eval_set=[(X_cv,y_cv)],verbose=True)

[1]	valid_0's l2: 1.25148
[2]	valid_0's l2: 1.21532
[3]	valid_0's l2: 1.18213
[4]	valid_0's l2: 1.15218
[5]	valid_0's l2: 1.12544
[6]	valid_0's l2: 1.10107
[7]	valid_0's l2: 1.08161
[8]	valid_0's l2: 1.06201
[9]	valid_0's l2: 1.04644
[10]	valid_0's l2: 1.03051
[11]	valid_0's l2: 1.01674
[12]	valid_0's l2: 1.00508
[13]	valid_0's l2: 0.994445
[14]	valid_0's l2: 0.983983
[15]	valid_0's l2: 0.976331
[16]	valid_0's l2: 0.967952
[17]	valid_0's l2: 0.961462
[18]	valid_0's l2: 0.955311
[19]	valid_0's l2: 0.949478
[20]	valid_0's l2: 0.944066
[21]	valid_0's l2: 0.939371
[22]	valid_0's l2: 0.934854
[23]	valid_0's l2: 0.931173
[24]	valid_0's l2: 0.927503
[25]	valid_0's l2: 0.92477
[26]	valid_0's l2: 0.922092
[27]	valid_0's l2: 0.919534
[28]	valid_0's l2: 0.917342
[29]	valid_0's l2: 0.914299
[30]	valid_0's l2: 0.912371
[31]	valid_0's l2: 0.91075
[32]	valid_0's l2: 0.907972
[33]	valid_0's l2: 0.906328
[34]	valid_0's l2: 0.904798
[35]	valid_0's l2: 0.9034
[36]	valid_0's l2: 0.902357
[37]	valid_0's l2

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
       learning_rate=0.05, max_depth=-1, metric={'l2', 'mse'},
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
       task='train', verbose=0)