# First Level Training 

1. Load pre-processing data
    - downcast to float32, int32 
2. Train the first level models
    - validate model with simple hold out method
    - several models
        - linear 
        - tree based
        - knn
3. dump to output for 2nd level training        

# 1. Load Data

In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
gc.enable()

In [2]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [3]:
X_train.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype=int64)

 - clip to (0,20)

In [9]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)

In [10]:
X_train.shape

(6186922, 54)

- downcast to `float32`, `int32`

In [4]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [11]:
X_train = downcast_dtypes(X_train)
X_cv = downcast_dtypes(X_cv)
X_test = downcast_dtypes(X_test)

In [7]:
y_train = y_train.astype(np.float32)
y_cv = y_cv.astype(np.float32)

In [8]:
gc.collect()

42

- `fillna` with 0

In [49]:
X_train.fillna(0,inplace=True)

In [50]:
X_cv.fillna(0,inplace=True)

In [51]:
X_test.fillna(0,inplace=True)

## 1.1 Train /test reduce
We take only `date_block_num` between `12~32`
- memory issue
- time cost issue

In [6]:
train_dates = X_train.date_block_num

mask = train_dates >= 12
X_train = X_train[mask]
y_train = y_train[mask]

train_dates = train_dates[mask]
test_dates = X_test.date_block_num
cv_dates = X_cv.date_block_num

In [12]:
X_train.shape

(6186922, 54)

In [13]:
gc.collect()

26

___

# 2. First Level Training

In [14]:
train_dates = X_train.date_block_num

In [22]:
level2_date_block = [27, 28, 29, 30, 31, 32]
# level2_date_block = [32]
level2_mask = train_dates.isin(level2_date_block)
train_dates_level2 = train_dates[level2_mask]
train_y_level2 = y_train[level2_mask]

## 2.0 custom grid search 
    - stolen from top20 kaggler

In [57]:
import copy
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

def customized_grid_search_cv_evaluate(clf, X_train, y_train, param_grid):
    
    params_list = list(ParameterGrid(param_grid))
    res_df = pd.DataFrame(data=np.zeros((len(params_list),3)), columns=['params', 'mean_test_score', 'std_test_score'])
    res_df.loc[:,'params'] = params_list
    
    for i, params in enumerate(params_list):
        scores = []
        print('Fitting: ', params, '...', end='\n\tscores= ')
        for cur_block in level2_date_block:
            copy_clf = copy.deepcopy(clf)
            original_param = copy_clf.get_params()
            original_param.update(params)
            copy_clf.set_params(**original_param) # update copy clf with trying params 
            
            copy_clf.fit(X_train[train_dates < cur_block].values, y_train[train_dates < cur_block])
            pred_y = copy_clf.predict(X_train[train_dates == cur_block].values)
            pred_y = np.clip(pred_y, 0., 20.)
            score = mean_squared_error(y_train[train_dates == cur_block], pred_y)**.5
            print('{:.5f} '.format(score), end='')
            scores.append(score)
            del copy_clf; gc.collect()
        
        print('')
        res_df.loc[i, 'mean_test_score'] = np.mean(scores)
        res_df.loc[i, 'std_test_score'] = np.std(scores)
        
    print('Fitting finished')
    res_df = res_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']
    
    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}, std: {:.4f}'.format(res_df.loc[0, 'mean_test_score'], res_df.loc[0, 'std_test_score']))
    del res_df, params_list; gc.collect()

In [34]:
def customized_grid_search_simple_holdout_evaluate(clf, X_train, y_train, param_grid, level2_date_block=[32]):
    
    params_list = list(ParameterGrid(param_grid))
    res_df = pd.DataFrame(data=np.zeros((len(params_list),2)), columns=['params', 'val_score'])
    res_df.loc[:,'params'] = params_list
    
    train_mask = train_dates < level2_date_block[0]
    validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1])
    for i, params in enumerate(params_list):
        print('Fitting: ', params, '...', end='\n\tscore= ')
        
        copy_clf = copy.deepcopy(clf)
        original_param = copy_clf.get_params()
        original_param.update(params)
        copy_clf.set_params(**original_param) # update copy clf with trying params 

        copy_clf.fit(X_train[train_mask].values, y_train[train_mask])
        pred_y = copy_clf.predict(X_train[validation_mask].values)
        pred_y = np.clip(pred_y, 0., 20.)
        score = mean_squared_error(y_train[validation_mask], pred_y)**.5
        print('{:.5f} '.format(score), end='\n')
        del copy_clf; gc.collect()

        res_df.loc[i, 'val_score'] = score
        
    print('Fitting finished')
    res_df = res_df.sort_values(by=['val_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']
    
    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}'.format(res_df.loc[0, 'val_score']))
    del res_df, params_list; gc.collect()

## 2.1 Optimization
- with `hyperopt` library 

In [32]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [17]:
level2_date_block[0]

27

- simple hold out method to optimize data

In [33]:
def objective(params):
    
    clf = params.get('clf')
    new_params = params.get('params') ## update params
#     print(params,clf.get_params())

    copy_clf = copy.deepcopy(clf)
    original_params = copy_clf.get_params()
    original_params.update(new_params)
    copy_clf.set_params(**original_params) # update copy clf with trying new_params     
    
    train_mask = train_dates < level2_date_block[0]
    ## simple hold out 
    validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1]) ## validate on the level2_date_block
    
    copy_clf.fit(X_train[train_mask].values, y_train[train_mask])
    
    
    pred_y = copy_clf.predict(X_train[validation_mask].values)
    pred_y = np.clip(pred_y, 0., 20.) # clip to (0,20)
    rmse = mean_squared_error(y_train[validation_mask], pred_y)**.5
    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(new_params))
    
    return {'loss':rmse,'status':STATUS_OK}

### Linear model 
- Ridge

In [72]:
from sklearn.linear_model import Ridge, Lasso #featuring L2/L1 regularized linear models
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import copy

In [20]:
trials = Trials()
rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])
rg_params = {
    'clf':rg_clf,
    'params': {'rg__alpha': hp.uniform('rg__alpha', 0., 0.1)},
}

best = fmin(fn = objective,
            space = rg_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 0.96954 	params: {'rg__alpha': 0.02395342034810215}
rmse: 0.96877 	params: {'rg__alpha': 0.08622612281878517}
rmse: 0.97019 	params: {'rg__alpha': 0.006143823056185916}
rmse: 0.97012 	params: {'rg__alpha': 0.007586858260528862}
rmse: 0.96892 	params: {'rg__alpha': 0.06046861417459726}
rmse: 0.96877 	params: {'rg__alpha': 0.08514347438485358}
rmse: 0.96976 	params: {'rg__alpha': 0.016729534532894386}
rmse: 0.96990 	params: {'rg__alpha': 0.012824642520251152}
rmse: 0.96878 	params: {'rg__alpha': 0.08249481663262564}
rmse: 0.96895 	params: {'rg__alpha': 0.056949530801044014}


> alpha : 0.0862

- Lasso

In [22]:
trials = Trials()
lasso_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('lasso', Lasso(random_state=0))
])

lasso_params = {
    'clf':lasso_clf,
    'params': {
        'lasso__alpha': hp.uniform('lasso__alpha',0,0.1)
    },    
}

best = fmin(fn = objective,
            space = lasso_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 0.96804 	params: {'lasso__alpha': 0.004394884942825128}
rmse: 0.98362 	params: {'lasso__alpha': 0.06646726609030369}
rmse: 0.96786 	params: {'lasso__alpha': 0.004998820940001936}
rmse: 0.96740 	params: {'lasso__alpha': 0.009427113921589337}
rmse: 0.97831 	params: {'lasso__alpha': 0.04878072359969961}
rmse: 0.98360 	params: {'lasso__alpha': 0.06638403138329811}
rmse: 0.98354 	params: {'lasso__alpha': 0.0661535831372162}
rmse: 0.97449 	params: {'lasso__alpha': 0.036369168515062424}
rmse: 0.97950 	params: {'lasso__alpha': 0.052813083979313946}
rmse: 0.97373 	params: {'lasso__alpha': 0.033979532057506666}


In [24]:
best

{'lasso__alpha': 0.009427113921589337}

> alpha: 0.009

### Tree based model 
- lightgbm

In [25]:
import lightgbm as lgb

In [26]:
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [73]:
np.arange(8,129,2,dtype=int)

array([  8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,  32,
        34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,
        60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,
        86,  88,  90,  92,  94,  96,  98, 100, 102, 104, 106, 108, 110,
       112, 114, 116, 118, 120, 122, 124, 126, 128])

In [27]:
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8)

# 1. find an optimal n_esti for rather larger learning rate
lgb_params = {
    'clf' : lgb_clf,
    'params': {
#         'boosting_type': hp.choice('boosting_type',['gbdt', 'dart']), ## gbdt 
        'boosting_type': 'gbdt',
        'num_leaves'   : hp.choice('num_leaves', np.arange(8,129,2,dtype=int)),
        'max_depth' : hp.choice("max_depth", np.arange(4, 17, dtype=int)),    
        'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1),
        'reg_lambda': hp.uniform('reg_lambda',0,1),
        'gamma' : hp.uniform('gamma', 0.1,0.5)
    }
    
}
best =fmin(fn = objective,
           space = lgb_params,
           algo = tpe.suggest,
           max_evals = 10
           )

rmse: 0.81950 	params: {'bagging_fraction': 0.7172229271595173, 'boosting_type': 'gbdt', 'feature_fraction': 0.827392642600081, 'gamma': 0.21667646875759672, 'max_depth': 15, 'num_leaves': 118, 'reg_lambda': 0.9624085073983754}
rmse: 0.82392 	params: {'bagging_fraction': 0.8023285188966327, 'boosting_type': 'gbdt', 'feature_fraction': 0.5526832299069823, 'gamma': 0.34253916651015426, 'max_depth': 6, 'num_leaves': 50, 'reg_lambda': 0.8534276453674441}
rmse: 0.83590 	params: {'bagging_fraction': 0.7549939993469661, 'boosting_type': 'gbdt', 'feature_fraction': 0.707750760639974, 'gamma': 0.15597004387841393, 'max_depth': 4, 'num_leaves': 42, 'reg_lambda': 0.41571075492555565}
rmse: 0.81929 	params: {'bagging_fraction': 0.8050181753055815, 'boosting_type': 'gbdt', 'feature_fraction': 0.5625718074967717, 'gamma': 0.43392537648110086, 'max_depth': 14, 'num_leaves': 44, 'reg_lambda': 0.32832485259738964}
rmse: 0.83001 	params: {'bagging_fraction': 0.7053748599933174, 'boosting_type': 'gbdt', 

In [28]:
best

{'bagging_fraction': 0.9788870584216797,
 'feature_fraction': 0.7591012763757823,
 'gamma': 0.1517233454100392,
 'max_depth': 8,
 'num_leaves': 51,
 'reg_lambda': 0.6813318433297506}

> rmse:0.81525

    > bagging_fraction:`0.9789`, 
    > feature_fraction:`0.759`,
    > gamma : `0.1517`,
    > reg_labmda : `0.6813`
    > max_depth : `12`,
    > num_leaves : `110`,
    > bossting_type : `gbdt`,
    


- random forest

In [29]:
from sklearn.ensemble import RandomForestRegressor

RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [36]:
rf_clf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 4,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 30,
 'min_samples_split': 300,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 4,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [39]:
original_params = rf_clf.get_params()
original_params.update(rf_params.get('params'))
# copy_clf.set_params(**original_params) 

In [42]:
rf_clf.set_params(**original_params)

RandomForestRegressor(bootstrap=True, criterion='mse',
           max_depth=<hyperopt.pyll.base.Apply object at 0x000000379CB62E48>,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=30, min_samples_split=300,
           min_weight_fraction_leaf=0.0,
           n_estimators=<hyperopt.pyll.base.Apply object at 0x000000379CB62BE0>,
           n_jobs=4, oob_score=False, random_state=None, verbose=0,
           warm_start=False)

In [52]:
train_mask = train_dates < level2_date_block[0]
    ## simple hold out 
validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1]) ## validate on the level2_date_block


In [53]:
rf_clf.fit(X_train[train_mask].values, y_train[train_mask])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=30, min_samples_split=300,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [61]:
list(range(50,301,50))

[50, 100, 150, 200, 250, 300]

In [63]:
rf_clf.set_params(**rf_params.get('params'))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=30, min_samples_split=300,
           min_weight_fraction_leaf=0.0,
           n_estimators=<hyperopt.pyll.base.Apply object at 0x00000037866A6A90>,
           n_jobs=4, oob_score=False, random_state=None, verbose=0,
           warm_start=False)

In [73]:
rf_clf = RandomForestRegressor(min_samples_split=300, min_samples_leaf=30, max_features='sqrt',n_estimators=50,
                               max_depth=4, n_jobs=4, criterion='mse')

rf_params = {
    'clf' : rf_clf,
    'params' : {
#         'n_estimators' : hp.choice('n_estimators', np.arange(50,301,50))
        'max_depth': hp.choice('max_depth',[4, 8, 12])        
    }    
}

best =fmin(fn = objective,
           space = rf_params,
           algo = tpe.suggest,
           max_evals = 1
           )

AttributeError: 'RandomForestRegressor' object has no attribute 'estimators_'

In [58]:
rf_params.get('params')

{'max_depth': <hyperopt.pyll.base.Apply at 0x378d772898>}

### KNN  model

- KNN: Prediction and Neighbor distances features

In [74]:
from sklearn.neighbors import KNeighborsRegressor
KNeighborsRegressor()

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [None]:
trials = Trials()
knn_clf = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                              metric_params=None, n_jobs=10, n_neighbors=5, p=1,
                              weights='uniform')
knn_params = {
    'clf' : knn_clf,
    'params' :{
        'weights': hp.choice('weights',['uniform', 'distance']),
        'n_neighbors': hp.choice('n_neighbors', np.arange(5, 21, 5,dtype=int))        
    }
}
best = fmin(fn = objective,
            space = knn_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 6
           )   

## Kmeans
- mini-batch kmeans cast to low dimensional features

In [19]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import calinski_harabaz_score

In [25]:
X_cv.shape

(238172, 54)

In [39]:
merge = pd.concat([X_train, X_cv,X_test], ignore_index=True)

In [29]:
merge.shape

(6639294, 54)

In [40]:
merge.tail()

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_id_avg_item_price_lag_1,item_id_sum_item_cnt_day_lag_1,item_id_avg_item_cnt_day_lag_1,shop_id_avg_item_price_lag_1,shop_id_sum_item_cnt_day_lag_1,shop_id_avg_item_cnt_day_lag_1,...,item_id_avg_item_price_lag_12,item_id_sum_item_cnt_day_lag_12,item_id_avg_item_cnt_day_lag_12,shop_id_avg_item_price_lag_12,shop_id_sum_item_cnt_day_lag_12,shop_id_avg_item_cnt_day_lag_12,item_category_id_avg_item_price_lag_12,item_category_id_sum_item_cnt_day_lag_12,item_category_id_avg_item_cnt_day_lag_12,item_cnt_month_lag_12
6639289,45,18454,34,55,99.0,2.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639290,45,16188,34,64,1359.0,1.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639291,45,15757,34,55,229.0,5.0,1.0,1176.795898,702.0,1.125,...,199.0,9.0,1.0,1273.734375,1251.0,1.268763,297.181396,9809.0,1.041406,0.0
6639292,45,19648,34,40,89.099998,2.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639293,45,969,34,37,198.0,3.0,1.0,1176.795898,702.0,1.125,...,549.0,6.0,1.0,1273.734375,1251.0,1.268763,457.671997,5185.0,1.07662,0.0


In [42]:
for c in np.arange(4,16,2):
    print('n_clusters =', c, end=' score= ')
    km = Pipeline([
        ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
        ('kmean', MiniBatchKMeans(n_clusters=c, max_no_improvement=30, 
                                  verbose=0, batch_size=1000000, random_state=0))
    ])
#     mini_kmean = MiniBatchKMeans(n_clusters=8, batch_size=10000, verbose=2, random_state=0)
    labels = km.fit_predict(merge)
    print(calinski_harabaz_score(merge, labels))

n_clusters = 4 score= 341409.019388
n_clusters = 6 score= 263314.025485
n_clusters = 8 score= 809444.498912
n_clusters = 10 score= 723085.429986
n_clusters = 12 score= 744213.776063
n_clusters = 14 score= 632340.335443


> `n_cluster = 8` ... highest score means better clustering

## 2.2 Pipeline of several models

In [13]:
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0)

In [15]:
merge_dates.shape

(6639294,)

In [20]:
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0)
print(merge_y.shape)

(6639294,)


In [23]:
stage2_train_dates = merge_dates[merge_dates>=level2_date_block[0]] ## 27 - 34
stage2_train_mask = (stage2_train_dates <= level2_date_block[-1]) # train on : 27-32
stage2_valid_mask = (stage2_train_dates == 33) ## validate on : 33
stage2_final_train_mask = (stage2_train_dates <= 33) ## 
stage2_test_mask = (stage2_train_dates==34)
stage2_train_y = merge_y[merge_dates>=level2_date_block[0]]

In [27]:
level2

array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       29, 30, 31, 32, 33, 34])

In [28]:
level2_date_block[-1]

32

______