# First Level Training 

1. Load pre-processing data
    - downcast to float32, int32 
2. Train the first level models
    - validate model with simple hold out method
    - several models
        - linear 
        - tree based
        - knn
        - kmean
    - features gen by stacking 
3. Text features extraction 
    - tfidf + svd
    - tfidf(Binary) + svd
    - hash + svd
    - hash(Binary) + svd

# 1. Load Data

In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
gc.enable()

In [2]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [3]:
X_train.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype=int64)

 - clip to (0,20)

In [4]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)

In [5]:
X_train.shape

(10675632, 54)

- downcast to `float32`, `int32`

In [6]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [7]:
X_train = downcast_dtypes(X_train)
X_cv = downcast_dtypes(X_cv)
X_test = downcast_dtypes(X_test)

In [8]:
y_train = y_train.astype(np.float32)
y_cv = y_cv.astype(np.float32)

In [9]:
gc.collect()

42

- `fillna` with 0

In [10]:
X_train.fillna(0,inplace=True)
X_cv.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

## 1.1 Train /test reduce
We take only `date_block_num` between `12~32`
- memory issue
- time cost issue


In [12]:
train_dates = X_train.date_block_num

mask = train_dates >= 12 # mask=0 : all consider , mask>=12
X_train = X_train[mask]
y_train = y_train[mask]

train_dates = train_dates[mask]
test_dates = X_test.date_block_num
cv_dates = X_cv.date_block_num

In [13]:
X_train.shape

(6186922, 54)

In [14]:
gc.collect()

139

___

# 2. First Level Training

level2_date_block @ 31,32

In [18]:
train_dates = X_train.date_block_num

# level2_date_block = [27, 28, 29, 30, 31, 32]
level2_date_block = [31,32]
# level2_date_block = [32]
level2_mask = train_dates.isin(level2_date_block)
train_dates_level2 = train_dates[level2_mask]
train_y_level2 = y_train[level2_mask]

In [19]:
train_dates_level2.shape

(433191,)

## 2.0 custom grid search 
    - stolen from top20 kaggler

In [17]:
import copy
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

In [17]:
def customized_grid_search_cv_evaluate(clf, X_train, y_train, param_grid):
    
    params_list = list(ParameterGrid(param_grid))
    res_df = pd.DataFrame(data=np.zeros((len(params_list),3)), columns=['params', 'mean_test_score', 'std_test_score'])
    res_df.loc[:,'params'] = params_list
    
    for i, params in enumerate(params_list):
        scores = []
        print('Fitting: ', params, '...', end='\n\tscores= ')
        for cur_block in level2_date_block:
            copy_clf = copy.deepcopy(clf)
            original_param = copy_clf.get_params()
            original_param.update(params)
            copy_clf.set_params(**original_param) # update copy clf with trying params 
            
            copy_clf.fit(X_train[train_dates < cur_block].values, y_train[train_dates < cur_block])
            pred_y = copy_clf.predict(X_train[train_dates == cur_block].values)
            pred_y = np.clip(pred_y, 0., 20.)
            score = mean_squared_error(y_train[train_dates == cur_block], pred_y)**.5
            print('{:.5f} '.format(score), end='')
            scores.append(score)
            del copy_clf; gc.collect()
        
        print('')
        res_df.loc[i, 'mean_test_score'] = np.mean(scores)
        res_df.loc[i, 'std_test_score'] = np.std(scores)
        
    print('Fitting finished')
    res_df = res_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']
    
    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}, std: {:.4f}'.format(res_df.loc[0, 'mean_test_score'], res_df.loc[0, 'std_test_score']))
    del res_df, params_list; gc.collect()

In [18]:
def customized_grid_search_simple_holdout_evaluate(clf, X_train, y_train, param_grid, level2_date_block=[32]):
    
    params_list = list(ParameterGrid(param_grid))
    res_df = pd.DataFrame(data=np.zeros((len(params_list),2)), columns=['params', 'val_score'])
    res_df.loc[:,'params'] = params_list
    
    train_mask = train_dates < level2_date_block[0]
    validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1])
    for i, params in enumerate(params_list):
        print('Fitting: ', params, '...', end='\n\tscore= ')
        
        copy_clf = copy.deepcopy(clf)
        original_param = copy_clf.get_params()
        original_param.update(params)
        copy_clf.set_params(**original_param) # update copy clf with trying params 

        copy_clf.fit(X_train[train_mask].values, y_train[train_mask])
        pred_y = copy_clf.predict(X_train[validation_mask].values)
        pred_y = np.clip(pred_y, 0., 20.)
        score = mean_squared_error(y_train[validation_mask], pred_y)**.5
        print('{:.5f} '.format(score), end='\n')
        del copy_clf; gc.collect()

        res_df.loc[i, 'val_score'] = score
        
    print('Fitting finished')
    res_df = res_df.sort_values(by=['val_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']
    
    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}'.format(res_df.loc[0, 'val_score']))
    del res_df, params_list; gc.collect()

## 2.1 Optimization
- with `hyperopt` library 

In [20]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

- simple hold out method to optimize data

In [21]:
def objective(params):
    '''
        calculate rmse with simple holdout method 
        
        -- train on `train_dates < level2_date_block[0]` 
        -- validate on `leve2_date_block`  (date_block_num : 27-32)
    '''
    ## extract clf and new_params from `params`
    clf = params.get('clf')
    new_params = params.get('params') ## update params
    
    ## update params
    copy_clf = copy.deepcopy(clf)
    original_params = copy_clf.get_params()
    original_params.update(new_params)
    copy_clf.set_params(**original_params) # update copy clf with trying new_params     
    
    train_mask = train_dates < level2_date_block[0]
    
    ## simple hold out method
    ## validate on the level2_date_block
    validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1]) 
    
    copy_clf.fit(X_train[train_mask].values, y_train[train_mask]) ## train on <27 , validate on 27-32
        
    pred_y = copy_clf.predict(X_train[validation_mask].values)
    pred_y = np.clip(pred_y, 0., 20.) # clip to (0,20)
    rmse = mean_squared_error(y_train[validation_mask], pred_y)**.5
    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(new_params))
    
    return {'loss':rmse,'status':STATUS_OK}

### Linear model 
- Ridge

In [22]:
from sklearn.linear_model import Ridge, Lasso #featuring L2/L1 regularized linear models
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import copy

In [22]:
## try simple-hold-out grid serach
rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])
rg_params = {
    'rg__alpha': np.arange(0,1,0.2)
}

customized_grid_search_simple_holdout_evaluate(rg_clf,X_train,y_train,param_grid=rg_params,level2_date_block=level2_date_block)

Fitting:  {'rg__alpha': 0.0} ...
	score= 0.96775 
Fitting:  {'rg__alpha': 0.20000000000000001} ...
	score= 0.96510 
Fitting:  {'rg__alpha': 0.40000000000000002} ...
	score= 0.96600 
Fitting:  {'rg__alpha': 0.60000000000000009} ...
	score= 0.96738 
Fitting:  {'rg__alpha': 0.80000000000000004} ...
	score= 0.96910 
Fitting finished
Selected hyper-params: {'rg__alpha': 0.20000000000000001}
cv score: 0.9651


In [23]:
trials = Trials()
rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])
rg_params = {
    'clf':rg_clf,
    'params': {'rg__alpha': hp.uniform('rg__alpha', 0, 1)},
}

best = fmin(fn = objective,
            space = rg_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 1.00357 	params: {'rg__alpha': 0.04080360347886225}
rmse: 1.00317 	params: {'rg__alpha': 0.025410732521240376}
rmse: 1.00342 	params: {'rg__alpha': 0.034911349319165064}
rmse: 1.01571 	params: {'rg__alpha': 0.7118020171649374}
rmse: 1.01847 	params: {'rg__alpha': 0.9214224958287398}
rmse: 1.00790 	params: {'rg__alpha': 0.23520375805316973}
rmse: 1.01593 	params: {'rg__alpha': 0.7279100564363649}
rmse: 1.01644 	params: {'rg__alpha': 0.7650441014491024}
rmse: 1.01877 	params: {'rg__alpha': 0.945986312934316}
rmse: 1.01007 	params: {'rg__alpha': 0.35088290894136054}


In [24]:
best

{'rg__alpha': 0.025410732521240376}

>rmse:0.96508 
 - alpha : 0.193929

* linear svm

In [30]:
svm.LinearSVR()

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

- Lasso

In [25]:
trials = Trials()
lasso_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('lasso', Lasso(random_state=0))
])

lasso_params = {
    'clf':lasso_clf,
    'params': {
        'lasso__alpha': hp.uniform('lasso__alpha',0,0.1)
    },    
}

best = fmin(fn = objective,
            space = lasso_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 1.00416 	params: {'lasso__alpha': 0.009869086395721482}
rmse: 1.02777 	params: {'lasso__alpha': 0.0907611500382878}
rmse: 1.02902 	params: {'lasso__alpha': 0.09572609233971596}
rmse: 1.02897 	params: {'lasso__alpha': 0.09552780779733748}
rmse: 1.01632 	params: {'lasso__alpha': 0.050759576343417726}
rmse: 1.01541 	params: {'lasso__alpha': 0.047752693076762553}
rmse: 1.01148 	params: {'lasso__alpha': 0.03489274382258616}
rmse: 1.00366 	params: {'lasso__alpha': 0.006940518212747282}
rmse: 1.01811 	params: {'lasso__alpha': 0.05646730195223748}
rmse: 1.02094 	params: {'lasso__alpha': 0.06545538392838314}


In [26]:
best

{'lasso__alpha': 0.006940518212747282}

In [28]:
# lasso_params = {
#     'clf':lasso_clf,
#     'params': {
#         'lasso__alpha': 0.021311
#     },    
# }
# objective(lasso_params)

rmse: 0.96565 	params: {'lasso__alpha': 0.021311}


{'loss': 0.9656547294753367, 'status': 'ok'}

> rmse: 0.96565
    - alpha: 0.021311

### Tree based model 
- lightgbm

In [26]:
import lightgbm as lgb

In [27]:
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [None]:
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=4)

# 1. find an optimal n_esti for rather larger learning rate
lgb_params = {
    'boosting_type': ['gbdt', 'dart']
}


customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

In [35]:
lgb_clf = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.1, random_state=0, n_jobs =4)

# 1. find an optimal n_esti for rather larger learning rate
lgb_params = {
    'n_estimators': [10, 20, 40, 60, 80],
    'learning_rate': [0.2, 0.1, 0.075, 0.05]
}
customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

Fitting:  {'learning_rate': 0.2, 'n_estimators': 10} ...
	score= 0.90978 
Fitting:  {'learning_rate': 0.2, 'n_estimators': 20} ...
	score= 0.89607 
Fitting:  {'learning_rate': 0.2, 'n_estimators': 40} ...
	score= 0.89238 
Fitting:  {'learning_rate': 0.2, 'n_estimators': 60} ...
	score= 0.89089 
Fitting:  {'learning_rate': 0.2, 'n_estimators': 80} ...
	score= 0.89274 
Fitting:  {'learning_rate': 0.1, 'n_estimators': 10} ...
	score= 0.94879 
Fitting:  {'learning_rate': 0.1, 'n_estimators': 20} ...
	score= 0.91117 
Fitting:  {'learning_rate': 0.1, 'n_estimators': 40} ...
	score= 0.89477 
Fitting:  {'learning_rate': 0.1, 'n_estimators': 60} ...
	score= 0.89334 
Fitting:  {'learning_rate': 0.1, 'n_estimators': 80} ...
	score= 0.89265 
Fitting:  {'learning_rate': 0.075, 'n_estimators': 10} ...
	score= 0.97447 
Fitting:  {'learning_rate': 0.075, 'n_estimators': 20} ...
	score= 0.92518 
Fitting:  {'learning_rate': 0.075, 'n_estimators': 40} ...
	score= 0.90328 
Fitting:  {'learning_rate': 0.07

KeyboardInterrupt: 

In [None]:
lgb_clf = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.1, n_estimators=40,
                            random_state=0, n_jobs =4)



# 2. tune tree-specific params
lgb_params = {
    'max_depth': [4, 6, 8, 10, 12],
}
customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

In [None]:
lgb_clf = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.1, n_estimators=40,
                            max_depth=12, 
                            random_state=0, n_jobs =4)


# 2. tune tree-specific params
lgb_params = {
    'min_split_gain': [.0, .1, .2],
    'min_child_samples': [20, 40, 80], ## min_data_in_leaf
}
customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

In [None]:
lgb_clf = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.1, n_estimators=40,
                            max_depth=12, min_child_samples=20, min_split_gain=0.0,
                            random_state=0, n_jobs=4)

# 2. tune tree-specific params
lgb_params = {
    'subsample': [.2, .4, .6, .8, 1.],
    'colsample_bytree': [.2, .4, .6, .8, 1.]
}
customized_grid_search_simple_holdout_evaluate(lgb_clf, X_train, y_train, lgb_params)

> use hyperopt

In [27]:
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8)
trials = Trials()
# 1. find an optimal n_esti for rather larger learning rate
lgb_params = {
    'clf' : lgb_clf,
    'params': {
        'boosting_type': hp.choice('boosting_type',['gbdt', 'dart']), ## gbdt 
#         'boosting_type': []'gbdt',
        'num_leaves'   : hp.choice('num_leaves', np.arange(8,129,2,dtype=int)),
        'max_depth' : hp.choice("max_depth", np.arange(4, 17, dtype=int)),    
        'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1),
        'reg_lambda': hp.uniform('reg_lambda',0,1),
        'gamma' : hp.uniform('gamma', 0.1,0.5)
    }
    
}
best =fmin(fn = objective,
           space = lgb_params,
           algo = tpe.suggest,
           trials = trials,
           max_evals = 10
           )

rmse: 0.86781 	params: {'bagging_fraction': 0.8526347938264912, 'boosting_type': 'dart', 'feature_fraction': 0.3713832005356714, 'gamma': 0.3513437876057838, 'max_depth': 12, 'num_leaves': 36, 'reg_lambda': 0.3314183540297525}
rmse: 0.88221 	params: {'bagging_fraction': 0.7561365505609431, 'boosting_type': 'dart', 'feature_fraction': 0.6631183939697767, 'gamma': 0.10476140323208019, 'max_depth': 4, 'num_leaves': 30, 'reg_lambda': 0.6313148071988549}
rmse: 0.83194 	params: {'bagging_fraction': 0.9187387422237163, 'boosting_type': 'gbdt', 'feature_fraction': 0.8084801260455161, 'gamma': 0.25342233715729756, 'max_depth': 12, 'num_leaves': 36, 'reg_lambda': 0.9346223808836792}
rmse: 0.83162 	params: {'bagging_fraction': 0.8914728193944422, 'boosting_type': 'gbdt', 'feature_fraction': 0.6805724910142892, 'gamma': 0.49912184111068736, 'max_depth': 7, 'num_leaves': 106, 'reg_lambda': 0.4603614731830482}
rmse: 0.85809 	params: {'bagging_fraction': 0.8772487776479582, 'boosting_type': 'dart', '

In [30]:
trials.best_trial['result']

{'loss': 0.8316196011425457, 'status': 'ok'}

rmse: 0.83162 

params: {'bagging_fraction': 0.8914728193944422, 'boosting_type': 'gbdt', 'feature_fraction': 0.6805724910142892, 'gamma': 0.49912184111068736, 'max_depth': 7, 'num_leaves': 106, 'reg_lambda': 0.4603614731830482}


In [34]:
## use pre-train params from lgb_model1
lgb_clf = lgb.LGBMRegressor(random_state=0)
lgb_params = {
    'clf':lgb_clf,
    'params':{
        'bagging_fraction': 0.9568845079308161,
        'bossting_type': 'gbdt',
        'feature_fraction': 0.6203248801718259,
        'gamma': 0.39624896070423066,
        'max_depth': 12,
        'metric': 'rmse',
        'num_leaves': 64,
        'objective': 'regression',
        'reg_lambda': 0.38856229720270463
    }
}
objective(lgb_params)

rmse: 0.81909 	params: {'bagging_fraction': 0.9568845079308161, 'bossting_type': 'gbdt', 'feature_fraction': 0.6203248801718259, 'gamma': 0.39624896070423066, 'max_depth': 12, 'metric': 'rmse', 'num_leaves': 64, 'objective': 'regression', 'reg_lambda': 0.38856229720270463}


{'loss': 0.81908550940125358, 'status': 'ok'}

> rmse: 0.81909
    - {'bagging_fraction': 0.9568845079308161, 'bossting_type': 'gbdt', 'feature_fraction': 0.6203248801718259, 'gamma': 0.39624896070423066, 'max_depth': 12, 'metric': 'rmse', 'num_leaves': 64, 'objective': 'regression', 'reg_lambda': 0.38856229720270463}

In [33]:
best

{'bagging_fraction': 0.9194917662233979,
 'boosting_type': 0,
 'feature_fraction': 0.4682043014298397,
 'gamma': 0.3957261011649602,
 'max_depth': 8,
 'num_leaves': 37,
 'reg_lambda': 0.14857448039561033}

> rmse:0.81525

    > bagging_fraction:`0.9789`, 
    > feature_fraction:`0.759`,
    > gamma : `0.1517`,
    > reg_labmda : `0.6813`
    > max_depth : `12`,
    > num_leaves : `110`,
    > bossting_type : `gbdt`,
    


- random forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [33]:
rf_clf = RandomForestRegressor(min_samples_split=300, min_samples_leaf=30, max_features='sqrt',n_estimators=50,
                               max_depth=4, n_jobs=4, criterion='mse',random_state=0)

rf_clf.estimators_=1 # need to set a value otherwise rise AttributeError in hyperopt
rf_params = {
    'clf' : rf_clf,
    'params' : {
        'min_samples_split' : hp.choice('min_samples_split',np.arange(100,500,100)),
        'min_samples_leaf': hp.choice('min_samples_leaf',np.arange(30,100,10)),
        'n_estimators' : hp.choice('n_estimators', np.arange(50,301,50)),
        'max_depth': hp.choice('max_depth',[4, 8, 12, 16])        
    }    
}

best =fmin(fn = objective,
           space = rf_params,
           algo = tpe.suggest,
           max_evals = 10
           )

rmse: 0.85051 	params: {'max_depth': 16, 'min_samples_leaf': 30, 'min_samples_split': 400, 'n_estimators': 150}
rmse: 0.88084 	params: {'max_depth': 8, 'min_samples_leaf': 30, 'min_samples_split': 200, 'n_estimators': 300}
rmse: 0.92693 	params: {'max_depth': 4, 'min_samples_leaf': 30, 'min_samples_split': 400, 'n_estimators': 250}
rmse: 0.86244 	params: {'max_depth': 12, 'min_samples_leaf': 50, 'min_samples_split': 400, 'n_estimators': 200}
rmse: 0.92699 	params: {'max_depth': 4, 'min_samples_leaf': 80, 'min_samples_split': 200, 'n_estimators': 250}
rmse: 0.84882 	params: {'max_depth': 16, 'min_samples_leaf': 40, 'min_samples_split': 100, 'n_estimators': 300}
rmse: 0.85072 	params: {'max_depth': 16, 'min_samples_leaf': 70, 'min_samples_split': 100, 'n_estimators': 300}
rmse: 0.92655 	params: {'max_depth': 4, 'min_samples_leaf': 50, 'min_samples_split': 200, 'n_estimators': 200}
rmse: 0.92616 	params: {'max_depth': 4, 'min_samples_leaf': 70, 'min_samples_split': 200, 'n_estimators': 10

> rmse: `0.82801`
 - max_depth = `16`
 - min_samples_leaf = `50`
 - min_samples_split = `200`
 - n_estimators = `300`

### KNN  model

- KNN: Prediction and Neighbor distances features

 Time cost a lot(>2 days), I give it up

In [None]:
from sklearn.neighbors import KNeighborsRegressor
KNeighborsRegressor()

In [None]:
trials = Trials()
knn_clf = KNeighborsRegressor(algorithm='auto', leaf_size=100, metric='minkowski',
                              metric_params=None, n_jobs=4, n_neighbors=5, p=1,
                              weights='uniform')
knn_params = {
    'clf' : knn_clf,
    'params' :{
        'p': hp.choice('p',[1,2]),
        'weights': hp.choice('weights',['uniform', 'distance']),
        'n_neighbors': hp.choice('n_neighbors', np.arange(10, 101, 10,dtype=int))
    }
}
best = fmin(fn = objective,
            space = knn_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 5
           )   

> knn: 
    - n_neighbors = `15`
    - weights = `distance`

## Kmeans
- mini-batch kmeans cast to low dimensional features

In [34]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import calinski_harabaz_score

In [35]:
X_cv.shape

(238172, 54)

In [36]:
merge = pd.concat([X_train, X_cv,X_test], ignore_index=True)

In [37]:
merge.shape

(6639294, 54)

In [38]:
merge.tail()

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_id_avg_item_price_lag_1,item_id_sum_item_cnt_day_lag_1,item_id_avg_item_cnt_day_lag_1,shop_id_avg_item_price_lag_1,shop_id_sum_item_cnt_day_lag_1,shop_id_avg_item_cnt_day_lag_1,...,item_id_avg_item_price_lag_12,item_id_sum_item_cnt_day_lag_12,item_id_avg_item_cnt_day_lag_12,shop_id_avg_item_price_lag_12,shop_id_sum_item_cnt_day_lag_12,shop_id_avg_item_cnt_day_lag_12,item_category_id_avg_item_price_lag_12,item_category_id_sum_item_cnt_day_lag_12,item_category_id_avg_item_cnt_day_lag_12,item_cnt_month_lag_12
6639289,45,18454,34,55,99.0,2.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639290,45,16188,34,64,1359.0,1.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639291,45,15757,34,55,229.0,5.0,1.0,1176.795898,702.0,1.125,...,199.0,9.0,1.0,1273.734375,1251.0,1.268763,297.181396,9809.0,1.041406,0.0
6639292,45,19648,34,40,89.099998,2.0,1.0,1176.795898,702.0,1.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639293,45,969,34,37,198.0,3.0,1.0,1176.795898,702.0,1.125,...,549.0,6.0,1.0,1273.734375,1251.0,1.268763,457.671997,5185.0,1.07662,0.0


In [39]:
for c in np.arange(4,16,2):
    print('n_clusters =', c, end=' score= ')
    km = Pipeline([
        ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
        ('kmean', MiniBatchKMeans(n_clusters=c, max_no_improvement=30, 
                                  verbose=0, batch_size=1000000, random_state=0))
    ])
#     mini_kmean = MiniBatchKMeans(n_clusters=8, batch_size=10000, verbose=2, random_state=0)
    labels = km.fit_predict(merge)
    print(calinski_harabaz_score(merge, labels))

n_clusters = 4 score= 341409.019388
n_clusters = 6 score= 263314.025485
n_clusters = 8 score= 809444.498912
n_clusters = 10 score= 723085.429986
n_clusters = 12 score= 744213.776063
n_clusters = 14 score= 632340.335443


> `n_cluster = 8` ... highest score means better clustering

## 2.2 Stacking 

In [40]:
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0)

In [41]:
merge_dates.shape

(6639294,)

In [42]:
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0)
print(merge_y.shape)

(6639294,)


In [43]:
stage2_train_dates = merge_dates[merge_dates>=level2_date_block[0]] ## 27 - 34
stage2_train_mask = (stage2_train_dates <= level2_date_block[-1]) # train on : 27-32
stage2_valid_mask = (stage2_train_dates == 33) ## validate on : 33
stage2_final_train_mask = (stage2_train_dates <= 33) ## 
stage2_test_mask = (stage2_train_dates==34)
stage2_train_y = merge_y[merge_dates>=level2_date_block[0]] ## 27-34

In [44]:
supervised_models = {
    'rg': Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('rg', Ridge(alpha=0.025, fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
            ]),
    'lasso':Pipeline([
                    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                    ('lasso', Lasso(alpha=0.0094, normalize=False, fit_intercept=True, max_iter=2000, random_state=0))
                ]),
    'rf': RandomForestRegressor(n_estimators=300,
                               min_samples_split=100, min_samples_leaf=40, max_features='sqrt',
                               max_depth=16, n_jobs=4, criterion='mse', random_state=0),    
    'lgbm': lgb.LGBMRegressor(boosting_type='gbdt', 
                                max_depth=7,
                                num_leaves=106,
                                bagging_fraction=0.8914728193944422,
                                feature_fraction=0.6805724910142892,
                                reg_lambda = 0.4603614731830482,
                                gamma = 0.49912184111068736,
                                metric='rmse',                            
#                               subsample=.55, colsample_bytree=.75, ## 
                                n_jobs=4,
                                random_state=0)
}

 
# knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                               metric_params=None, n_jobs=4, n_neighbors=15, p=1,
#                               weights='distance')

mini_kmean = Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('kmean', MiniBatchKMeans(n_clusters=8, max_no_improvement=30, 
                                          verbose=0, batch_size=1000000, random_state=0))
            ])

In [45]:
stage2_train_y.shape

(885563,)

In [49]:
def stage2_feature_generate():
    
#     print('Training supervised models')
    
    all_preds = []
    for model_name, model in supervised_models.items():
        
        print(model_name, end=': ')
        preds = []
        for cur_block in np.arange(31,35,1):#np.arange(27, 35, 1):
            X_tr = merge[merge_dates < cur_block].values ## ndarray
            y_tr = merge_y[merge_dates < cur_block]
            X_test = merge[merge_dates == cur_block].values
            
            copy_clf = copy.deepcopy(model)
            copy_clf.fit(X_tr, y_tr)
            pred_test = copy_clf.predict(X_test)
            pred_test = np.clip(pred_test, 0., 20.)
            preds.append(pred_test)
            print(cur_block, end=' ')

        preds = np.concatenate(preds, axis=0)
        preds = preds.reshape((len(preds), 1))
        all_preds.append(preds)
    
        print('')
        
    #knn:
#     print('knn')
#     X_tr = merge[merge_dates < level2_date_block[0]].values
#     y_tr = merge_y[merge_dates < level2_date_block[0]]

#     X_test = merge[merge_dates >= level2_date_block[0]].values
#     knn.fit(X_tr, y_tr)
#     knn_pred = knn.predict(X_test)
#     knn_pred = np.clip(knn_pred, 0., 20.).reshape((len(knn_pred), 1))
#     knn_dist = knn.kneighbors(X_test, return_distance=True)[0] # distances
#     print(np.array(knn_dist).shape)
#     all_preds.append(knn_pred)
#     all_preds.append(knn_dist)
    
    # kmeans
    X_test = merge[merge_dates >= level2_date_block[0]].values # >=27
    mini_kmean.fit(merge.values)
    kmean_pred = mini_kmean.predict(X_test)
    kmean_dist = mini_kmean.transform(X_test)
    
    kmean_pred = np.array(kmean_pred).reshape((len(kmean_pred),1))
    all_preds.append(kmean_pred)
    all_preds.append(kmean_dist)
    
    return np.concatenate(all_preds, axis=1)

In [50]:
stage2_data = stage2_feature_generate()

rg: 31 32 33 34 
lasso: 31 32 33 34 
rf: 31 32 33 34 
lgbm: 31 32 33 34 


In [51]:
stage2_data.shape

(885563, 13)

In [52]:
columns = ['rg', 'lasso', 'rf', 'lgbm'] ##'knn']
# columns.extend(['knn_dist_'+str(i) for i in range(15)])
columns.extend(['kmean_dist_label'])
columns.extend(['kmean_dist_'+str(i) for i in range(8)])
print(len(columns))
columns

13


['rg',
 'lasso',
 'rf',
 'lgbm',
 'kmean_dist_label',
 'kmean_dist_0',
 'kmean_dist_1',
 'kmean_dist_2',
 'kmean_dist_3',
 'kmean_dist_4',
 'kmean_dist_5',
 'kmean_dist_6',
 'kmean_dist_7']

In [53]:
stage2_df = pd.DataFrame(data=stage2_data, columns=columns)

In [54]:
stage2_df.head()

Unnamed: 0,rg,lasso,rf,lgbm,kmean_dist_label,kmean_dist_0,kmean_dist_1,kmean_dist_2,kmean_dist_3,kmean_dist_4,kmean_dist_5,kmean_dist_6,kmean_dist_7
0,0.457809,0.51821,1.104751,1.234071,1.0,8.857702,8.712146,6.878428,7.236687,160.683082,8.372746,10.067307,32.780615
1,0.990932,0.951176,1.78617,2.333394,5.0,12.80689,13.463366,11.329752,7.886795,162.224312,10.882863,14.441723,33.89432
2,0.473575,0.448425,0.640284,0.663168,5.0,9.285941,10.126022,8.115691,6.480298,161.687453,7.767159,11.381747,32.819151
3,0.421733,0.397657,0.457851,0.487791,6.0,8.528922,8.592862,6.444938,6.3499,161.312547,7.830429,9.995115,32.689497
4,0.406524,0.390211,0.397525,0.391166,6.0,8.568154,8.632067,6.464528,6.335744,161.346125,7.866945,10.024922,32.699838


In [55]:
stage2_io = pd.HDFStore('../data/feat/stage2_data.h5')
stage2_io['stage2_df'] = stage2_df
stage2_io.close()

In [None]:
## load 
# with pd.HDFStore('../data/feat/stage2_data.h5') as stage2_io:
#     print(stage2_io.keys())
#     stage2_df = stage2_io['stage2_df']

______

# 3. Extract Text Features

In [56]:
with pd.HDFStore('../data/feat/text_feats.h5') as text_io:
    print(text_io.keys())
    X_text_feats_test = text_io['X_text_feats_test']
    X_text_feats_cv = text_io['X_text_feats_cv']
    X_text_feats_train = text_io['X_text_feats_train']

['/X_text_feats', '/X_text_feats_cv', '/X_text_feats_test', '/X_text_feats_train']


In [57]:
X_text_feats_train.shape

(10675632, 4)

In [58]:
mask.value_counts() ## reduce memory use only num_date_block >= 12

True     6186922
False    4488710
Name: date_block_num, dtype: int64

In [59]:
X_text_feats_train.shape

(10675632, 4)

In [60]:
train_text_df = pd.concat([X_text_feats_train[mask],X_text_feats_cv])
test_text_df = X_text_feats_test

In [61]:
train_text_df.shape[0] + test_text_df.shape[0]

6639294

In [62]:
train_texts = train_text_df['item_name'].map(str) + ' ' + train_text_df['item_category_name'].map(str) + ' ' + train_text_df['shop_name'].map(str)
test_texts = test_text_df['item_name'].map(str) + ' ' + test_text_df['item_category_name'].map(str) + ' ' + test_text_df['shop_name'].map(str)

In [63]:
all_texts = pd.Series(np.concatenate([train_texts, test_texts], axis=0))
del train_text_df, test_text_df, train_texts, test_texts; gc.collect()
all_texts.shape

(6639294,)

In [64]:
del X_text_feats_train,X_text_feats_cv,X_text_feats_test; gc.collect()

21

### 1. TFIDF - Truncated SVD

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(lowercase=False, ngram_range=(1,2))

In [66]:
tv_features = tv.fit_transform(all_texts)
tv_features.shape

(6639294, 63999)

In [67]:
from sklearn.decomposition import TruncatedSVD

In [68]:
svd = TruncatedSVD(n_components=20, random_state=0)
tv_svd_features = svd.fit_transform(tv_features)

In [69]:
import sys
sys.getsizeof(tv_svd_features)/(1024*1024*1024)

0.9893320053815842

### 2. TF-IDF(binarize)- Truncated SVD

In [70]:
tvb_features = tv_features.astype(bool).astype(float)
del tv_features; gc.collect()

0

In [71]:
svd = TruncatedSVD(n_components=20, random_state=0)
tvb_svd_features = svd.fit_transform(tvb_features)
tvb_svd_features.shape

(6639294, 20)

In [72]:
sys.getsizeof(tvb_svd_features)/(1024*1024*1024)

0.9893320053815842

### 3. Hasing + Truncated SVD

In [73]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(ngram_range=(1, 2), lowercase=False)

In [None]:
hv_features = hv.fit_transform(all_texts).tocsr()
hv_features.shape

(6639294, 1048576)

In [None]:
svd = TruncatedSVD(n_components=20, random_state=0)
hv_svd_features = svd.fit_transform(hv_features)
hv_svd_features.shape

In [None]:
sys.getsizeof(hv_svd_features)/(1024*1024*1024)

### 4. Hasing(binarize) + Truncated SVD

In [None]:
hvb_features = hv_features.astype(bool).astype(float)
del hv_features; gc.collect()

In [None]:
svd = TruncatedSVD(n_components=20, random_state=0)
hvb_svd_features = svd.fit_transform(hvb_features)
hvb_svd_features.shape

In [None]:
print(sys.getsizeof(hvb_svd_features)/(1024*1024*1024))
del hvb_features; gc.collect()

### 5. Stack them

In [None]:
text_features = np.concatenate([tv_svd_features, tvb_svd_features, hv_svd_features, hvb_svd_features], axis=1)
text_features.shape

In [None]:
sys.getsizeof(text_features)/(1024*1024*1024)

In [None]:
text_features_df = pd.DataFrame(data=text_features, columns=['text_f_'+str(i) for i in range(80)])

text_io = pd.HDFStore('../data/feat/text_feat_df.h5') 
text_io['text_feats_df'] = text_features_df
text_io.close()

In [None]:
del all_texts,hv,hv_svd_features,hvb_svd_features, km, text_features;gc.collect()

## Generate stage2 text features

In [195]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### reload data 

In [14]:
with pd.HDFStore('../data/feat/text_feat_df.h5') as text_io:
    text_features_df = text_io['text_feats_df'] ## 12-34

In [15]:
merge = pd.concat([X_train, X_cv,X_test], ignore_index=True)
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0) ## 12-34

In [16]:
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0) ## 12-34

In [17]:
merge_y.shape

(11128004,)

In [18]:
merge.shape

(11128004, 54)

In [19]:
train_dates.shape # 

(10675632,)

In [20]:
# level2_date_block = [27,28,29,30,31,32]
level2_date_block = [31,32]
stage2_train_dates = merge_dates[merge_dates>=level2_date_block[0]] # 27-34
stage2_train_mask = (stage2_train_dates <= level2_date_block[-1]) # 27-32
stage2_valid_mask = (stage2_train_dates == 33) # 33
stage2_final_train_mask = (stage2_train_dates <= 33) # 27-33
stage2_test_mask = (stage2_train_dates==34) # 34
stage2_train_y = merge_y[merge_dates>=level2_date_block[0]] # 27-34

In [21]:
text_features_df.head()

Unnamed: 0,text_f_0,text_f_1,text_f_2,text_f_3,text_f_4,text_f_5,text_f_6,text_f_7,text_f_8,text_f_9,...,text_f_70,text_f_71,text_f_72,text_f_73,text_f_74,text_f_75,text_f_76,text_f_77,text_f_78,text_f_79
0,0.277484,-0.173652,-0.06324,-0.090207,-0.023441,-0.013428,-0.019452,0.007843,-0.021224,0.023921,...,-0.03381,-0.234008,-0.110437,-0.110263,-0.022642,-0.003122,0.056965,0.105615,0.012552,0.030759
1,0.039884,0.02749,0.048064,-0.010053,-0.005663,-0.008881,-0.013801,0.008955,-0.008782,0.018617,...,0.050828,-0.241337,-0.108003,-0.252701,-0.059449,0.121994,-0.172504,0.083715,-0.342954,0.62724
2,0.029936,0.017483,0.032786,-0.004073,-0.004317,-0.006618,-0.011949,0.008032,-0.008361,0.019091,...,0.053172,-0.237839,-0.102671,-0.253724,-0.060433,0.113566,-0.16443,0.098105,-0.327809,0.576223
3,0.065214,0.058522,0.142886,-0.053096,-0.019396,-0.012969,-0.012155,0.005954,-0.010409,0.008706,...,0.017964,-0.241615,-0.115704,-0.12515,-0.025767,0.021801,-0.000432,0.083451,-0.112115,0.189098
4,0.043049,0.019077,0.038894,0.023213,-0.013758,-0.010637,-0.013122,0.005971,-0.012691,0.01775,...,0.027809,-0.240677,-0.094627,-0.177729,-0.043124,0.047342,-0.066383,0.108523,-0.167366,0.356736


In [21]:
X_train.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,text_features_df.iloc[0:X_train.shape[0],:]], axis=1)
train = X_train
train.reset_index(drop=True,inplace=True)

In [22]:
# X_train.shape ## 12-32
# y_train.shape ## 

(10675632, 134)

In [25]:
test_text_features_df = text_features_df.iloc[-test_dates.shape[0]:,:].reset_index(drop=True)
# test = test_text_features_df
test = pd.concat([X_test.reset_index(drop=True),test_text_features_df],axis=1)

### hyper params search

In [26]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import copy

In [27]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

- simple hold out method to optimize data

In [28]:
train_dates.reset_index(drop=True,inplace=True) ##reset index
y_train.reset_index(drop=True,inplace=True)

In [29]:
def objective(params):
    
    clf = params.get('clf')
    new_params = params.get('params') ## update params

    copy_clf = copy.deepcopy(clf)
    original_params = copy_clf.get_params()
    original_params.update(new_params)
    copy_clf.set_params(**original_params) # update copy clf with trying new_params     
    
    ## simple hold out 
    ## train_dates --- 12 - 32
    
    
    train_mask = train_dates < level2_date_block[0] #12-26
    ## simple hold out 
    validation_mask = (train_dates >= level2_date_block[0]) & (train_dates <= level2_date_block[-1]) ## validate on the level2_date_block
    
    copy_clf.fit(X_train[train_mask].values, y_train[train_mask]) #  
    
    pred_y = copy_clf.predict(X_train[validation_mask].values) ## 27-32
    pred_y = np.clip(pred_y, 0., 20.) # clip to (0,20)
    rmse = mean_squared_error(y_train[validation_mask], pred_y)**.5
    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(new_params))
    
    return {'loss':rmse,'status':STATUS_OK}

In [30]:
trials = Trials()

rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])

rg_params = {
    'clf' : rg_clf,
    'params':{
        'rg__alpha':  hp.uniform('rg__alpha', 0., 1.0)
    }
    
}
best = fmin(fn = objective,
            space = rg_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 1.06085 	params: {'rg__alpha': 0.8913060076641278}
rmse: 1.06153 	params: {'rg__alpha': 0.3635415790107782}
rmse: 1.06142 	params: {'rg__alpha': 0.12722171982882946}


KeyboardInterrupt: 

In [40]:
trials.best_trial['result']

{'loss': 0.961002177524, 'status': 'ok'}

> rmse: 0.961002
    - alpha: `0.234133853`
    - 

## Generate stage2 features with TEXT

In [47]:
merge.shape

(11128004, 54)

In [48]:
text_features_df.shape

(11128004, 80)

In [49]:
merge = pd.concat([merge,text_features_df],axis=1)

In [50]:
gc.collect()

4261

In [51]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model = Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('rg', Ridge(alpha=0.23413385351122218, fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
            ])
model.fit(merge[merge_dates < 27].values, merge_y[merge_dates < 27])  

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)), ('rg', Ridge(alpha=0.23413385351122218, copy_X=True, fit_intercept=True,
   max_iter=2000, normalize=True, random_state=0, solver='auto', tol=0.001))])

In [52]:
stage2_data_text = model.predict(merge[merge_dates >= 27].values)
stage2_data_text = np.clip(stage2_data_text, 0., 20.)

In [53]:
stage2_data_text.shape

(1828564,)

## save to npy 

In [54]:
np.save('../data/feat/stage2_data_text.npy',stage2_data_text)

'''temp = np.load('../data/feat/stage2_data_text.npy')'''

"temp = np.load('../data/feat/stage2_data_text.npy')"