# First Level Training 

1. Load pre-processing data
    - downcast to float32, int32 
2. Train the first level models
    - validate model with simple hold out method
    - several models
        - linear 
        - tree based
        - knn
    - generate level-2 features by stacking
3. Text features extraction 

# 1. Load Data

In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
gc.enable()

In [2]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [3]:
X_train.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype=int64)

 - clip to (0,20)

In [4]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)

In [5]:
X_train.shape

(10675632, 54)

- downcast to `float32`, `int32`

In [6]:
def downcast_dtypes(df):
    '''
        Narrow the 64-bit numeric columns of `df`.

        Columns whose dtype is `float64` are cast to `float32` and columns
        whose dtype is `int64` are cast to `int32`; every other column is
        left as it is. The converted columns are written back into `df`
        and the same `df` object is returned.
    '''
    # (dtype name to look for, dtype to cast to)
    narrowing = (("float64", np.float32), ("int64", np.int32))

    for wide_name, narrow_type in narrowing:
        wide_cols = [name for name in df.columns if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

In [7]:
X_train = downcast_dtypes(X_train)
X_cv = downcast_dtypes(X_cv)
X_test = downcast_dtypes(X_test)

In [8]:
y_train = y_train.astype(np.float32)
y_cv = y_cv.astype(np.float32)

In [9]:
gc.collect()

42

- `fillna` with 0

In [10]:
X_train.fillna(0,inplace=True)
X_cv.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

## 1.1 Train /test reduce
Optionally keep only the rows with `date_block_num` in `12~32`, to reduce:
- memory usage
- training time

However, training on the smaller dataset did not improve the RMSE.

In [11]:
# Month index (`date_block_num`) of every training row.
train_dates = X_train.date_block_num

# Row filter on the month index. A threshold of 0 keeps every row; raise it
# (e.g. to 12) to keep only the later months described in the section text.
mask = train_dates >= 0
X_train = X_train[mask]
y_train = y_train[mask]

# Keep the per-row month vectors aligned with the (possibly reduced) frames.
train_dates = train_dates[mask]
test_dates = X_test.date_block_num
cv_dates = X_cv.date_block_num

In [12]:
X_train.shape

(10675632, 54)

In [13]:
gc.collect()

27

___

# 2. First Level Training

In [14]:
train_dates = X_train.date_block_num

In [15]:
level2_date_block = [27, 28, 29, 30, 31, 32]
# level2_date_block = [32]
level2_mask = train_dates.isin(level2_date_block)
train_dates_level2 = train_dates[level2_mask]
train_y_level2 = y_train[level2_mask]

In [16]:
train_dates_level2.shape

(1376192,)

## 2.0 Custom grid search
    - adapted from a top-20 Kaggle solution

In [17]:
import copy
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

In [18]:
def customized_grid_search_cv_evaluate(clf, X_train, y_train, param_grid):
    '''
        Grid-search `param_grid` with an expanding-window validation.

        For every parameter combination and every month `b` in the global
        `level2_date_block`, a deep copy of `clf` is fitted on the rows with
        `train_dates < b` and scored on the rows with `train_dates == b`.
        The score is the RMSE of the predictions clipped to [0, 20].

        Relies on the notebook globals `train_dates` and `level2_date_block`;
        `train_dates` is used to index `X_train` and `y_train`, so it must be
        aligned with them.

        Parameters
        ----------
        clf : estimator exposing get_params / set_params / fit / predict
        X_train : DataFrame of features
        y_train : target values, indexable by a boolean mask
        param_grid : dict (or list of dicts) accepted by `ParameterGrid`

        Returns
        -------
        dict
            The parameter combination with the lowest mean RMSE
            (ties broken by the lowest standard deviation).
    '''
    params_list = list(ParameterGrid(param_grid))
    mean_scores = []
    std_scores = []

    for params in params_list:
        scores = []
        print('Fitting: ', params, '...', end='\n\tscores= ')
        for cur_block in level2_date_block:
            fit_rows = train_dates < cur_block
            eval_rows = train_dates == cur_block

            copy_clf = copy.deepcopy(clf)
            copy_clf.set_params(**params)  # apply the candidate on top of clf's own settings

            # Slice a fresh array for every fit/predict: an estimator that
            # works in place (e.g. StandardScaler(copy=False)) may overwrite it.
            copy_clf.fit(X_train[fit_rows].values, y_train[fit_rows])
            pred_y = copy_clf.predict(X_train[eval_rows].values)
            pred_y = np.clip(pred_y, 0., 20.)
            score = mean_squared_error(y_train[eval_rows], pred_y)**.5
            print('{:.5f} '.format(score), end='')
            scores.append(score)
            del copy_clf; gc.collect()

        print('')
        mean_scores.append(np.mean(scores))
        std_scores.append(np.std(scores))

    print('Fitting finished')
    # Build the table in one go so the `params` column holds the dicts as
    # objects instead of being assigned over a float column.
    res_df = pd.DataFrame({
        'params': params_list,
        'mean_test_score': mean_scores,
        'std_test_score': std_scores,
    })
    res_df = res_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']

    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}, std: {:.4f}'.format(res_df.loc[0, 'mean_test_score'], res_df.loc[0, 'std_test_score']))
    del res_df, params_list; gc.collect()

    return best_params

In [19]:
def customized_grid_search_simple_holdout_evaluate(clf, X_train, y_train, param_grid, level2_date_block=None):
    '''
        Grid-search `param_grid` with a single hold-out split.

        A deep copy of `clf` is fitted, for each parameter combination, on
        the rows with `train_dates < level2_date_block[0]` and scored on the
        rows with `level2_date_block[0] <= train_dates <= level2_date_block[-1]`.
        The score is the RMSE of the predictions clipped to [0, 20].

        Relies on the notebook global `train_dates`, which is used to index
        `X_train` and `y_train` and must therefore be aligned with them.

        Parameters
        ----------
        clf : estimator exposing get_params / set_params / fit / predict
        X_train : DataFrame of features
        y_train : target values, indexable by a boolean mask
        param_grid : dict (or list of dicts) accepted by `ParameterGrid`
        level2_date_block : sequence of month indices used for validation;
            only its first and last elements are read. Defaults to `[32]`.

        Returns
        -------
        dict
            The parameter combination with the lowest hold-out RMSE.
    '''
    # A None sentinel avoids sharing one mutable default list across calls.
    if level2_date_block is None:
        level2_date_block = [32]

    params_list = list(ParameterGrid(param_grid))
    val_scores = []

    first_block, last_block = level2_date_block[0], level2_date_block[-1]
    train_mask = train_dates < first_block
    validation_mask = (train_dates >= first_block) & (train_dates <= last_block)

    for params in params_list:
        print('Fitting: ', params, '...', end='\n\tscore= ')

        copy_clf = copy.deepcopy(clf)
        copy_clf.set_params(**params)  # apply the candidate on top of clf's own settings

        # Slice a fresh array for every fit/predict: an estimator that works
        # in place (e.g. StandardScaler(copy=False)) may overwrite it.
        copy_clf.fit(X_train[train_mask].values, y_train[train_mask])
        pred_y = copy_clf.predict(X_train[validation_mask].values)
        pred_y = np.clip(pred_y, 0., 20.)
        score = mean_squared_error(y_train[validation_mask], pred_y)**.5
        print('{:.5f} '.format(score), end='\n')
        del copy_clf; gc.collect()

        val_scores.append(score)

    print('Fitting finished')
    # Build the table in one go so the `params` column holds the dicts as
    # objects instead of being assigned over a float column.
    res_df = pd.DataFrame({'params': params_list, 'val_score': val_scores})
    res_df = res_df.sort_values(by=['val_score'], ascending=True).reset_index(drop=True)
    best_params = res_df.loc[0, 'params']

    print('Selected hyper-params:', best_params)
    print('cv score: {:.4f}'.format(res_df.loc[0, 'val_score']))
    del res_df, params_list; gc.collect()

    return best_params

## 2.1 Optimization
- with `hyperopt` library 

In [22]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

- the hyperopt objective scores each candidate with a simple hold-out split

In [21]:
def objective(params):
    '''
        Hyperopt objective: hold-out RMSE of one candidate configuration.

        `params` is a dict with two entries:
            -- 'clf'    : the estimator to copy and configure
            -- 'params' : the parameter values to overlay on the copy

        The copy is fitted on the rows with `train_dates < level2_date_block[0]`
        and evaluated on the rows whose month lies between the first and the
        last entry of `level2_date_block` (inclusive). Predictions are
        clipped to [0, 20] before the RMSE is computed.

        Uses the notebook globals `X_train`, `y_train`, `train_dates` and
        `level2_date_block`. Returns the dict hyperopt expects
        ('loss' and 'status').
    '''
    base_model = params.get('clf')
    new_params = params.get('params')

    # Overlay the candidate values on the copied estimator's current settings.
    candidate = copy.deepcopy(base_model)
    merged_settings = candidate.get_params()
    merged_settings.update(new_params)
    candidate.set_params(**merged_settings)

    first_block = level2_date_block[0]
    last_block = level2_date_block[-1]
    fit_rows = train_dates < first_block
    holdout_rows = (train_dates >= first_block) & (train_dates <= last_block)

    candidate.fit(X_train[fit_rows].values, y_train[fit_rows])

    clipped_pred = np.clip(candidate.predict(X_train[holdout_rows].values), 0., 20.)
    rmse = mean_squared_error(y_train[holdout_rows], clipped_pred)**.5

    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(new_params))

    result = {'loss': rmse}
    result['status'] = STATUS_OK
    return result

### Linear model 
- Ridge

In [24]:
from sklearn.linear_model import Ridge, Lasso #featuring L2/L1 regularized linear models
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import copy

In [26]:
trials = Trials()
rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])
rg_params = {
    'clf':rg_clf,
    'params': {'rg__alpha': hp.uniform('rg__alpha', 0., 1)},
}

best = fmin(fn = objective,
            space = rg_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 0.96856 	params: {'rg__alpha': 0.7394366826551112}
rmse: 0.96940 	params: {'rg__alpha': 0.8323212952211707}
rmse: 0.96636 	params: {'rg__alpha': 0.45853883904970294}
rmse: 0.96528 	params: {'rg__alpha': 0.2586129746939999}
rmse: 0.96619 	params: {'rg__alpha': 0.43105622957293344}
rmse: 0.96580 	params: {'rg__alpha': 0.36485344088561267}
rmse: 0.96796 	params: {'rg__alpha': 0.6710648897251851}
rmse: 0.96687 	params: {'rg__alpha': 0.5323310197084163}
rmse: 0.96541 	params: {'rg__alpha': 0.08928462490891498}
rmse: 0.96785 	params: {'rg__alpha': 0.6575017014736089}


In [27]:
best

{'rg__alpha': 0.2586129746939999}

>rmse:0.96508 
 - alpha : 0.193929

- Lasso

In [30]:
trials = Trials()
lasso_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('lasso', Lasso(random_state=0))
])

lasso_params = {
    'clf':lasso_clf,
    'params': {
        'lasso__alpha': hp.uniform('lasso__alpha',0,0.1)
    },    
}

best = fmin(fn = objective,
            space = lasso_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 0.96888 	params: {'lasso__alpha': 0.03497042093162755}
rmse: 0.96732 	params: {'lasso__alpha': 0.029280833173449317}
rmse: 0.97919 	params: {'lasso__alpha': 0.07070642751102232}
rmse: 0.98575 	params: {'lasso__alpha': 0.09480172401964848}
rmse: 0.97813 	params: {'lasso__alpha': 0.06642684123672093}
rmse: 0.96817 	params: {'lasso__alpha': 0.03247322048195526}
rmse: 0.96565 	params: {'lasso__alpha': 0.021311379705558964}
rmse: 0.97563 	params: {'lasso__alpha': 0.05706225979785251}
rmse: 0.98443 	params: {'lasso__alpha': 0.08997042098849649}
rmse: 0.97709 	params: {'lasso__alpha': 0.0621304034294529}


In [35]:
best

{'lasso__alpha': 0.021311379705558964}

> rmse: 0.96565
    - alpha: 0.021311

### Tree based model 
- lightgbm

In [25]:
import lightgbm as lgb

In [26]:
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [36]:
# Base LightGBM regressor; the search below overlays sampled values on it.
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8)

# Search space for the tree-structure / sampling / regularisation settings.
# n_estimators and learning_rate are not part of this space, so they stay at
# the estimator's own values.
lgb_params = {
    'clf' : lgb_clf,
    'params': {
        'boosting_type': hp.choice('boosting_type',['gbdt', 'dart']),
#         'boosting_type': []'gbdt',
        'num_leaves'   : hp.choice('num_leaves', np.arange(8,129,2,dtype=int)),
        'max_depth' : hp.choice("max_depth", np.arange(4, 17, dtype=int)),    
        'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1),
        'reg_lambda': hp.uniform('reg_lambda',0,1),
        # NOTE(review): `gamma` looks like the XGBoost name; LightGBM documents
        # `min_split_gain` / `min_gain_to_split` -- confirm this key has any effect.
        'gamma' : hp.uniform('gamma', 0.1,0.5)
    }
    
}
# No `trials` object is passed here, unlike the linear-model searches.
# NOTE(review): for `hp.choice` entries `best` presumably holds the index of
# the chosen option, not the option itself -- confirm before reusing values.
best =fmin(fn = objective,
           space = lgb_params,
           algo = tpe.suggest,
           max_evals = 10
           )

rmse: 0.83811 	params: {'bagging_fraction': 0.7287250669902532, 'boosting_type': 'gbdt', 'feature_fraction': 0.9454531311886423, 'gamma': 0.35213985144391613, 'max_depth': 4, 'num_leaves': 80, 'reg_lambda': 0.9530539109026833}
rmse: 0.82016 	params: {'bagging_fraction': 0.8936851006379823, 'boosting_type': 'gbdt', 'feature_fraction': 0.33191544462509315, 'gamma': 0.2853047365652319, 'max_depth': 12, 'num_leaves': 86, 'reg_lambda': 0.8372438851255922}
rmse: 0.83413 	params: {'bagging_fraction': 0.9638977666454814, 'boosting_type': 'dart', 'feature_fraction': 0.9633378431302959, 'gamma': 0.11565837787421587, 'max_depth': 7, 'num_leaves': 82, 'reg_lambda': 0.8165793892277495}
rmse: 0.81750 	params: {'bagging_fraction': 0.7784701027506175, 'boosting_type': 'gbdt', 'feature_fraction': 0.4683902745701608, 'gamma': 0.15908835269613442, 'max_depth': 13, 'num_leaves': 86, 'reg_lambda': 0.24404114889403705}
rmse: 0.82944 	params: {'bagging_fraction': 0.9495228671083129, 'boosting_type': 'dart', 

In [34]:
## Re-evaluate a fixed LightGBM configuration (values taken from lgb_model1)
## with the same hold-out objective used by the hyperopt searches.
lgb_clf = lgb.LGBMRegressor(random_state=0)
lgb_params = {
    'clf':lgb_clf,
    'params':{
        'bagging_fraction': 0.9568845079308161,
        # was misspelled 'bossting_type', so the value never reached the
        # estimator's `boosting_type` parameter
        'boosting_type': 'gbdt',
        'feature_fraction': 0.6203248801718259,
        # NOTE(review): `gamma` looks like the XGBoost name; LightGBM documents
        # `min_split_gain` / `min_gain_to_split` -- confirm this key has any effect.
        'gamma': 0.39624896070423066,
        'max_depth': 12,
        'metric': 'rmse',
        'num_leaves': 64,
        'objective': 'regression',
        'reg_lambda': 0.38856229720270463
    }
}
objective(lgb_params)

rmse: 0.81909 	params: {'bagging_fraction': 0.9568845079308161, 'bossting_type': 'gbdt', 'feature_fraction': 0.6203248801718259, 'gamma': 0.39624896070423066, 'max_depth': 12, 'metric': 'rmse', 'num_leaves': 64, 'objective': 'regression', 'reg_lambda': 0.38856229720270463}


{'loss': 0.81908550940125358, 'status': 'ok'}

> rmse: 0.81909
    - {'bagging_fraction': 0.9568845079308161, 'bossting_type': 'gbdt', 'feature_fraction': 0.6203248801718259, 'gamma': 0.39624896070423066, 'max_depth': 12, 'metric': 'rmse', 'num_leaves': 64, 'objective': 'regression', 'reg_lambda': 0.38856229720270463}

In [28]:
best

{'bagging_fraction': 0.9788870584216797,
 'feature_fraction': 0.7591012763757823,
 'gamma': 0.1517233454100392,
 'max_depth': 8,
 'num_leaves': 51,
 'reg_lambda': 0.6813318433297506}

> rmse:0.81525

    > bagging_fraction : `0.9789`,
    > feature_fraction : `0.759`,
    > gamma : `0.1517`,
    > reg_lambda : `0.6813`,
    > max_depth : `12`,
    > num_leaves : `110`,
    > boosting_type : `gbdt`
    


- random forest

In [37]:
from sklearn.ensemble import RandomForestRegressor

RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
rf_clf = RandomForestRegressor(min_samples_split=300, min_samples_leaf=30, max_features='sqrt',n_estimators=50,
                               max_depth=4, n_jobs=4, criterion='mse',random_state=0)

rf_clf.estimators_=1 # need to set a value otherwise rise AttributeError in hyperopt
rf_params = {
    'clf' : rf_clf,
    'params' : {
        'min_samples_split' : hp.choice('min_samples_split',np.arange(100,500,100)),
        'min_samples_leaf': hp.choice('min_samples_leaf',np.arange(30,100,10)),
        'n_estimators' : hp.choice('n_estimators', np.arange(50,301,50)),
        'max_depth': hp.choice('max_depth',[4, 8, 12, 16])        
    }    
}

best =fmin(fn = objective,
           space = rf_params,
           algo = tpe.suggest,
           max_evals = 10
           )

rmse: 0.85067 	params: {'max_depth': 8, 'min_samples_leaf': 50, 'min_samples_split': 200, 'n_estimators': 100}
rmse: 0.85119 	params: {'max_depth': 8, 'min_samples_leaf': 90, 'min_samples_split': 200, 'n_estimators': 250}
rmse: 0.85122 	params: {'max_depth': 8, 'min_samples_leaf': 40, 'min_samples_split': 200, 'n_estimators': 300}
rmse: 0.83015 	params: {'max_depth': 16, 'min_samples_leaf': 60, 'min_samples_split': 400, 'n_estimators': 200}
rmse: 0.82904 	params: {'max_depth': 16, 'min_samples_leaf': 40, 'min_samples_split': 100, 'n_estimators': 50}
rmse: 0.82801 	params: {'max_depth': 16, 'min_samples_leaf': 50, 'min_samples_split': 200, 'n_estimators': 300}
rmse: 0.89163 	params: {'max_depth': 4, 'min_samples_leaf': 30, 'min_samples_split': 300, 'n_estimators': 100}
rmse: 0.89163 	params: {'max_depth': 4, 'min_samples_leaf': 70, 'min_samples_split': 200, 'n_estimators': 100}
rmse: 0.89398 	params: {'max_depth': 4, 'min_samples_leaf': 70, 'min_samples_split': 300, 'n_estimators': 300}

> rmse: `0.82801`
 - max_depth = `16`
 - min_samples_leaf = `50`
 - min_samples_split = `200`
 - n_estimators = `300`

### KNN  model

- KNN: Prediction and Neighbor distances features

In [40]:
from sklearn.neighbors import KNeighborsRegressor
KNeighborsRegressor()

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [None]:
# Hyperopt search over the KNN regressor, scored with the hold-out objective.
trials = Trials()
knn_clf = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                              metric_params=None, n_jobs=10, n_neighbors=5, p=1,
                              weights='uniform')
knn_params = {
    'clf' : knn_clf,
    'params' :{
        # Minkowski order: 1 = Manhattan, 2 = Euclidean. The previous choice
        # list contained 0, which is not a valid order for this metric.
        'p': hp.choice('p',[1,2]),
        'weights': hp.choice('weights',['uniform', 'distance']),
        'n_neighbors': hp.choice('n_neighbors', np.arange(10, 101, 10,dtype=int))
    }
}
best = fmin(fn = objective,
            space = knn_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

> knn: 
    - n_neighbors = `15`
    - weights = `distance`

## Kmeans
- mini-batch kmeans cast to low dimensional features

In [None]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import calinski_harabaz_score

In [None]:
X_cv.shape

In [None]:
merge = pd.concat([X_train, X_cv,X_test], ignore_index=True)

In [None]:
merge.shape

In [None]:
merge.tail()

In [None]:
# Standardise once, outside the loop: the scaling does not depend on the
# number of clusters, and the clustering score must be computed in the same
# feature space the clusters were fitted in (previously the labels came from
# standardised data but the score was computed on the raw `merge`).
merge_scaled = StandardScaler(with_mean=True, with_std=True).fit_transform(merge.values)

for c in np.arange(4,16,2):
    print('n_clusters =', c, end=' score= ')
    km = MiniBatchKMeans(n_clusters=c, max_no_improvement=30,
                         verbose=0, batch_size=1000000, random_state=0)
    labels = km.fit_predict(merge_scaled)
    print(calinski_harabaz_score(merge_scaled, labels))

# The scaled copy is as large as `merge`; release it once the scan is done.
del merge_scaled; gc.collect()

> `n_clusters = 8` gives the highest Calinski-Harabasz score (higher means better-separated clusters)

## 2.2 Stacking 

In [50]:
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0)

In [51]:
merge_dates.shape

(6639294,)

In [52]:
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0)
print(merge_y.shape)

(6639294,)


In [53]:
stage2_train_dates = merge_dates[merge_dates>=level2_date_block[0]] ## 27 - 34
stage2_train_mask = (stage2_train_dates <= level2_date_block[-1]) # train on : 27-32
stage2_valid_mask = (stage2_train_dates == 33) ## validate on : 33
stage2_final_train_mask = (stage2_train_dates <= 33) ## 
stage2_test_mask = (stage2_train_dates==34)
stage2_train_y = merge_y[merge_dates>=level2_date_block[0]]

In [67]:
supervised_models = {
    'rg': Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('rg', Ridge(alpha=0.08, fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
            ]),
    'lasso':Pipeline([
                    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                    ('lasso', Lasso(alpha=0.0094, normalize=False, fit_intercept=True, max_iter=2000, random_state=0))
                ]),
    'rf': RandomForestRegressor(n_estimators=250,
                               min_samples_split=300, min_samples_leaf=30, max_features='sqrt',
                               max_depth=16, n_jobs=4, criterion='mse', random_state=0),
    'lgbm': lgb.LGBMRegressor(boosting_type='gbdt', 
                              max_depth=12,
                              num_leaves=110,
                              bagging_fraction=0.979,
                              feature_fraction=0.759,                              
                              reg_lambda = 0.681,
                              gamma = 0.1517,                              
#                               subsample=.55, colsample_bytree=.75, ## 
                              n_jobs=8,
                              random_state=0)
}


knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                              metric_params=None, n_jobs=4, n_neighbors=15, p=1,
                              weights='distance')

mini_kmean = Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('kmean', MiniBatchKMeans(n_clusters=8, max_no_improvement=30, 
                                          verbose=0, batch_size=1000000, random_state=0))
            ])

In [68]:
def stage2_feature_generate(first_block=27, last_block=34):
    '''
        Build the level-2 (stacking) feature matrix.

        Works on the notebook globals `merge` (features), `merge_dates`
        (month of every row of `merge`) and `merge_y` (targets), and on the
        models in `supervised_models`, `knn` and `mini_kmean`.

        The returned array has one row per row of `merge` with
        `merge_dates >= first_block`, in the same order as those rows, and
        these column groups, left to right:

            1. one column per entry of `supervised_models`: for every month
               `b` in `first_block..last_block` a deep copy of the model is
               fitted on the rows with `merge_dates < b` and predicts the
               rows of month `b` (clipped to [0, 20]);
            2. the `knn` prediction (clipped to [0, 20]) followed by the
               distances to the neighbours; `knn` is fitted, in place, on
               the rows with `merge_dates < first_block`;
            3. the `mini_kmean` cluster label followed by the distances to
               every cluster centre; `mini_kmean` is fitted, in place, on
               all rows of `merge`.

        Parameters
        ----------
        first_block : first month that receives level-2 features (default 27)
        last_block : last month predicted by the supervised models (default 34)

        Rows later than `last_block`, if any, keep 0 in the supervised columns.
    '''
    stage2_rows = merge_dates >= first_block
    stage2_dates = merge_dates[stage2_rows]
    history_rows = merge_dates < first_block

    print('Training supervised models')

    all_preds = []
    for model_name, model in supervised_models.items():

        print(model_name, end=': ')
        # Predictions are written at the positions of their own rows, so the
        # column stays aligned with the knn / k-means columns even when
        # `merge` is not sorted by month.
        preds = np.zeros((len(stage2_dates), 1))
        for cur_block in np.arange(first_block, last_block + 1, 1):
            past_rows = merge_dates < cur_block
            X_tr = merge[past_rows].values ## ndarray
            y_tr = merge_y[past_rows]
            X_block = merge[merge_dates == cur_block].values

            copy_clf = copy.deepcopy(model)
            copy_clf.fit(X_tr, y_tr)
            pred_block = copy_clf.predict(X_block)
            preds[stage2_dates == cur_block, 0] = np.clip(pred_block, 0., 20.)
            print(cur_block, end=' ')

        all_preds.append(preds)

        print('')

    #knn:
    print('knn')
    X_tr = merge[history_rows].values
    y_tr = merge_y[history_rows]

    X_stage2 = merge[stage2_rows].values
    knn.fit(X_tr, y_tr)
    knn_pred = knn.predict(X_stage2)
    knn_pred = np.clip(knn_pred, 0., 20.).reshape((len(knn_pred), 1))
    knn_dist = knn.kneighbors(X_stage2, return_distance=True)[0] # distances
    print(np.array(knn_dist).shape)
    all_preds.append(knn_pred)
    all_preds.append(knn_dist)

    # kmeans
    mini_kmean.fit(merge.values)
    # The pipeline's scaler is built with copy=False and may standardise its
    # input in place; calling pipeline.predict and then pipeline.transform on
    # the same array would therefore scale it twice. Scale exactly once and
    # hand the result to the k-means step directly. This is the last use of
    # X_stage2, so overwriting it is harmless.
    X_stage2_scaled = mini_kmean.named_steps['standardscaler'].transform(X_stage2)
    kmean_step = mini_kmean.named_steps['kmean']
    kmean_pred = kmean_step.predict(X_stage2_scaled)
    kmean_dist = kmean_step.transform(X_stage2_scaled)

    kmean_pred = np.array(kmean_pred).reshape((len(kmean_pred),1))
    all_preds.append(kmean_pred)
    all_preds.append(kmean_dist)

    return np.concatenate(all_preds, axis=1)

In [69]:
stage2_data = stage2_feature_generate()

Training supervised models
rg: 27 28 29 30 31 32 33 34 
lasso: 27 28 29 30 31 32 33 34 
rf: 27 28 29 30 31 32 33 34 
lgbm: 27 28 29 30 31 32 33 34 
knn
(1828564, 15)


In [75]:
# Column names for the stage-2 matrix, derived from the objects that produced
# it so that names and counts cannot drift from the model configuration:
#   - supervised columns follow the iteration order of `supervised_models`
#   - one distance column per knn neighbour
#   - one distance column per k-means cluster
columns = list(supervised_models) + ['knn']
columns.extend(['knn_dist_'+str(i) for i in range(knn.n_neighbors)])
columns.extend(['kmean_dist_label'])
columns.extend(['kmean_dist_'+str(i) for i in range(mini_kmean.named_steps['kmean'].n_clusters)])
print(len(columns))
columns

29


['rg',
 'lasso',
 'rf',
 'lgbm',
 'knn',
 'knn_dist_0',
 'knn_dist_1',
 'knn_dist_2',
 'knn_dist_3',
 'knn_dist_4',
 'knn_dist_5',
 'knn_dist_6',
 'knn_dist_7',
 'knn_dist_8',
 'knn_dist_9',
 'knn_dist_10',
 'knn_dist_11',
 'knn_dist_12',
 'knn_dist_13',
 'knn_dist_14',
 'kmean_dist_label',
 'kmean_dist_0',
 'kmean_dist_1',
 'kmean_dist_2',
 'kmean_dist_3',
 'kmean_dist_4',
 'kmean_dist_5',
 'kmean_dist_6',
 'kmean_dist_7']

In [76]:
stage2_df = pd.DataFrame(data=stage2_data, columns=columns)

In [79]:
stage2_df.head().T

Unnamed: 0,0,1,2,3,4
rg,0.345505,1.083458,0.432379,0.495248,0.458267
lasso,0.299434,1.036473,0.398738,0.437356,0.401383
rf,0.55938,2.350962,0.645353,3.162069,2.405929
lgbm,0.687585,2.604097,0.502553,5.00124,3.038452
knn,0.461946,1.886891,0.400254,0.075603,0.663293
knn_dist_0,15.0,7097.084351,8810.403742,11.0,9.0
knn_dist_1,15.0,7367.291954,8901.891784,12.0,10.0
knn_dist_2,15.0,7391.268279,8916.891784,12.0,10.0
knn_dist_3,16.0,7754.255436,8944.502339,13.0,11.0
knn_dist_4,16.0,7779.592759,8967.15634,13.0,11.0


In [80]:
# Persist the stage-2 features. The context manager closes the store even if
# the write raises, matching how stores are opened elsewhere in this notebook.
with pd.HDFStore('../data/feat/stage2_data.h5') as stage2_io:
    stage2_io['stage2_df'] = stage2_df

In [82]:
with pd.HDFStore('../data/feat/stage2_data.h5') as stage2_io:
    print(stage2_io.keys())
    stage2_df = stage2_io['stage2_df']

['/stage2_df']


______

# Extract Text Features

In [16]:
with pd.HDFStore('../data/feat/text_feats.h5') as text_io:
    print(text_io.keys())
    X_text_feats_test = text_io['X_text_feats_test']
    X_text_feats_cv = text_io['X_text_feats_cv']
    X_text_feats_train = text_io['X_text_feats_train']

['/X_text_feats', '/X_text_feats_cv', '/X_text_feats_test', '/X_text_feats_train']


In [17]:
X_text_feats_train.shape

(10675632, 4)

In [18]:
mask.value_counts() ## reduce memory use only num_date_block >= 12

True     6186922
False    4488710
Name: date_block_num, dtype: int64

In [19]:
X_text_feats_train.shape[0] + X_text_feats_cv.shape[0]  #+ X_text_feats_test.shape[0]

10913804

In [22]:
X_text_feats_train.shape

(10675632, 4)

In [23]:
train_text_df = pd.concat([X_text_feats_train[mask],X_text_feats_cv])
test_text_df = X_text_feats_test

In [24]:
train_text_df.shape[0] + test_text_df.shape[0]

6639294

In [27]:
train_texts = train_text_df['item_name'].map(str) + ' ' + train_text_df['item_category_name'].map(str) + ' ' + train_text_df['shop_name'].map(str)
test_texts = test_text_df['item_name'].map(str) + ' ' + test_text_df['item_category_name'].map(str) + ' ' + test_text_df['shop_name'].map(str)

In [30]:
all_texts = pd.Series(np.concatenate([train_texts, test_texts], axis=0))
del train_text_df, test_text_df, train_texts, test_texts; gc.collect()
all_texts.shape

(6639294,)

In [31]:
del X_text_feats_train,X_text_feats_cv,X_text_feats_test; gc.collect()

1896

### 1. TFIDF - Truncated SVD

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(lowercase=False, ngram_range=(1,2))

In [33]:
tv_features = tv.fit_transform(all_texts)
tv_features.shape

(6639294, 63999)

In [34]:
from sklearn.decomposition import TruncatedSVD

In [35]:
svd = TruncatedSVD(n_components=20, random_state=0)
tv_svd_features = svd.fit_transform(tv_features)

In [39]:
import sys
sys.getsizeof(tv_svd_features)/(1024*1024*1024)

0.9893320053815842

### 2. TF-IDF(binarize)- Truncated SVD

In [36]:
tvb_features = tv_features.astype(bool).astype(float)
del tv_features; gc.collect()

0

In [37]:
svd = TruncatedSVD(n_components=20, random_state=0)
tvb_svd_features = svd.fit_transform(tvb_features)
tvb_svd_features.shape

(6639294, 20)

In [40]:
sys.getsizeof(tvb_svd_features)/(1024*1024*1024)

0.9893320053815842

### 3. Hashing + Truncated SVD

In [41]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(ngram_range=(1, 2), lowercase=False)

In [42]:
hv_features = hv.fit_transform(all_texts).tocsr()
hv_features.shape

(6639294, 1048576)

In [43]:
svd = TruncatedSVD(n_components=20, random_state=0)
hv_svd_features = svd.fit_transform(hv_features)
hv_svd_features.shape

(6639294, 20)

In [44]:
sys.getsizeof(hv_svd_features)/(1024*1024*1024)

0.9893320053815842

### 4. Hashing(binarize) + Truncated SVD

In [45]:
hvb_features = hv_features.astype(bool).astype(float)
del hv_features; gc.collect()

59

In [46]:
svd = TruncatedSVD(n_components=20, random_state=0)
hvb_svd_features = svd.fit_transform(hvb_features)
hvb_svd_features.shape

(6639294, 20)

In [47]:
print(sys.getsizeof(hvb_svd_features)/(1024*1024*1024))
del hvb_features; gc.collect()

0.9893320053815842


0

### 5. Stack them

In [48]:
text_features = np.concatenate([tv_svd_features, tvb_svd_features, hv_svd_features, hvb_svd_features], axis=1)
text_features.shape

(6639294, 80)

In [49]:
sys.getsizeof(text_features)/(1024*1024*1024)

3.9573277086019516

In [52]:
# One 'text_f_<i>' column per column of the stacked text-feature matrix
# (the width is read from the array instead of being hard-coded).
text_features_df = pd.DataFrame(data=text_features,
                                columns=['text_f_'+str(i) for i in range(text_features.shape[1])])

# The context manager closes the store even if the write raises.
with pd.HDFStore('../data/feat/text_feat_df.h5') as text_io:
    text_io['text_feats_df'] = text_features_df

## Generate stage2 text features

In [22]:
with pd.HDFStore('../data/feat/text_feat_df.h5') as text_io:
    text_features_df = text_io['text_feats_df'] ## 12-34

In [23]:
merge = pd.concat([X_train, X_cv,X_test], ignore_index=True)
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0) ## 12-34

In [24]:
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0) ## 12-34

In [18]:
merge_y.shape

(6639294,)

In [19]:
merge.shape

(6639294, 54)

In [83]:
train_dates.shape # 

(6186922,)

In [25]:
level2_date_block = [27,28,29,30,31,32]
stage2_train_dates = merge_dates[merge_dates>=level2_date_block[0]] # 27-34
stage2_train_mask = (stage2_train_dates <= level2_date_block[-1]) # 27-32
stage2_valid_mask = (stage2_train_dates == 33) # 33
stage2_final_train_mask = (stage2_train_dates <= 33) # 27-33
stage2_test_mask = (stage2_train_dates==34) # 34
stage2_train_y = merge_y[merge_dates>=level2_date_block[0]] # 27-34

In [26]:
gc.collect()

92

In [33]:
X_train.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,text_features_df.iloc[0:X_train.shape[0],:]], axis=1)
# train = X_train
# train.reset_index(drop=True,inplace=True)

In [34]:
X_train.shape ## 12-32
# y_train.shape ## 

(6186922, 134)

In [93]:
test_dates.shape[0]

214200

In [40]:
test_text_features_df = text_features_df.iloc[-test_dates.shape[0]:,:].reset_index(drop=True)
test = pd.concat([X_test.reset_index(drop=True),test_text_features_df],axis=1)

In [41]:
test.shape

(214200, 134)

### hyper params search

In [43]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import copy

In [44]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [106]:
level2_date_block[0]

27

- the hyperopt objective scores each candidate with a simple hold-out split

In [54]:
train_dates.reset_index(drop=True,inplace=True) ##reset index
y_train.reset_index(drop=True,inplace=True)

def objective(params):
    '''
        Hold-out RMSE for one hyperopt candidate.

        `params['clf']` is deep-copied and `params['params']` is laid over
        the copy's current settings. The copy is trained on the rows whose
        month is below `level2_date_block[0]` and scored on the rows whose
        month lies between `level2_date_block[0]` and `level2_date_block[-1]`
        (inclusive); predictions are clipped to [0, 20] first.

        Reads the notebook globals `X_train`, `y_train`, `train_dates` and
        `level2_date_block`, and returns the loss/status dict for hyperopt.
    '''
    estimator = copy.deepcopy(params.get('clf'))
    new_params = params.get('params')

    # Current settings of the copy, overridden by the candidate values.
    settings = estimator.get_params()
    settings.update(new_params)
    estimator.set_params(**settings)

    lower, upper = level2_date_block[0], level2_date_block[-1]
    train_mask = train_dates < lower
    validation_mask = (train_dates >= lower) & (train_dates <= upper)

    estimator.fit(X_train[train_mask].values, y_train[train_mask])

    raw_pred = estimator.predict(X_train[validation_mask].values)
    pred_y = np.clip(raw_pred, 0., 20.)
    rmse = mean_squared_error(y_train[validation_mask], pred_y)**.5

    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(new_params))

    return dict(loss=rmse, status=STATUS_OK)

In [55]:
trials = Trials()

rg_clf = Pipeline([
    ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ('rg', Ridge(fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
])

rg_params = {
    'clf' : rg_clf,
    'params':{
        'rg__alpha':  hp.uniform('rg__alpha', 0., 1.0)
    }
    
}
best = fmin(fn = objective,
            space = rg_params,
            algo = tpe.suggest,
            trials = trials,
            max_evals = 10
           )

rmse: 0.96357 	params: {'rg__alpha': 0.3147292713629707}
rmse: 0.96250 	params: {'rg__alpha': 0.08519099879049385}
rmse: 0.96421 	params: {'rg__alpha': 0.4044099788839268}
rmse: 0.96447 	params: {'rg__alpha': 0.43645905573545196}
rmse: 0.96418 	params: {'rg__alpha': 0.4009849415326201}
rmse: 0.96378 	params: {'rg__alpha': 0.3460070240018216}
rmse: 0.96259 	params: {'rg__alpha': 0.03432567333200587}
rmse: 0.96317 	params: {'rg__alpha': 0.2468350021491552}
rmse: 0.96394 	params: {'rg__alpha': 0.36806573677837384}
rmse: 0.96284 	params: {'rg__alpha': 0.17809668480975327}


> alpha
    - 0.0851909

## Generate stage2 features with TEXT

In [74]:
merge.shape

(6639294, 54)

In [75]:
text_features_df.shape

(6639294, 80)

In [78]:
merge = pd.concat([merge,text_features_df],axis=1)

In [79]:
gc.collect()

4571

In [80]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model = Pipeline([
                ('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)),
                ('rg', Ridge(alpha=.0851909, fit_intercept=True, normalize=True, max_iter=2000, random_state=0))
            ])
model.fit(merge[merge_dates < 27].values, merge_y[merge_dates < 27])  

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=False, with_mean=True, with_std=True)), ('rg', Ridge(alpha=0.0851909, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, random_state=0, solver='auto', tol=0.001))])

In [81]:
merge.head().T

Unnamed: 0,0,1,2,3,4
shop_id,54.000000,54.000000,54.000000,54.000000,54.000000
item_id,10297.000000,10296.000000,10298.000000,10300.000000,10284.000000
date_block_num,12.000000,12.000000,12.000000,12.000000,12.000000
item_category_id,37.000000,38.000000,40.000000,37.000000,57.000000
item_id_avg_item_price_lag_1,741.560730,1599.000000,392.347504,682.033325,266.000000
item_id_sum_item_cnt_day_lag_1,42.000000,24.000000,369.000000,54.000000,4.000000
item_id_avg_item_cnt_day_lag_1,1.000000,1.000000,1.308511,1.058824,1.000000
shop_id_avg_item_price_lag_1,865.302917,865.302917,865.302917,865.302917,865.302917
shop_id_sum_item_cnt_day_lag_1,10055.000000,10055.000000,10055.000000,10055.000000,10055.000000
shop_id_avg_item_cnt_day_lag_1,1.363205,1.363205,1.363205,1.363205,1.363205


In [82]:
stage2_data_text = model.predict(merge[merge_dates >= 27].values)
stage2_data_text = np.clip(stage2_data_text, 0., 20.)

In [64]:
stage2_data_text.shape

(1828564,)

## save to npy 

In [84]:
np.save('../data/feat/stage2_data_text.npy',stage2_data_text)

'''temp = np.load('../data/feat/stage2_data_text.npy')'''