In [1]:
import numpy as np 
import pandas as pd

import os
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

import gc

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from math import sqrt
import math

import lightgbm as lgb
import xgboost as xgb

from tqdm import tqdm_notebook


In [2]:
class MyModel:
    """Bundle one train/validation split together with the test matrix.

    The base class only stores data and exposes prediction helpers. It never
    assigns ``self.model``; that attribute has to be set (for example by a
    subclass ``train`` method) before either predict method is called.
    """

    def __init__(self, X_tr, y_tr, X_val, y_val, X_test):
        """Store references to the split's data; nothing is copied."""
        self.X_tr, self.y_tr = X_tr, y_tr
        self.X_val, self.y_val = X_val, y_val
        self.X_test = X_test
        # Starts empty; filled in later by whoever configures the model.
        self.params = {}

    def _predict(self, features):
        """Delegate to the fitted model's ``predict`` for the given features."""
        return self.model.predict(features)

    def predict_val(self):
        """Return the model's predictions for the validation features."""
        return self._predict(self.X_val)

    def predict_test(self):
        """Return the model's predictions for the test features."""
        return self._predict(self.X_test)


class LgbBoostModel(MyModel):
    """LightGBM regression model fitted on the stored train/validation split."""

    def train(self):
        """Fit a LightGBM booster and keep it on ``self.model``.

        The parameter dict is also stored on ``self.params``. The module-level
        name ``seed`` is read when this method runs, so it must be defined
        before ``train`` is called.
        """
        booster_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'seed': seed,
            'is_training_metric': True,
            'max_bin': 350,
            'learning_rate': .005,
            'max_depth': -1,
            'num_leaves': 48,
            'feature_fraction': 0.1,
            'reg_alpha': 0,
            'reg_lambda': 0.2,
            'min_child_weight': 10,
        }
        self.params = booster_params

        fit_set = lgb.Dataset(self.X_tr, label=self.y_tr)
        # A second, separate Dataset over the same training rows: it is passed
        # as the first evaluation set, ahead of the validation rows.
        train_eval_set = lgb.Dataset(self.X_tr, label=self.y_tr)
        holdout_set = lgb.Dataset(self.X_val, label=self.y_val)

        self.model = lgb.train(
            booster_params,
            fit_set,
            30000,
            [train_eval_set, holdout_set],
            verbose_eval=200,
            early_stopping_rounds=200,
        )



In [23]:
# Base train/test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# The next cell reads train_feats, test_feats and test_patterns, so they have
# to be loaded here for the notebook to run top-to-bottom on a fresh kernel.
train_feats = pd.read_csv('../input/train_agg_feats.csv')
test_feats = pd.read_csv('../input/test_agg_feats.csv')
test_patterns = pd.read_csv('../input/test_patterns_only_7830.csv')

In [24]:
# Build the training data from the original training rows plus the test rows
# whose pattern target is non-zero. Adding those rows is key to improving the
# model's predictive ability.
has_pattern = test_patterns.target != 0

cols = train_feats.columns[1:]  # every feature column except the first one
test_feat_final = test_feats[cols]
train_feat_final = pd.concat([train_feats[cols], test_feat_final[has_pattern]], axis=0)
train_feat_id = pd.concat([train['ID'], test['ID'][has_pattern]], axis=0)

# Targets are kept on the log1p scale, in the same row order as the features.
y = np.concatenate([
    np.log1p(train.target.values),
    np.log1p(test_patterns['target'][has_pattern].values),
])

X = train_feat_final.values
X_test = test_feat_final.values

for matrix in (X, X_test):
    print(matrix.shape)

(12289, 1287)
(49342, 1287)


In [25]:
n_splits = 10
seed = 42

kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

# Per-model-type accumulators, keyed by the names in model_types.
rmse_scores = {}    # one RMSE per fold
oof_preds = {}      # out-of-fold predictions, one slot per training row
sub_preds = {}      # test predictions averaged over the folds
model_params = {}   # parameters of the most recently trained model

model_types = ['lgb']

for model_type in model_types:
    rmse_scores[model_type] = list()
    oof_preds[model_type] = np.zeros((X.shape[0],))
    sub_preds[model_type] = np.zeros((X_test.shape[0],))

print('{} fold..'.format(n_splits))

# The splits are materialised into a list so the progress bar knows its length.
for fold, (train_index, test_index) in tqdm_notebook(list(enumerate(kf.split(y)))):

    X_tr, X_val = X[train_index], X[test_index]
    y_tr, y_val = y[train_index], y[test_index]

    for model_type in model_types:
        print('\n*** ' + model_type)
        model = LgbBoostModel(X_tr, y_tr, X_val, y_val, X_test)

        model.train()

        # Predict the validation fold once and reuse it for both the
        # out-of-fold store and the fold score.
        val_pred = model.predict_val()
        oof_preds[model_type][test_index] = val_pred
        sub_preds[model_type] += model.predict_test() / n_splits
        rmse = mean_squared_error(y_val, val_pred) ** 0.5
        rmse_scores[model_type].append(rmse)

        model.params['cv'] = n_splits
        model.params['seed'] = seed
        model_params[model_type] = model.params

        # The value is the square root of the MSE, so label it as RMSE.
        print('Fold %d: %s RMSE %f' % (fold, model_type, rmse))

10 fold..


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


*** lgb
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 1.34384	valid_1's rmse: 1.42626
[400]	valid_0's rmse: 1.19803	valid_1's rmse: 1.35876
[600]	valid_0's rmse: 1.12229	valid_1's rmse: 1.34843
[800]	valid_0's rmse: 1.06631	valid_1's rmse: 1.34684
[1000]	valid_0's rmse: 1.01976	valid_1's rmse: 1.34707
Early stopping, best iteration is:
[830]	valid_0's rmse: 1.05886	valid_1's rmse: 1.34664
Fold 0: lgb Mean Squared Error 1.346639

*** lgb
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 1.34118	valid_1's rmse: 1.47594
[400]	valid_0's rmse: 1.19771	valid_1's rmse: 1.39015
[600]	valid_0's rmse: 1.12306	valid_1's rmse: 1.3704
[800]	valid_0's rmse: 1.06753	valid_1's rmse: 1.36359
[1000]	valid_0's rmse: 1.02145	valid_1's rmse: 1.35995
[1200]	valid_0's rmse: 0.981049	valid_1's rmse: 1.35787
[1400]	valid_0's rmse: 0.943892	valid_1's rmse: 1.35732
[1600]	valid_0's rmse: 0.90924	valid_1's rmse: 1.35657
[1800]	valid_0's rmse

In [26]:
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    An empty input yields 0.0 because the divisor is never allowed below 1.
    """
    total = float(sum(values))
    divisor = len(values) or 1
    return total / divisor

def sum_of_square_deviation(values, mean):
    """Return the average squared deviation of ``values`` from ``mean``.

    Despite the name, the sum of squared deviations is divided by the number
    of values, so the result is a mean squared deviation.

    Parameters:
        values: sized iterable of numbers.
        mean: the reference value each element is compared against.

    Returns:
        float: the averaged squared deviation, or 0.0 for empty input
        (matching how ``mean`` treats an empty input).
    """
    count = len(values)
    if count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    squared_total = sum((x - mean) ** 2 for x in values)
    # Divide the float total directly so the result never depends on how
    # 1/len(values) is evaluated.
    return float(squared_total) / count

# NOTE: model_type here is the loop variable left over from the
# cross-validation cell, i.e. the last entry of model_types.
subm = pd.read_csv('../input/sample_submission.csv')
# Targets were modelled on the log1p scale, so map predictions back.
subm['target'] = np.expm1(sub_preds[model_type])

# Out-of-fold frame: row IDs, true targets and predictions on the original scale.
oof = pd.DataFrame(train_feat_id.copy())
oof['target'] = np.expm1(y)
oof['prediction'] = np.expm1(oof_preds[model_type])

# The per-fold scores are RMSE values, so report them as such.
mean_rmse = np.mean(rmse_scores[model_type])
standard_deviation_rmse = np.std(rmse_scores[model_type])
key = '{}'.format(model_type)
print('{} Mean RMSE {}'.format(model_type, mean_rmse))
print('{} Stdev RMSE {}'.format(model_type, standard_deviation_rmse))

file_name = '../output/submission.csv'

# Where test_patterns has a non-zero target, use it in place of the model
# prediction. A single .loc assignment is used because chained indexing
# (subm['target'][mask] = ...) may write to a temporary copy.
has_pattern = test_patterns.target != 0
subm.loc[has_pattern, 'target'] = test_patterns.loc[has_pattern, 'target']
subm.to_csv(file_name, index=False, float_format="%.8f")

lgb Mean Squared Error 1.3130204655072009
lgb Stdev Squared Error 0.027746778383404516
