In [51]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.metrics import log_loss
import xgboost as xgb
import lightgbm as lgb
import gc
import warnings
warnings.filterwarnings("ignore")

# **1. Data Preparation**

In [46]:
print('loading files...')
train = pd.read_csv('./DATA/DATA_V6/stock_train_data_20171006.csv')
test = pd.read_csv('./DATA/DATA_V6/stock_test_data_20171006.csv')

train = train.drop(['weight','group','era'], axis=1)  
test = test.drop('group', axis=1)
print ("Dimension of train {}".format(train.shape))
print ("Dimension of test {}".format(test.shape))

loading files...
Dimension of train (392418, 90)
Dimension of test (211675, 89)


# 2. Training/Predicting Pipeline

In [47]:
# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'silent': True}

X = train.drop(['id','label'], axis=1)
features = X.columns
X = X.values
y = train['label'].values
sub=test['id'].to_frame()
sub['proba']=0

In [52]:
nrounds=200  # need to change to 2000
kfold = 2  # need to change to 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                           maximize=True, verbose_eval=100)
    sub['proba'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()
sub.head(2)

 xgb kfold: 1  of  2 : 
[0]	train-error:0.441914	valid-error:0.496147
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[100]	train-error:0.3995	valid-error:0.503746
 xgb kfold: 2  of  2 : 
[0]	train-error:0.430355	valid-error:0.511513
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[100]	train-error:0.393757	valid-error:0.502548
Stopping. Best iteration:
[4]	train-error:0.428893	valid-error:0.516345



Unnamed: 0,id,proba
0,392418,0.472684
1,392419,0.543545


In [53]:
# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  early_stopping_rounds=100)
    sub['proba'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)

 lgb kfold: 1  of  2 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.499855
Early stopping, best iteration is:
[6]	valid_0's auc: 0.509107
 lgb kfold: 2  of  2 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.499355
[200]	valid_0's auc: 0.50359


# 3. Put submission to csv file

In [55]:
from datetime import datetime
sub.to_csv('./submission/sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')))