In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split

warnings.filterwarnings('ignore')

In [2]:
#logger
def get_logger():
    FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    return logger

In [3]:
logger = get_logger()

In [8]:
def read_data(nrows=None, test_size=0.2):
    logger.info('Input data')
    train_df = pd.read_csv('./input/train.csv',nrows=nrows)
    train_df, validation_df = train_test_split(train_df, test_size=test_size, random_state=101, stratify=train_df['target'].values)
    test_df = pd.read_csv('./input/test.csv')
    return train_df, validation_df, test_df

In [9]:
train_df, validation_df, test_df = read_data()

[INFO]2019-03-12 00:34:19,525:main:Input data


ValueError: The test_size = 0 should be greater or equal to the number of classes = 2

In [10]:
def process_data(train_df, validation_df, test_df):
    logger.info('Features engineering')
    idx = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    enginering_feats = [('var_26','var_44'),('var_44','var_123'),('var_44','var_155')]
    
    for df in [test_df, validation_df, train_df]:
        #for feat in idx:
        #    df['r2_'+feat] = np.round(df[feat], 2)
        #for fe_id, fe in enumerate(enginering_feats):
        #    # Magic Feature Enginering
        #    df['%s_plus_%s'%fe] = df[fe[0]]+df[fe[1]]
        #    df['%s_minus_%s'%fe] = df[fe[1]]-df[fe[0]]
        #    df.drop(list(fe), axis=1)
        df['sum'] = df[idx].sum(axis=1)  
        df['min'] = df[idx].min(axis=1)
        df['max'] = df[idx].max(axis=1)
        df['mean'] = df[idx].mean(axis=1)
        df['std'] = df[idx].std(axis=1)
        df['skew'] = df[idx].skew(axis=1)
        df['kurt'] = df[idx].kurtosis(axis=1)
        df['med'] = df[idx].median(axis=1)
    print('Train and test shape:',train_df.shape, test_df.shape)
    return train_df, test_df

In [11]:
def run_lgb_model(train_df, validation_df, test_df):
    
    logger.info('Prepare the model')
    features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    target = train_df['target']
    logger.info('Run model')
    param = {
        'bagging_freq': 5,
        'bagging_fraction': 0.38,
        'boost_from_average':'false',
        'boost': 'gbdt',
        'feature_fraction': 0.045,
        'learning_rate': 0.1,
        'max_depth': -1,  
        'metric':'auc',
        'min_data_in_leaf': 80,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 13,
        'num_threads': 8,
        'tree_learner': 'serial',
        'objective': 'binary', 
        'verbosity': 1
    }
    num_round = 1000000
    folds = StratifiedKFold(n_splits=12, shuffle=False, random_state=101)
    oof = np.zeros(len(train_df))
    test_predictions = np.zeros(len(test_df))
    validation_predictions = np.zeros(len(validation_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3500)
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        test_predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
        validation_predictions += clf.predict(validation_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    
    train_score = roc_auc_score(target, oof)
    validation_score = roc_auc_score(validation_df.target, validation_predictions)
    
    print("CV score: {:<8.5f}".format(train_score))
    
    submission = pd.read_csv('./input/sample_submission.csv')
    submission['target'] = test_predictions
    submission.to_csv('lgb_test_predictions_train_score_{}_validation_score_{}.csv'.format(train_score, validation_score), index=False)
    
    validation_df['target'] = validation_score
    validation_df.to_csv('lgb_validation_predictions_train_score_{}_validation_score_{}.csv'.format(train_score, validation_score), index=False)
    
    return test_predictions, validation_predictions, train_score, validation_score

In [12]:
def submit(test_df, predictions, score):
    logger.info('Prepare submission')
    sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
    sub["target"] = predictions
    sub.to_csv("submission_{:<8.5f}.csv".format(score), index=False)

In [14]:
def read_process_train_submit(nrows=None, test_size=0.2):
    train_df, validation_df, test_df = read_data(nrows, test_size=test_size)
    #train_df, test_df = process_data(train_df, test_df)
    test_predictions, validation_predictions, train_score, validation_score = run_lgb_model(train_df, validation_df, test_df)
    submit(test_df, test_predictions, score)
    return train_score, validation_score

In [15]:
read_process_train_submit()

[INFO]2019-03-12 00:37:20,730:main:Input data
[INFO]2019-03-12 00:37:34,216:main:Prepare the model
[INFO]2019-03-12 00:37:34,217:main:Run model


Fold 0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.948481	valid_1's auc: 0.890551
[2000]	training's auc: 0.975667	valid_1's auc: 0.888527
[3000]	training's auc: 0.990328	valid_1's auc: 0.883062
[4000]	training's auc: 0.996968	valid_1's auc: 0.878575
Early stopping, best iteration is:
[645]	training's auc: 0.934219	valid_1's auc: 0.891959
Fold 1
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.94865	valid_1's auc: 0.890513
[2000]	training's auc: 0.975647	valid_1's auc: 0.88708
[3000]	training's auc: 0.990425	valid_1's auc: 0.883813
[4000]	training's auc: 0.996895	valid_1's auc: 0.880975
Early stopping, best iteration is:
[823]	training's auc: 0.941983	valid_1's auc: 0.89161
Fold 2
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.949211	valid_1's auc: 0.889364
[2000]	training's auc: 0.975871	valid_1's auc: 0.88538
[3000]	training's auc: 0.990474	valid_1's auc: 0.88

NameError: name 'predictions' is not defined

In [None]:
train_df, test_df = read_data()
train_df, test_df = process_data(train_df, test_df)