In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
#logger
def get_logger():
    """Configure root logging and return the notebook's 'main' logger.

    Returns:
        logging.Logger: the logger named 'main', with its level set to DEBUG.
    """
    # basicConfig installs a handler on the root logger using this record layout.
    logging.basicConfig(format='[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    main_logger = logging.getLogger('main')
    main_logger.setLevel(logging.DEBUG)
    return main_logger

In [3]:
# Module-level logger used by the pipeline functions defined in the cells below.
logger = get_logger()

In [4]:
def read_data(nrows=None):
    """Load the training and test CSV files from ./input.

    Args:
        nrows: optional row limit forwarded to pandas for the training file
            only; the test file is always read in full.

    Returns:
        tuple: (train_df, test_df) as pandas DataFrames.
    """
    logger.info('Input data')
    train_path, test_path = './input/train.csv', './input/test.csv'
    return pd.read_csv(train_path, nrows=nrows), pd.read_csv(test_path)

In [6]:
def process_data(train_df, test_df):
    """Add engineered features to both frames, in place, and return them.

    For every original feature column (everything in ``train_df`` except
    'ID_code' and 'target') this adds:
      * ``r2_<col>``: the column rounded to 2 decimals;
      * ``<a>_plus_<b>`` / ``<a>_minus_<b>`` for a fixed list of column pairs;
      * row-wise statistics over the original feature columns:
        'sum', 'min', 'max', 'mean', 'std', 'skew', 'kurt', 'med'.

    Args:
        train_df: training DataFrame; its columns define the feature list.
        test_df: test DataFrame; must contain the same feature columns.

    Returns:
        tuple: (train_df, test_df) -- the same objects that were passed in,
        now carrying the extra columns.
    """
    logger.info('Features engineering')
    # Snapshot the base feature names before any derived column is added, so
    # the row statistics below are computed on the original features only.
    idx = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    enginering_feats = [('var_26', 'var_44'), ('var_44', 'var_123'), ('var_44', 'var_155')]

    for df in [test_df, train_df]:
        for feat in idx:
            df['r2_' + feat] = np.round(df[feat], 2)
        for left, right in enginering_feats:
            # Operate on `df` itself: rebinding it to a copy would silently
            # drop every engineered column from the returned frames.
            df['%s_plus_%s' % (left, right)] = df[left] + df[right]
            # Subtraction order follows the column name (left minus right).
            df['%s_minus_%s' % (left, right)] = df[left] - df[right]
        base = df[idx]
        df['sum'] = base.sum(axis=1)
        df['min'] = base.min(axis=1)
        df['max'] = base.max(axis=1)
        df['mean'] = base.mean(axis=1)
        df['std'] = base.std(axis=1)
        df['skew'] = base.skew(axis=1)
        df['kurt'] = base.kurtosis(axis=1)
        df['med'] = base.median(axis=1)
    print('Train and test shape:', train_df.shape, test_df.shape)
    return train_df, test_df

In [7]:
def run_model(train_df, test_df):
    """Train LightGBM with 12-fold stratified CV and predict the test set.

    Every column of ``train_df`` other than 'ID_code' and 'target' is used as
    a feature. Each fold trains a booster with early stopping on the held-out
    part, fills that fold's out-of-fold predictions, and contributes 1/12 of
    the averaged test prediction.

    Args:
        train_df: training DataFrame containing a 'target' column.
        test_df: test DataFrame containing the same feature columns.

    Returns:
        tuple: (predictions, score) where ``predictions`` is a numpy array of
        fold-averaged test predictions and ``score`` is the ROC AUC of the
        out-of-fold predictions against the training target.
    """
    logger.info('Prepare the model')
    features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    target = train_df['target']
    logger.info('Run model')
    param = {
        'bagging_freq': 5,
        'bagging_fraction': 0.38,
        'boost_from_average': 'false',
        'boost': 'gbdt',
        'feature_fraction': 0.045,
        'learning_rate': 0.0095,
        'max_depth': -1,
        'metric': 'auc',
        'min_data_in_leaf': 80,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 13,
        'num_threads': 8,
        'tree_learner': 'serial',
        'objective': 'binary',
        'verbosity': 1
    }
    # Upper bound only; early stopping decides the real number of rounds.
    num_round = 1000000
    # random_state is meaningless without shuffling, and recent scikit-learn
    # rejects random_state together with shuffle=False. Dropping it keeps the
    # same deterministic, unshuffled folds.
    folds = StratifiedKFold(n_splits=12, shuffle=False)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks -- confirm against the installed version.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=3500)
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    score = roc_auc_score(target, oof)
    print("CV score: {:<8.5f}".format(score))
    return predictions, score

In [8]:
def submit(test_df, predictions, score):
    """Write the submission CSV for the given test predictions.

    The file is written to the current working directory as
    ``submission_<score>.csv`` with the score formatted to 5 decimals, and
    contains two columns: 'ID_code' and 'target'.

    Args:
        test_df: test DataFrame providing the 'ID_code' column.
        predictions: one predicted value per row of ``test_df``.
        score: CV score embedded in the output file name.
    """
    logger.info('Prepare submission')
    sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
    sub["target"] = predictions
    # Plain '.5f' (no '<8' width): padding put a trailing space before '.csv'.
    sub.to_csv("submission_{:.5f}.csv".format(score), index=False)

In [9]:
def read_process_train_submit(nrows=None):
    """Run the whole pipeline: load the data, fit the model, write the submission.

    Args:
        nrows: optional row limit handed to read_data.
    """
    train_df, test_df = read_data(nrows)
    # Feature-engineering step is currently disabled:
    # train_df, test_df = process_data(train_df, test_df)
    # run_model returns (predictions, score), which are submit's remaining arguments.
    submit(test_df, *run_model(train_df, test_df))

In [10]:
# Run the full pipeline with the default nrows=None (no row limit passed to read_data).
read_process_train_submit()

[INFO]2019-03-10 00:56:51,813:main:Input data
[INFO]2019-03-10 00:57:06,601:main:Prepare the model
[INFO]2019-03-10 00:57:06,602:main:Run model


Fold 0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.900259	valid_1's auc: 0.882001
[2000]	training's auc: 0.910857	valid_1's auc: 0.889734
[3000]	training's auc: 0.917914	valid_1's auc: 0.894287
[4000]	training's auc: 0.923261	valid_1's auc: 0.896737
[5000]	training's auc: 0.927762	valid_1's auc: 0.89834
[6000]	training's auc: 0.931813	valid_1's auc: 0.899458
[7000]	training's auc: 0.935478	valid_1's auc: 0.900155
[8000]	training's auc: 0.939029	valid_1's auc: 0.90061
[9000]	training's auc: 0.94234	valid_1's auc: 0.900935
[10000]	training's auc: 0.945503	valid_1's auc: 0.901085
[11000]	training's auc: 0.948489	valid_1's auc: 0.901197
[12000]	training's auc: 0.951395	valid_1's auc: 0.901387
[13000]	training's auc: 0.954271	valid_1's auc: 0.901348
[14000]	training's auc: 0.95698	valid_1's auc: 0.901393
[15000]	training's auc: 0.959637	valid_1's auc: 0.901314
Early stopping, best iteration is:
[12274]	training's auc: 0.952199	valid_1's auc: 0.90

[2000]	training's auc: 0.910243	valid_1's auc: 0.894999
[3000]	training's auc: 0.917343	valid_1's auc: 0.899146
[4000]	training's auc: 0.922849	valid_1's auc: 0.901376
[5000]	training's auc: 0.92745	valid_1's auc: 0.902894
[6000]	training's auc: 0.931529	valid_1's auc: 0.903334
[7000]	training's auc: 0.935219	valid_1's auc: 0.903693
[8000]	training's auc: 0.938708	valid_1's auc: 0.903698
[9000]	training's auc: 0.941979	valid_1's auc: 0.903725
[10000]	training's auc: 0.945183	valid_1's auc: 0.90354
[11000]	training's auc: 0.948237	valid_1's auc: 0.903536
Early stopping, best iteration is:
[8204]	training's auc: 0.9394	valid_1's auc: 0.903831
Fold 10
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.899488	valid_1's auc: 0.888849
[2000]	training's auc: 0.90998	valid_1's auc: 0.896812
[3000]	training's auc: 0.91725	valid_1's auc: 0.901236
[4000]	training's auc: 0.922741	valid_1's auc: 0.903974
[5000]	training's auc: 0.927256	valid_1's auc: 0.90537
[6

TypeError: submit() missing 1 required positional argument: 'score'

NameError: name 'train_df' is not defined