In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from mydatools.features_analyze import get_top_k_corr

% matplotlib inline

## Config

In [2]:
# --- Input files -----------------------------------------------------------
trn_path, tst_path = './data/input/train.csv', './data/input/test.csv'

# --- Column roles in the input data ----------------------------------------
id_col, label_col = 'id', 'target'

# --- Submission file --------------------------------------------------------
# The submission reuses the input id/label column names.
submission_path = './data/output/submission.csv'
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read both splits and tag every row with the split it came from ('train' /
# 'test') so the combined frame can be separated again later. The frames are
# stacked as-is: each keeps the row index it was read with.
full_df = pd.concat([
    pd.read_csv(trn_path).assign(ds_type='train'),
    pd.read_csv(tst_path).assign(ds_type='test'),
])

full_df.head()

Unnamed: 0,ds_type,id,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,...,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,target
0,train,7,0.6,0.5,0.2,3,1,10,1,10,...,0,0,11,0,1,0,0.7,0.2,0.71807,0.0
1,train,9,0.3,0.1,0.3,2,1,9,5,8,...,0,0,3,0,0,1,0.8,0.4,0.766078,0.0
2,train,13,0.5,0.7,0.1,2,2,9,1,8,...,0,0,12,1,0,0,0.0,0.0,-1.0,0.0
3,train,16,0.6,0.9,0.1,2,4,7,1,8,...,0,0,8,1,0,0,0.9,0.2,0.580948,0.0
4,train,17,0.4,0.6,0.0,2,2,6,3,10,...,0,0,9,1,0,0,0.7,0.6,0.840759,0.0


In [4]:
# Boolean row masks over full_df, one per dataset split.
ds_type = full_df['ds_type']
is_train = ds_type == 'train'
is_test = ds_type == 'test'

## Features

In [5]:
# Ordered list of the column names used as model inputs; filled by add_features().
feature_columns = []

def add_features(features):
    """Register one or more column names in the global ``feature_columns`` list.

    Parameters
    ----------
    features : list or any other value
        A list of column names, or a single column name. Any value that is
        not a ``list`` is treated as one name.

    Notes
    -----
    Names are appended in the order given. A name is added at most once:
    names already registered are skipped, and so are repeats within
    ``features`` itself.
    """
    global feature_columns
    if not isinstance(features, list):
        features = [features]
    for name in features:
        # Test against the live list (not a snapshot taken before the call),
        # so duplicates inside `features` are dropped as well.
        if name not in feature_columns:
            feature_columns.append(name)

In [6]:
# Every non-object-dtype column is treated as a numerical feature, except the
# id, the label and the split marker.
non_feature_cols = [id_col, label_col, 'ds_type']
non_object_mask = (full_df.dtypes != 'object').values
numerical_features = [
    col for col in full_df.columns[non_object_mask].tolist()
    if col not in non_feature_cols
]
add_features(numerical_features)

## LightGBM

In [7]:
# Separate the combined frame back into its two splits.
trn_df = full_df[is_train]
tst_df = full_df[is_test]

# Model inputs for the test rows (same feature columns as training).
X_tst = tst_df[feature_columns]

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X = trn_df[feature_columns]
y = trn_df[label_col]
X_trn, X_val, y_trn, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

# LightGBM datasets are built from the raw value arrays; the validation set
# references the training set.
trn_lgb = lgb.Dataset(X_trn.values, label=y_trn)
val_lgb = lgb.Dataset(X_val.values, label=y_val, reference=trn_lgb)

In [8]:
# LightGBM settings: binary objective, evaluated with binary log-loss.
params = dict(
    application='binary',
    metric='binary_logloss',
    learning_rate=1,
    max_depth=5,
    num_leaves=25,
    subsample=0.5,
    colsample_bytree=0.5,
    is_unbalance=True,
    seed=0,
    reg_alpha=0.1,
)

# Train for at most 1000 rounds, reporting the metric on both the training
# and validation sets, with early_stopping_rounds=20.
bst = lgb.train(
    params,
    trn_lgb,
    num_boost_round=1000,
    valid_sets=[trn_lgb, val_lgb],
    early_stopping_rounds=20,
)

[1]	training's binary_logloss: 0.651201	valid_1's binary_logloss: 0.651182
Training until validation scores don't improve for 20 rounds.
[2]	training's binary_logloss: 0.613257	valid_1's binary_logloss: 0.613224
[3]	training's binary_logloss: 0.578793	valid_1's binary_logloss: 0.578747
[4]	training's binary_logloss: 0.547379	valid_1's binary_logloss: 0.547318
[5]	training's binary_logloss: 0.518658	valid_1's binary_logloss: 0.518581
[6]	training's binary_logloss: 0.492325	valid_1's binary_logloss: 0.492238
[7]	training's binary_logloss: 0.468129	valid_1's binary_logloss: 0.46803
[8]	training's binary_logloss: 0.445847	valid_1's binary_logloss: 0.445734
[9]	training's binary_logloss: 0.425291	valid_1's binary_logloss: 0.425168
[10]	training's binary_logloss: 0.406295	valid_1's binary_logloss: 0.406159
[11]	training's binary_logloss: 0.388717	valid_1's binary_logloss: 0.388574
[12]	training's binary_logloss: 0.372431	valid_1's binary_logloss: 0.37228
[13]	training's binary_logloss: 0.357

[110]	training's binary_logloss: 0.150362	valid_1's binary_logloss: 0.151621
[111]	training's binary_logloss: 0.150313	valid_1's binary_logloss: 0.151609
[112]	training's binary_logloss: 0.150258	valid_1's binary_logloss: 0.151603
[113]	training's binary_logloss: 0.150211	valid_1's binary_logloss: 0.151596
[114]	training's binary_logloss: 0.150163	valid_1's binary_logloss: 0.151574
[115]	training's binary_logloss: 0.150121	valid_1's binary_logloss: 0.151565
[116]	training's binary_logloss: 0.150077	valid_1's binary_logloss: 0.151554
[117]	training's binary_logloss: 0.150036	valid_1's binary_logloss: 0.151541
[118]	training's binary_logloss: 0.14999	valid_1's binary_logloss: 0.151528
[119]	training's binary_logloss: 0.149944	valid_1's binary_logloss: 0.151522
[120]	training's binary_logloss: 0.149892	valid_1's binary_logloss: 0.151516
[121]	training's binary_logloss: 0.149848	valid_1's binary_logloss: 0.151512
[122]	training's binary_logloss: 0.149806	valid_1's binary_logloss: 0.151506


In [9]:
# bst = lgb.train(params, trn_lgb, 1000, valid_sets=[trn_lgb, val_lgb], early_stopping_rounds=20,
#                 init_model=bst, 
#                 learning_rates=lambda iter: 0.1 * (0.99 ** iter))

In [10]:
# imp_df = pd.DataFrame([bst.feature_importance()], columns=feature_columns, index=['importance']).T.sort_values(by='importance', ascending=False)
# imp_df

## Predict

In [11]:
# Score the test rows. Pass the raw value array (the same form the training
# Dataset was built from) and pin the prediction to the iteration selected by
# early stopping; a non-positive best_iteration makes LightGBM use all
# iterations.
tst_pred = bst.predict(X_tst.values, num_iteration=bst.best_iteration)

# Build the submission positionally (`.values`) so ids and predictions are
# paired row-by-row and do not depend on tst_df's index labels matching a
# fresh 0..n-1 index. The id column is written first, then the label column.
res_df = pd.DataFrame(
    {
        output_id_col: tst_df[output_id_col].values,
        output_label_col: tst_pred,
    },
    columns=[output_id_col, output_label_col],
)
res_df.to_csv(submission_path, index=False)