# LightGBM Grid Search CV

In [1]:
# Relative locations of the read-only input data and of the writable output area.
input_dir, working_dir = '../input/', '../working/'

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read train.csv and test.csv from input_dir, then print each frame's
# (rows, columns) shape on its own line.
train, test = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
print(train.shape, test.shape, sep='\n')

(9557, 143)
(23856, 142)


In [4]:
# Use each row's 'Id' value as its index label (the 'Id' column itself is kept).
train.index = train['Id'].values
test.index = test['Id'].values

# Training frame: individual ID, household ID, target value, and the
# remaining columns as features.
train_Id, train_idhogar, train_y = train['Id'], train['idhogar'], train['Target']
train_X = train.drop(columns=['Id', 'Target', 'idhogar'])

# Test frame: individual ID, household ID, and the remaining columns as features
# (only the two identifier columns are removed here).
test_Id, test_idhogar = test['Id'], test['idhogar']
test_X = test.drop(columns=['Id', 'idhogar'])

# Row-wise union of train followed by test, for each of the three pieces.
all_Id = pd.concat([train_Id, test_Id])
all_idhogar = pd.concat([train_idhogar, test_idhogar])
all_X = pd.concat([train_X, test_X])

## Convert object columns into numeric
* 'no' -> 0
* 'yes' -> mean of the column's numeric entries (those that are neither 'yes' nor 'no')

In [5]:
# Show only the feature columns whose dtype is 'object'.
all_X.dtypes.loc[all_X.dtypes.eq('object')]

dependency    object
edjefe        object
edjefa        object
dtype: object

In [6]:
def obj_into_numeric(X=None, obj_list=('dependency', 'edjefe', 'edjefa')):
    """Convert mixed 'yes'/'no'/number object columns of ``X`` to float, in place.

    For every column named in ``obj_list``:

    * ``'no'`` becomes ``0``;
    * ``'yes'`` becomes the mean of the column's remaining entries (everything
      that is neither ``'yes'`` nor ``'no'``), after casting them to float;
    * every other entry is cast to float unchanged.

    Parameters
    ----------
    X : pandas.DataFrame, optional
        Frame to convert. It is modified in place. When omitted, the
        notebook-level ``all_X`` frame is used.
    obj_list : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object that was passed in, with the listed columns
        replaced by float columns.

    Raises
    ------
    KeyError
        If a name in ``obj_list`` is not a column of ``X``.
    ValueError
        If a column holds text other than 'yes'/'no' that cannot be cast to float.
    """
    if X is None:
        # Resolved at call time (not at definition time) so that defining the
        # function does not require all_X to exist yet.
        X = all_X

    for obj_i in obj_list:
        raw = X[obj_i]
        is_no = raw == 'no'
        is_yes = raw == 'yes'

        # Blank out the two keywords so the remaining entries can be cast;
        # the blanks become NaN and are skipped by mean().
        numeric = raw.where(~(is_no | is_yes)).astype('float')
        yes_value = numeric.mean()

        # Fill the keywords through their own masks rather than through a
        # sentinel number, so genuine numeric values are never overwritten.
        # Assigning the whole column back (instead of calling
        # X[obj_i].replace(..., inplace=True)) also works when pandas
        # Copy-on-Write is enabled.
        X[obj_i] = numeric.mask(is_no, 0.0).mask(is_yes, yes_value)
    return X

In [7]:
# Convert train and test together so that 'yes' is replaced by the SAME number
# in both frames. Converting each frame on its own computes one mean per frame,
# which gives a single category two different numeric encodings.
# A copy of the union frame is converted so all_X itself stays untouched, then
# the result is cut back into its train rows (first) and test rows (rest).
n_train_rows = len(train_X)
all_X_numeric = obj_into_numeric(all_X.copy())
train_X = all_X_numeric.iloc[:n_train_rows]
test_X = all_X_numeric.iloc[n_train_rows:]

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
import lightgbm as lgb

In [9]:
# Hold out 10% of the labelled rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Macro-averaged F1 is the metric used to rank the candidates.
F1_scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Candidate values for LightGBM's num_leaves.
gbm_param = dict(num_leaves=[210, 230, 240])

# Cross-validated grid search over gbm_param, fitted on the training split;
# the cell's output is the winning parameter combination.
gbm = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=gbm_param,
    scoring=F1_scorer,
)
gbm.fit(X_train, y_train)
gbm.best_params_

In [13]:
# Display the grid-search object's repr as this cell's output.
gbm

{'num_leaves': 120}

In [None]:
import pickle

# Serialise the grid-search object to a file inside working_dir.
with open(os.path.join(working_dir, '20180720_lgbm_gridcv'), 'wb') as f:
    f.write(pickle.dumps(gbm))
    

In [None]:
# Score the searched model on the held-out split: macro F1 plus the confusion matrix.
y_test_pred = gbm.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_test_pred, average='macro')
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("confusion matrix: \n", cm)
print("f1: \n", f1)


In [None]:
# Predict for the test features and label each prediction with its row's Id value.
pred = pd.Series(gbm.predict(test_X), index=test_Id.values, name='Target')

In [None]:
# Read sample_submission.csv from input_dir and preview its first five rows.
sample_submission = pd.read_csv(
    os.path.join(input_dir, 'sample_submission.csv')
)
sample_submission.head(5)

In [None]:
# Build the submission frame: the test Ids side by side with their predicted Target.
# The `join_axes` argument was removed from pd.concat in pandas 1.0 (passing it
# raises TypeError); reindexing the concatenated result on test_Id.index is the
# documented replacement and keeps the rows in test order.
pred = pd.concat([test_Id, pred], axis=1).reindex(test_Id.index)
pred.head()

In [None]:
# Write the submission frame to CSV, leaving out the index column.
pred.to_csv(path_or_buf='20180720_lgbm_gridcv.csv', index=False)