In [25]:
import pandas as pd
import numpy as np

import lightgbm as lgb
# Environment setup (run once, in a terminal or in a dedicated first cell, before
# the imports in this cell are executed):
#     %conda install -c conda-forge lightgbm
# Kept as a comment on purpose: an install command is not Python, and placed
# after `import lightgbm` it could not satisfy that import anyway.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Load the raw table, labelling its columns a1 ... a42 explicitly via `names`.
column_names = ["a" + str(idx) for idx in range(1, 43)]
data = pd.read_csv("data.csv", names=column_names)

In [4]:
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,...,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42
0,48.0,32.0,47.0,64.0,34.0,14.0,14.0,15.0,42,61,...,62,33,33,41,11,13,14,7,6,0
1,34.0,21.0,82.0,48.0,29.0,11.0,14.0,14.0,34,31,...,48,61,63,58,7,5,14,13,12,1
2,45.0,34.0,54.0,65.0,43.0,13.0,11.0,9.0,42,61,...,62,33,33,41,11,13,14,7,6,0
3,69.0,57.0,47.0,85.0,47.0,6.0,11.0,10.0,37,32,...,61,62,64,60,8,6,15,14,13,1
4,36.0,30.0,50.0,59.0,35.0,6.0,13.0,13.0,37,32,...,61,62,64,60,8,6,15,14,13,1


In [5]:
# The label column is named once; every other column of `data` is a model feature.
# np.setdiff1d returns the remaining column names as a sorted, de-duplicated array.
target = "a42"
cols_for_model = np.setdiff1d(data.columns, [target])

In [6]:
# Full-data LightGBM Dataset (all feature columns + the label), used for cross-validation.
train_d = lgb.Dataset(
    data=data[cols_for_model],
    label=data[target],
)

In [8]:
# LightGBM configuration shared by the cross-validation run and the final fit.
params = dict(
    # What to fit and how it is scored.
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    # Tree size and step size.
    num_leaves=50,
    learning_rate=0.01,
    # Column / row subsampling.
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_data_in_leaf=20,
    # Parallelism.
    num_threads=4,
    # Fixed seeds so repeated runs use the same random draws.
    seed=2017,
    bagging_seed=2,
    feature_fraction_seed=256,
)

In [11]:
# 5-fold cross-validation on the full Dataset: up to 1000 boosting rounds,
# early stopping with a patience of 30 rounds, progress logged every 10 rounds.
lgb_cv_obj = lgb.cv(
    params,
    train_d,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=30,
    verbose_eval=10,
)

[10]	cv_agg's binary_error: 0.430229 + 1.63888e-05
[20]	cv_agg's binary_error: 0.430181 + 0.000238243
[30]	cv_agg's binary_error: 0.429894 + 0.000380747
[40]	cv_agg's binary_error: 0.428983 + 0.000671612
[50]	cv_agg's binary_error: 0.426537 + 0.00149333
[60]	cv_agg's binary_error: 0.424763 + 0.00145276
[70]	cv_agg's binary_error: 0.42107 + 0.0036306
[80]	cv_agg's binary_error: 0.418816 + 0.00496992
[90]	cv_agg's binary_error: 0.416514 + 0.00482712
[100]	cv_agg's binary_error: 0.413206 + 0.00522658
[110]	cv_agg's binary_error: 0.41287 + 0.00504478
[120]	cv_agg's binary_error: 0.409705 + 0.0035901
[130]	cv_agg's binary_error: 0.409562 + 0.00524269
[140]	cv_agg's binary_error: 0.408267 + 0.00456544
[150]	cv_agg's binary_error: 0.407596 + 0.00554945
[160]	cv_agg's binary_error: 0.407548 + 0.00583072
[170]	cv_agg's binary_error: 0.406589 + 0.00578921
[180]	cv_agg's binary_error: 0.407164 + 0.00501379
[190]	cv_agg's binary_error: 0.406637 + 0.00548133
[200]	cv_agg's binary_error: 0.407213 + 

In [14]:
# The length of the CV mean-error history becomes the boosting-round budget
# handed to the final lgb.train call.
cv_mean_errors = lgb_cv_obj["binary_error-mean"]
num_rounds = len(cv_mean_errors)

In [15]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
features = data[cols_for_model]
labels = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.30,
    random_state=2018,
)

In [17]:
# Wrap the two splits in LightGBM Dataset objects.
cls_train_obj = lgb.Dataset(X_train, y_train)
# Tie the validation Dataset to the training Dataset explicitly: LightGBM
# expects validation data to be aligned with the training data it is compared against.
validation_set = lgb.Dataset(X_test, y_test, reference=cls_train_obj)

In [18]:
# Final fit on the training split, capped at the CV-derived round count and
# early-stopped (patience 30) on `validation_set`; progress is logged every 10 rounds.
# NOTE(review): `validation_set` is built from X_test/y_test, the same rows that
# are scored afterwards, so the reported accuracy is tuned on its own test data.
# Consider a separate validation split for early stopping.
lgb_train_obj = lgb.train(
    params,
    cls_train_obj,
    num_boost_round=num_rounds,
    valid_sets=validation_set,
    early_stopping_rounds=30,
    verbose_eval=10,
)

Training until validation scores don't improve for 30 rounds.
[10]	valid_0's binary_error: 0.420329
[20]	valid_0's binary_error: 0.420329
[30]	valid_0's binary_error: 0.420169
[40]	valid_0's binary_error: 0.418571
[50]	valid_0's binary_error: 0.417932
[60]	valid_0's binary_error: 0.417612
[70]	valid_0's binary_error: 0.413777
[80]	valid_0's binary_error: 0.411379
[90]	valid_0's binary_error: 0.41042
[100]	valid_0's binary_error: 0.412977
[110]	valid_0's binary_error: 0.409621
[120]	valid_0's binary_error: 0.409781
[130]	valid_0's binary_error: 0.404667
[140]	valid_0's binary_error: 0.405626
[150]	valid_0's binary_error: 0.405306
[160]	valid_0's binary_error: 0.405786
Early stopping, best iteration is:
[130]	valid_0's binary_error: 0.404667


In [20]:
# Predicted probabilities for the held-out rows. The early-stopped best iteration
# is requested explicitly so the result does not depend on predict()'s default
# choice of how many boosting rounds to use.
preds = lgb_train_obj.predict(X_test, num_iteration=lgb_train_obj.best_iteration)

In [22]:
# Turn scores into hard 0/1 labels: strictly above 0.5 -> 1, otherwise 0.
above_threshold = preds > 0.5
preds = np.where(above_threshold, 1, 0)

In [24]:
# Fraction of held-out rows whose thresholded prediction matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

0.5953332267859996

In [30]:
# Counts of true vs. predicted labels on the held-out rows (rows: truth, columns: prediction).
confusion_matrix(y_true=y_test, y_pred=preds)