# Testing LightGBM Using Titanic Dataset
Dataset: https://www.kaggle.com/c/titanic

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv("train.csv")
dataFeatures = data[[ 'Pclass', 'SibSp', 'Parch', 'Fare']]

In [54]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [68]:
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [70]:
dataFeatures = data[['Pclass', 'SibSp', 'Parch', 'Fare']]
dataFeatures = pd.concat([dataFeatures,pd.get_dummies(data['Embarked']),pd.get_dummies(data['Sex'])],sort=False,axis=1)

In [71]:
yvalues = data['Survived']

Note: There is a class imbalance

In [6]:
print(f"Class imbalance: Positives: {yvalues[yvalues==1].shape[0]} | Negative: {yvalues[yvalues==0].shape[0]}")
print(f"Predict All Negative Accuracy = {round(yvalues[yvalues==0].shape[0]/(yvalues[yvalues==1].shape[0] + yvalues[yvalues==0].shape[0]),2)}")

Class imbalance: Positives: 342 | Negative: 549
Predict All Negative Accuracy = 0.62


In [73]:
X_train, X_test, Y_train, Y_test = train_test_split(dataFeatures, yvalues, test_size = 0.2, random_state = 10)

## Using LightGBM For Classification

In [74]:
import lightgbm as lgb;

In [75]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [77]:
def objective(params):
    learning_rate = 10. ** params[0]
    num_leaves = params[1]
    n_estimators = params[2]
    
    lgbm = lgb.LGBMClassifier(
                        learning_rate=learning_rate,
                        num_leaves=num_leaves,
                        max_depth=8,
                        n_estimators=n_estimators,
                        verbose=1)

    return -np.mean(cross_val_score(lgbm, X_train, Y_train, cv=10, scoring='accuracy'))

Testing the function with arbitrary parameters

In [78]:
objective([-.5,2,100])

-0.7738849765258216

## Using Bayesian Optimization For Hyperparameter Tuning

In [79]:
from skopt import gp_minimize

In [80]:
space = [(-5.0,0.0),(2,100),(2,300)]

r = gp_minimize(objective, space, n_calls=100, random_state=444,n_jobs=-1,verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.8197
Function value obtained: -0.7907
Current minimum: -0.7907
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 2.4998
Function value obtained: -0.8033
Current minimum: -0.8033
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3696
Function value obtained: -0.7964
Current minimum: -0.8033
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 3.9383
Function value obtained: -0.7963
Current minimum: -0.8033
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.4799
Function value obtained: -0.6067
Current minimum: -0.8033
Iteration No: 6 started. 

Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 1.8281
Function value obtained: -0.7767
Current minimum: -0.8119
Iteration No: 42 started. Searching for the next optimal point.
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 3.6469
Function value obtained: -0.8033
Current minimum: -0.8119
Iteration No: 43 started. Searching for the next optimal point.
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 4.7528
Function value obtained: -0.8091
Current minimum: -0.8119
Iteration No: 44 started. Searching for the next optimal point.
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 5.5178
Function value obtained: -0.8020
Current minimum: -0.8119
Iteration No: 45 started. Searching for the next optimal point.
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 4.2333
Function value obtained: -0.8077
Current minimum: -0.8119
Iteration No: 46 st

Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 2.4080
Function value obtained: -0.8006
Current minimum: -0.8119
Iteration No: 82 started. Searching for the next optimal point.
Iteration No: 82 ended. Search finished for the next optimal point.
Time taken: 2.2907
Function value obtained: -0.8077
Current minimum: -0.8119
Iteration No: 83 started. Searching for the next optimal point.
Iteration No: 83 ended. Search finished for the next optimal point.
Time taken: 2.9675
Function value obtained: -0.7964
Current minimum: -0.8119
Iteration No: 84 started. Searching for the next optimal point.
Iteration No: 84 ended. Search finished for the next optimal point.
Time taken: 7.5359
Function value obtained: -0.8035
Current minimum: -0.8119
Iteration No: 85 started. Searching for the next optimal point.
Iteration No: 85 ended. Search finished for the next optimal point.
Time taken: 3.4377
Function value obtained: -0.7936
Current minimum: -0.8119
Iteration No: 86 st

In [81]:
best_params = r.x
best_params

[-2.2380598356874204, 25, 207]

In [82]:
r.fun

-0.811913145539906

In [83]:
final_lgbm = lgb.LGBMClassifier(num_leaves=best_params[1],
                        learning_rate= 10 ** best_params[0],
                        n_estimators=best_params[2],
                        verbose=1)
    
final_lgbm.fit(X_train, Y_train)
Y_pred = final_lgbm.predict(X_test)

Y_pred[Y_pred > 0.5] = 1
Y_pred[Y_pred <= 0.5] = 0

accuracy_score(Y_pred,Y_test)

0.8044692737430168

In [84]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

Y_true = Y_test
conf = confusion_matrix(Y_true, Y_pred)
tn, fp, fn, tp = conf.ravel()

print("LightGBM Results")

print(conf)

precision = (tp / (tp + fp))
recall = (tp / (tp + fn))
f1_score = 2*precision*recall/(precision + recall)

print ('\n')
print ("Accuracy:              %0.2f" % accuracy_score(Y_pred,Y_true))
print ("Precision:              %0.2f" % precision)
print ("Recall:                 %0.2f"% recall)
print ("F1 Score:               %0.4f"% f1_score)

LightGBM Results
[[102  15]
 [ 20  42]]


Accuracy:              0.80
Precision:              0.74
Recall:                 0.68
F1 Score:               0.7059


In [85]:
best_params = r.x
best_params

[-2.2380598356874204, 25, 207]

## Submit To Kaggle For Real Test Data

In [87]:
data_test = pd.read_csv("test.csv")

In [88]:
X_submit = data_test[['Pclass', 'SibSp', 'Parch', 'Fare']]
X_submit = pd.concat([X_submit,pd.get_dummies(data_test['Embarked']),pd.get_dummies(data_test['Sex'])],sort=False,axis=1)

In [94]:
print(X_train.columns)
print(X_submit.columns)
data_test.shape

Index(['Pclass', 'SibSp', 'Parch', 'Fare', 'C', 'Q', 'S', 'female', 'male'], dtype='object')
Index(['Pclass', 'SibSp', 'Parch', 'Fare', 'C', 'Q', 'S', 'female', 'male'], dtype='object')


(418, 11)

In [91]:
Y_lgbm_submit = final_lgbm.predict(X_submit)
Y_lgbm_submit[Y_lgbm_submit > 0.5] = 1
Y_lgbm_submit[Y_lgbm_submit <= 0.5] = 0

In [92]:
lgbm_submit = pd.DataFrame(list(zip(data_test['PassengerId'],Y_lgbm_submit)),columns=['PassengerId','Survived'])

In [93]:
lgbm_submit.to_csv("lgbm_submit.csv",index=False)

# FINAL RESULT

## LightGBM w AutoTuning

![lr](lgbm_titanic_6123.png)

# Thank You!