# Light GBM

* LightGBM is a gradient boosting machine (GBM) implementation designed to improve on the training speed and efficiency of XGBoost.

## 1-)MODEL

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw diabetes table; `df` is a working copy with every row that
# contains a missing value removed.
diabetes = pd.read_csv("diabetes.csv")
df = diabetes.copy().dropna()

# Separate the feature columns from the target column.
X = df.drop(columns="Outcome")
y = df["Outcome"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)



In [3]:
# One-time setup if LightGBM is not installed (run in a terminal): conda install -c conda-forge lightgbm

In [4]:
from lightgbm import LGBMClassifier

In [5]:
# Baseline: a LightGBM classifier with library defaults, trained on the
# training split. `fit` hands back the fitted estimator itself.
lgbm_model = LGBMClassifier().fit(X_train, y_train)
lgbm_model

LGBMClassifier()

## 2-)Prediction

In [9]:
# Class labels predicted by the baseline model for the held-out rows;
# preview the first ten.
y_pred = lgbm_model.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1], dtype=int64)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [11]:
# Test-set accuracy of the baseline model (before tuning).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7272727272727273

In [12]:
# Test-set confusion matrix of the baseline model (before tuning).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[115,  36],
       [ 27,  53]], dtype=int64)

In [13]:
# Per-class precision / recall / F1 of the baseline model (before tuning).
# The report is a plain string, so it is printed to keep its table layout.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       151
           1       0.60      0.66      0.63        80

    accuracy                           0.73       231
   macro avg       0.70      0.71      0.71       231
weighted avg       0.74      0.73      0.73       231



## 3-)Model Tuning

* In this section, we search for the best values of **n_estimators, max_depth, learning_rate, min_child_samples, subsample** using `GridSearchCV`.


* GridSearchCV: grid search with cross-validation. Every combination of values in the parameter grid is evaluated with k-fold cross-validation.



* Then we will build the final model using the best values found for **n_estimators, max_depth, learning_rate, min_child_samples, subsample**.





* **n_estimators, max_depth, learning_rate, min_child_samples, subsample** are hyperparameters: values that we have to choose ourselves before training, and that we want to choose as well as possible.



* Rather than picking these hyperparameter values by intuition, we will find them with the grid search method.




* **max_depth**: The maximum depth of each tree. LightGBM's default is -1, which means the depth is not limited.





* **n_estimators**: The number of boosted trees to fit (the number of boosting rounds).






In [16]:
# Default learning_rate, read from the untuned baseline model.
lgbm_model.learning_rate

0.1

In [17]:
# Default max_depth, read from the untuned baseline model.
# NOTE(review): LightGBM documents values <= 0 as "no depth limit" — confirm.
print( lgbm_model.max_depth)

-1


In [19]:
# Default min_child_samples, read from the untuned baseline model.
lgbm_model.min_child_samples

20

In [20]:
# Default n_estimators, read from the untuned baseline model.
lgbm_model.n_estimators

100

In [21]:
# Default subsample, read from the untuned baseline model.
lgbm_model.subsample

1.0

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Candidate values for each hyperparameter. The grid search tries every
# combination: 4 * 3 * 4 * 4 * 3 = 576 candidates.
# NOTE(review): LightGBM's docs say bagging (`subsample`) is only enabled when
# `subsample_freq` > 0 — confirm. If so, with the default estimator used here
# the three `subsample` values train identical models and only add run time.
lgbm_params = dict(
    n_estimators=[100, 500, 1000, 2000],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.1, 0.01, 0.02, 0.05],
    min_child_samples=[5, 10, 20],
)

In [24]:
# Exhaustive search over `lgbm_params`, scoring each candidate with 10-fold
# cross-validation. n_jobs=-1 runs the fits in parallel and verbose=2 prints
# progress while the search runs.
lgbm = LGBMClassifier()
lgbm_cv_model = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [25]:
# Run the grid search on the training split only. This is the expensive cell:
# the recorded run performed 5760 fits and took about 15.5 minutes.
lgbm_cv_model.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 238 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 497 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 864 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1229 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1674 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2245 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 2852 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 3661 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 4470 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 5441 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 15.5min finished


GridSearchCV(cv=10, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
                         'max_depth': [3, 4, 5, 6],
                         'min_child_samples': [5, 10, 20],
                         'n_estimators': [100, 500, 1000, 2000],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=2)

In [26]:
# Hyperparameter combination selected by the grid search.
lgbm_cv_model.best_params_

{'learning_rate': 0.01,
 'max_depth': 3,
 'min_child_samples': 20,
 'n_estimators': 500,
 'subsample': 0.6}

### 3.1-) Tuned Model

In [27]:
# Build the final (still unfitted) classifier directly from the grid-search
# result instead of re-typing the winning values by hand. A hand-copied set of
# values silently goes stale whenever the grid, the data or the seed changes
# and the search is re-run.
# In the recorded run this resolves to learning_rate=0.01, max_depth=3,
# min_child_samples=20, n_estimators=500, subsample=0.6.
lgbm = LGBMClassifier(**lgbm_cv_model.best_params_)

In [28]:
# Train the tuned classifier on the training split; `fit` returns the fitted
# estimator, which is kept under its own name.
lgbm_tuned = lgbm.fit(X=X_train, y=y_train)

In [30]:
# Class labels predicted by the tuned model for the held-out rows;
# preview the first ten.
y_pred1 = lgbm_tuned.predict(X_test)
y_pred1[:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1], dtype=int64)

In [31]:
# Test-set accuracy of the tuned model (after tuning).
accuracy_score(y_true=y_test, y_pred=y_pred1)

0.7445887445887446

In [32]:
# Test-set confusion matrix of the tuned model (after tuning).
confusion_matrix(y_true=y_test, y_pred=y_pred1)

array([[121,  30],
       [ 29,  51]], dtype=int64)

In [33]:
# Per-class precision / recall / F1 of the tuned model (after tuning).
# The report is a plain string, so it is printed to keep its table layout.
print(classification_report(y_true=y_test, y_pred=y_pred1))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       151
           1       0.63      0.64      0.63        80

    accuracy                           0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.75      0.74      0.74       231

