# Name : Snehal shyam Jagtap

## Assignement No 15



### XGBM & LGBM
The objective of this assignment is to compare the performance of Light GBM and XG Boost algorithms using the Titanic dataset. 

In [1]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [2]:
pip install --upgrade pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade dask

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Step 1: Import Libraries and Load Data

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [5]:
# Load datasets
train_data = pd.read_csv('Titanic_train.csv')
test_data = pd.read_csv('Titanic_test.csv')

### Step 2: Data Preprocessing

In [6]:
# Check for missing values
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Fill missing values (you can use other strategies based on data insights)
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

In [8]:
# Drop the 'Cabin' column due to a large number of missing values
train_data.drop(columns=['Cabin'], inplace=True)

In [9]:
# Encoding categorical variables
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)

In [10]:
# Drop irrelevant columns like PassengerId, Name, Ticket
train_data.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [11]:
# Split features and target
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

In [12]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Step 3: Build Models and Evaluate Performance
3.1 LightGBM Model

In [13]:
# Train LightGBM model
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 195
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [14]:
# Predict on test set
y_pred_lgbm = lgbm.predict(X_test)

In [15]:
# Evaluation
print("LGBM Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print("LGBM Precision:", precision_score(y_test, y_pred_lgbm))
print("LGBM Recall:", recall_score(y_test, y_pred_lgbm))
print("LGBM F1 Score:", f1_score(y_test, y_pred_lgbm))

LGBM Accuracy: 0.8268156424581006
LGBM Precision: 0.7945205479452054
LGBM Recall: 0.7837837837837838
LGBM F1 Score: 0.7891156462585033


3.2 XGBoost Model

In [16]:
# Train XGBoost model
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [17]:
# Predict on test set
y_pred_xgb = xgb.predict(X_test)

In [18]:
# Evaluation
print("XGB Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGB Precision:", precision_score(y_test, y_pred_xgb))
print("XGB Recall:", recall_score(y_test, y_pred_xgb))
print("XGB F1 Score:", f1_score(y_test, y_pred_xgb))

XGB Accuracy: 0.8212290502793296
XGB Precision: 0.8
XGB Recall: 0.7567567567567568
XGB F1 Score: 0.7777777777777778


### Step 4: Hyperparameter Tuning and Cross-validation
4.1 LightGBM Hyperparameter Tuning

In [19]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 100]
}

grid_lgbm = GridSearchCV(LGBMClassifier(), param_grid, cv=5, scoring='accuracy')
grid_lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 179
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 8
[LightGBM] [Info] [binary:BoostFro

In [20]:
# Best parameters and score
print("Best LGBM parameters:", grid_lgbm.best_params_)
print("Best LGBM cross-validated score:", grid_lgbm.best_score_)

Best LGBM parameters: {'learning_rate': 0.01, 'n_estimators': 200, 'num_leaves': 31}
Best LGBM cross-validated score: 0.8286220821432089


4.2 XGBoost Hyperparameter Tuning

In [21]:
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7]
}

grid_xgb = GridSearchCV(XGBClassifier(), param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(X_train, y_train)

In [22]:
# Best parameters and score
print("Best XGB parameters:", grid_xgb.best_params_)
print("Best XGB cross-validated score:", grid_xgb.best_score_)

Best XGB parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Best XGB cross-validated score: 0.8356544863587116


### Step 5: Comparative Analysis

In [23]:
results = pd.DataFrame({
    'Model': ['LightGBM', 'XGBoost'],
    'Accuracy': [accuracy_score(y_test, y_pred_lgbm), accuracy_score(y_test, y_pred_xgb)],
    'Precision': [precision_score(y_test, y_pred_lgbm), precision_score(y_test, y_pred_xgb)],
    'Recall': [recall_score(y_test, y_pred_lgbm), recall_score(y_test, y_pred_xgb)],
    'F1 Score': [f1_score(y_test, y_pred_lgbm), f1_score(y_test, y_pred_xgb)]
})

print(results)

      Model  Accuracy  Precision    Recall  F1 Score
0  LightGBM  0.826816   0.794521  0.783784  0.789116
1   XGBoost  0.821229   0.800000  0.756757  0.777778
