In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report


We import the libraries needed for the churn-prediction task: pandas for data manipulation, NumPy for numerical operations, and scikit-learn for preprocessing, the machine learning models, and the metrics used to evaluate them.

In [2]:
# Read the dataset CSV into a pandas DataFrame
csv_path = 'Churn_Modelling.csv'
data = pd.read_csv(csv_path)


We load the dataset 'Churn_Modelling.csv' into a Pandas DataFrame called data.

In [3]:
# Remove the columns that the analysis does not use
irrelevant_columns = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(irrelevant_columns, axis=1)


We drop the columns 'RowNumber', 'CustomerId', and 'Surname' from our dataset as they are not relevant for our analysis.

In [4]:
# Encode categorical variables
# Each column gets its own LabelEncoder. Re-fitting one shared encoder would
# overwrite its classes_, leaving the 'Geography' codes impossible to map back
# to their original labels. The fitted encoders are kept in `label_encoders`
# so that either column can be decoded later with inverse_transform.
label_encoders = {}
for column in ['Geography', 'Gender']:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
# After the loop, `label_encoder` is the encoder fitted on 'Gender'.


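We use label encoding to convert the categorical variables ('Geography' and 'Gender') into numerical format for our machine learning models. Note that label encoding assigns each category an arbitrary integer, which implies an ordering between the 'Geography' values; linear models such as logistic regression treat that ordering as meaningful, so one-hot encoding is a common alternative for such columns.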

In [5]:
# Pull out the target column, then keep everything else as features
target_column = 'Exited'
y = data[target_column]
X = data.drop(target_column, axis=1)


We separate the features (X) and the target variable (y) from our dataset. 'Exited' is the target variable we want to predict.

In [6]:
# Fit a StandardScaler on all feature columns, then apply it to them
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


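We standardize every feature column in our dataset (including the label-encoded categorical ones) to have a mean of 0 and a standard deviation of 1, which can help improve the performance of some machine learning algorithms, such as logistic regression. Note that the scaler here is fitted on the full dataset, before the train/test split.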

In [7]:
# Split data into train and test sets
# The split is made on the unscaled features, and the scaler is then fitted on
# the training rows only. Fitting the scaler on all rows before splitting would
# let test-set means and variances leak into the data the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics


We split our dataset into training and testing sets. The training set will be used to train our models, and the testing set will be used to evaluate their performance.

In [8]:
# Create the three (not yet fitted) classifiers that will be compared
ensemble_settings = {'n_estimators': 100, 'random_state': 42}
logistic_reg = LogisticRegression()
random_forest = RandomForestClassifier(**ensemble_settings)
gradient_boosting = GradientBoostingClassifier(**ensemble_settings)


We initialize three machine learning models: Logistic Regression, Random Forest, and Gradient Boosting.

In [9]:
# Fit each classifier on the training split
for classifier in (logistic_reg, random_forest, gradient_boosting):
    classifier.fit(X_train, y_train)


We train the initialized models using the training data.

In [10]:
# Predict test-set labels with each fitted classifier
logistic_reg_pred, random_forest_pred, gradient_boosting_pred = [
    classifier.predict(X_test)
    for classifier in (logistic_reg, random_forest, gradient_boosting)
]


We make predictions using the trained models on the testing data.

In [11]:
# Print the test-set accuracy of each model, one line per model
accuracy_lines = [
    ("Logistic Regression Accuracy:", logistic_reg_pred),
    ("Random Forest Accuracy:", random_forest_pred),
    ("Gradient Boosting Accuracy:", gradient_boosting_pred),
]
for label, predictions in accuracy_lines:
    print(label, accuracy_score(y_test, predictions))


Logistic Regression Accuracy: 0.8155
Random Forest Accuracy: 0.8645
Gradient Boosting Accuracy: 0.866


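We evaluate the performance of each model by calculating the accuracy score on the testing data. Accuracy should be read with the class balance in mind: only 393 of the 2,000 test customers exited, so a model that always predicts "not exited" would already score about 0.80.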

In [12]:
# Print the per-class precision/recall/F1 report for each model
report_headers = [
    ("\nLogistic Regression Classification Report:\n", logistic_reg_pred),
    ("\nRandom Forest Classification Report:\n", random_forest_pred),
    ("\nGradient Boosting Classification Report:\n", gradient_boosting_pred),
]
for header, predictions in report_headers:
    print(header, classification_report(y_test, predictions))



Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.82      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.82      0.77      2000


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.58       393

    accuracy                           0.86      2000
   macro avg       0.81      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000


Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
 

We print the classification report for each model, which includes precision, recall, F1-score, and support for each class. This provides more detailed performance metrics for each model.