In [3]:
# SMOTE on titanic dataset to remove imbalance between survived = 0 and survived = 1 
# (label/predicted class value)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [None]:
df = pd.read_csv('titanic.csv')

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [5]:
df = df[['survived', 'pclass', 'gender', 'age', 'sibsp', 'fare']].dropna()
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925
3,1,1,female,35.0,1,53.1
4,0,3,male,35.0,0,8.05


In [6]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

x = df.drop('survived', axis = 1)
y = df['survived']

In [7]:
# Check class distribution before SMOTE
print('Class distribution before SMOTE:')
print(y.value_counts())

Class distribution before SMOTE:
survived
0    424
1    290
Name: count, dtype: int64


In [None]:
# split data

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42, 
                                                    stratify=y)

In [9]:
# ---------------Before SMOTE----------------
model = LogisticRegression(max_iter=500)
model.fit(x_train, y_train)
y_pred_before = model.predict(x_test)
print("\n BEFORE SMOTE:")
print("Accuracy:", accuracy_score(y_test,y_pred_before))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_before))


 BEFORE SMOTE:
Accuracy: 0.7953488372093023
Confusion Matrix:
 [[106  22]
 [ 22  65]]


In [10]:
# ---------------Apply SMOTE----------------
smote = SMOTE(random_state=42)
x_train_res, y_train_res = smote.fit_resample(x_train, y_train)

In [11]:
# Check class distribution after SMOTE

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_res).value_counts())


Class distribution after SMOTE:
survived
0    296
1    296
Name: count, dtype: int64


In [12]:
model_smote = LogisticRegression(max_iter=500)
model_smote.fit(x_train_res, y_train_res)
y_pred_after = model_smote.predict(x_test)
print("\n AFTER SMOTE:")
print("Accuracy:", accuracy_score(y_test,y_pred_after))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_after))


 AFTER SMOTE:
Accuracy: 0.7813953488372093
Confusion Matrix:
 [[101  27]
 [ 20  67]]
