In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [6]:
titanic = pd.read_csv('artifacts/raw/titanic_train.csv')

In [7]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,593,0,3,"Elsbury, Mr. William James",male,47.0,0,0,A/5 3902,7.25,,S
1,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C
2,703,0,3,"Barbara, Miss. Saiide",female,18.0,0,1,2691,14.4542,,C
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,824,1,3,"Moor, Mrs. (Beila)",female,27.0,0,1,392096,12.475,E121,S


In [8]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            149
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         1
dtype: int64

In [9]:
titanic["Embarked"].value_counts()

Embarked
S    507
C    137
Q     67
Name: count, dtype: int64

In [10]:
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})

titanic['Embarked'] = titanic['Embarked'].astype('category').cat.codes

In [11]:
titanic["Embarked"].value_counts()

Embarked
2    508
0    137
1     67
Name: count, dtype: int64

In [12]:
titanic['Familysize'] = titanic['SibSp'] + titanic['Parch'] + 1

titanic['Isalone'] = (titanic['Familysize'] == 1).astype(int)

titanic['HasCabin'] = titanic['Cabin'].notnull().astype(int)

titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False).map(
    {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}
).fillna(4)

titanic['Pclass_Fare'] = titanic['Pclass'] * titanic['Fare']

titanic['Age_Fare'] = titanic['Age'] * titanic['Fare']

In [13]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Familysize,Isalone,HasCabin,Title,Pclass_Fare,Age_Fare
0,593,0,3,"Elsbury, Mr. William James",0,47.0,0,0,A/5 3902,7.25,,2,1,1,0,0.0,21.75,340.75
1,853,0,3,"Boulos, Miss. Nourelain",1,9.0,1,1,2678,15.2458,,0,3,0,0,1.0,45.7374,137.2122
2,703,0,3,"Barbara, Miss. Saiide",1,18.0,0,1,2691,14.4542,,0,2,0,0,1.0,43.3626,260.1756
3,705,0,3,"Hansen, Mr. Henrik Juul",0,26.0,1,0,350025,7.8542,,2,2,0,0,0.0,23.5626,204.2092
4,824,1,3,"Moor, Mrs. (Beila)",1,27.0,0,1,392096,12.475,E121,2,2,0,1,2.0,37.425,336.825


In [14]:
X = titanic[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Familysize', 'Isalone', 'HasCabin', 'Title', 'Pclass_Fare', 'Age_Fare']]
y = titanic['Survived']


In [15]:
titanic["Survived"].value_counts()

Survived
0    444
1    268
Name: count, dtype: int64

In [16]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [17]:
y_resampled.value_counts()

Survived
0    444
1    444
Name: count, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [19]:
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [20]:
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(rf, param_distributions, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

In [21]:
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

Random Forest Accuracy: 0.86
