In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB 


In [None]:
# Load the dataset
# The path is relative, so 'titanic.csv' must sit in the notebook's working directory.
data = pd.read_csv('titanic.csv')
# Preview the first 20 rows.
data.head(20)

In [None]:
# Print the frame's column-level summary (used here to spot columns with missing values).
data.info()

In [None]:
# Descriptive statistics for the dataset's columns.
data.describe()

In [None]:
# Preprocessing: quantify the missing values in each column.
null_mask = data.isnull()
# Absolute number of nulls per column, largest first.
total = null_mask.sum().sort_values(ascending=False)
# Same information as a percentage of all rows, rounded to one decimal.
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
# Side-by-side table of counts and percentages.
missing_data = pd.concat({'Total': total, '%': percent_2}, axis=1)
missing_data.head(5)

In [None]:
# Explore the data: how does age relate to survival?
# The comparison is made separately for women (left panel) and men (right panel),
# overlaying the age histogram of survivors on that of non-survivors.
survived = 'survived'
not_survived = 'not survived'
women = data[data['Gender'] == 'female']
men = data[data['Gender'] == 'male']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
for panel, (subset, title) in enumerate([(women, 'Female'), (men, 'Male')]):
    lived_ages = subset.loc[subset['Survived'] == 1, 'Age'].dropna()
    died_ages = subset.loc[subset['Survived'] == 0, 'Age'].dropna()
    # Survivors are drawn semi-transparent with coarser bins (18 vs. 40).
    ax = sns.histplot(lived_ages, bins=18, label=survived, ax=axes[panel], kde=False, alpha=0.3)
    ax = sns.histplot(died_ages, bins=40, label=not_survived, ax=axes[panel], kde=False)
    ax.legend()
    _ = ax.set_title(title)
# Observation: different age groups of men and women have varying survival chances,
# and the relationship is not linear, so age needs to be turned into classes.

In [None]:
# Look for a relationship between port of embarkation and survival.
# One row of the grid per 'Embarked' value; each panel is a point plot of
# 'Survived' against 'Pclass', with 'Gender' as the hue.
FacetGrid = sns.FacetGrid(data=data, row='Embarked', aspect=1.6)
FacetGrid.map(
    sns.pointplot,
    'Pclass', 'Survived', 'Gender',
    order=None,
    hue_order=None,
    palette='dark:#1f77b4',
)
FacetGrid.add_legend()

# Author's observations from the plot:
# - Women on port Q have a higher chance of survival, but women on ports S and C have low chances.
# - Men have a high survival probability on ports C and S, but a low probability on port Q.

In [None]:
# Pclass appears to be negatively correlated with Survived, so look at it more closely:
# one bar per passenger class, with 'Survived' on the y-axis.
sns.barplot(x='Pclass', y='Survived', data=data)

In [None]:
# Author's observation: Pclass = 1 passengers are more likely to survive, while in the
# other classes the survivors fall into localised age clusters.
# Grid of age histograms: one row per Pclass, one column per Survived value.
grid = sns.FacetGrid(data=data, row='Pclass', col='Survived', aspect=1.6)
grid.map(plt.hist, 'Age', bins=20, alpha=0.5)
grid.add_legend()

In [None]:
# Combine SibSp and Parch into a single family-size feature called 'relatives'.
data['relatives'] = data['SibSp'] + data['Parch']
# 'not_alone' is 1 when the passenger has at least one relative aboard and 0 when
# travelling alone, so the flag's value agrees with its name.
data['not_alone'] = (data['relatives'] > 0).astype(int)
# Print explicitly: a bare expression in the middle of a cell is never displayed.
print(data['not_alone'].value_counts())
# Bar chart of 'Survived' against the number of relatives aboard.
axes = sns.catplot(data=data, x='relatives', y='Survived', aspect=2.5, kind='bar')

In [None]:
# 'Cabin' has a lot of null values, so the whole column is removed.
data = data.drop('Cabin', axis=1)

data

In [None]:
# Fill the NaN values in 'Age' with near-realistic random values.
mean = data["Age"].mean()
std = data["Age"].std()
is_null = data["Age"].isnull().sum()
# Seeded generator: without a fixed seed the imputed ages, and therefore every
# downstream metric, change on each Restart & Run All.
age_rng = np.random.default_rng(42)
# One integer age per missing entry, drawn uniformly from [mean - std, mean + std).
# The float bounds are truncated to int explicitly instead of relying on implicit casting.
rand_age = age_rng.integers(int(mean - std), int(mean + std), size=is_null)
# Write the generated ages into the NaN positions of a copy, then store it back as int.
age_slice = data["Age"].copy()
age_slice[age_slice.isna()] = rand_age
data["Age"] = age_slice.astype(int)

# Sanity check: number of nulls left in 'Age'.
data["Age"].isnull().sum()

In [None]:
# 'Embarked' has only 2 missing values, so it can be filled with its most common value.
# Inspect the column's summary to find that value.
data['Embarked'].describe()

In [None]:
# Replace the missing ports of embarkation with 'S'.
data.loc[data['Embarked'].isna(), 'Embarked'] = 'S'
# All null values are handled now; confirm via the column summary.
data.info()

In [None]:
# Encode gender numerically: male -> 0, female -> 1.
gender_codes = {'male': 0, 'female': 1}
data['Gender'] = data['Gender'].map(gender_codes)
print(data['Gender'])

In [None]:
# Convert integer ages into ordinal age classes 0..6.
# Intervals are right-closed: <=11 -> 0, (11, 18] -> 1, (18, 22] -> 2, (22, 27] -> 3,
# (27, 33] -> 4, (33, 40] -> 5, and everything above 40 -> 6.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
data['Age'] = pd.cut(data['Age'].astype(int), bins=age_edges, labels=False).astype(int)

# Check how the passengers are distributed over the classes.
data['Age'].value_counts()

In [None]:
# Numeric code for each title kept as a feature.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Extract the title: the run of letters that follows a space and precedes a period.
# Raw string, so `\.` is a regex escape and not an invalid Python string escape
# (which emits a DeprecationWarning/SyntaxWarning on current Python versions).
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse uncommon titles into a single 'Rare' bucket.
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Map alternative spellings onto their common equivalent.
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
# Convert titles into numbers; titles absent from `titles` become NaN here...
data['Title'] = data['Title'].map(titles)
# ...and are then filled with 0 to be safe.
data['Title'] = data['Title'].fillna(0)

In [None]:
# Remove the identifier-like columns that are not used as features.
# 'SibSp' and 'Parch' are kept even though 'relatives' is derived from them.
data = data.drop(['Ticket', 'PassengerId', 'Name'], axis=1)

In [None]:
# One-hot encode 'Pclass' and 'Embarked'. drop_first=True omits the first level of
# each, so k categories yield k-1 indicator columns.
ohe_data = pd.get_dummies(data,columns=['Pclass','Embarked'],drop_first=True)
ohe_data

In [None]:
# Pairwise correlations between all encoded columns, drawn as an annotated heatmap.
corr_mat = ohe_data.corr()
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_mat, annot=True, cmap="coolwarm", linewidths=.5, ax=corr_ax)

In [None]:
# Separate the target column from the feature matrix.
y = ohe_data['Survived']
X = ohe_data.drop(columns='Survived')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Baseline: random forest with default hyperparameters (seeded), trained on the training split.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf.predict(X_test)

# Report accuracy and per-class metrics.
print("Default Random Forest Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for tuning
n_estimators= [5,10 ,25 , 50, 100, 200]
max_depth= [None, 5, 10, 15,25,50]
min_samples_split= [2, 5, 10,20,50]

# Test accuracy for the n_estimators=10 / max_depth=50 slice, ordered like
# `min_samples_split`; the next cell plots this list.
accuracy = []
# Every (n_estimators, max_depth, min_samples_split, accuracy) combination, so the
# cost of fitting the full grid is not thrown away.
sweep_records = []

for estimator in n_estimators:
    for maxdepth in max_depth:
        for split in min_samples_split:
            rfc = RandomForestClassifier(n_estimators=estimator,max_depth=maxdepth,min_samples_split=split,random_state=42)
            rfc.fit(X_train, y_train)

            # Make predictions
            y_pred = rfc.predict(X_test)

            # Evaluate the model
            acc = accuracy_score(y_test, y_pred)
            sweep_records.append((estimator, maxdepth, split, acc))
            if estimator==10 and maxdepth == 50:
                accuracy.append(acc)

# Print the slice once, instead of the partially filled list on every matching iteration.
print(accuracy)

# NOTE: these scores are measured on the test split, so picking hyperparameters from
# this table gives an optimistic estimate; cross-validate on the training split for that.
rf_sweep_results = pd.DataFrame(
    sweep_records,
    columns=['n_estimators', 'max_depth', 'min_samples_split', 'accuracy'],
)
rf_sweep_results.sort_values('accuracy', ascending=False).head()

In [None]:
# Line plot of test accuracy against the min_samples_split values.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(min_samples_split, accuracy)
acc_ax.set_xlabel('Min Samples Split')
acc_ax.set_ylabel('Accuracy')
plt.show()

In [None]:
# Random forest with 100 trees. Seeded like every other model in this notebook so the
# confusion matrix, the report and the feature importances read from `rfc` afterwards
# are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Confusion matrix: rows are the actual classes, columns the predicted ones.
matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(matrix,annot=True,fmt='.0f',cmap="coolwarm",linewidths=.5)
plt.ylabel('Actual')
plt.xlabel("Predicted")
print(classification_report(y_test, y_pred))

In [None]:
# Pair each feature name with the importance reported by the fitted forest `rfc`,
# most important first.
feature_importances = pd.DataFrame(
    list(zip(X.columns, rfc.feature_importances_)),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

In [None]:
# Display the feature/importance table.
feature_importances

In [None]:
# Baseline: decision tree with default hyperparameters (seeded), trained on the training split.
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and build the confusion matrix.
y_pred = dtree.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)

# Text report.
print("Default Decision Tree Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap: actual classes on the y-axis, predicted on the x-axis.
plt.figure(figsize=(8, 6))
sns.heatmap(matrix, annot=True, fmt='.0f', cmap="coolwarm", linewidths=.5)
plt.ylabel('Actual')
plt.xlabel("Predicted")

In [None]:
# Hyperparameter candidates for the decision tree.
param_grid = {
    # Split-quality measure
    'criterion': ['gini', 'entropy'],
    # Maximum tree depth (None = unlimited)
    'max_depth': [None, 2, 4, 6, 8, 10],
    # Minimum samples needed to split an internal node
    'min_samples_split': [2, 5, 10],
    # Minimum samples required at a leaf
    'min_samples_leaf': [1, 2, 4, 6],
    # Features considered when searching for the best split
    'max_features': [None, 'sqrt', 'log2'],
}

# 5-fold cross-validated exhaustive search on the training split, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Score the refitted best tree on the held-out rows.
best_dtree = grid_search.best_estimator_
y_pred = best_dtree.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap: actual classes on the y-axis, predicted on the x-axis.
plt.figure(figsize=(8, 6))
sns.heatmap(matrix, annot=True, fmt='.0f', cmap="coolwarm", linewidths=.5)
plt.ylabel('Actual')
plt.xlabel("Predicted")

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression (up to 200 iterations), trained on the training split.
log_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict the held-out rows and report.
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_log))


# Manual sweep: every solver combined with every iteration budget, each evaluated
# on the held-out rows.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
max_iters = [10, 50, 100, 150, 200, 250, 300, 500]

for solver in solvers:
    for max_iter in max_iters:
        model = LogisticRegression(solver=solver, max_iter=max_iter)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        print(f"solver = {solver}  ------- iter = {max_iter}")
        print(classification_report(y_test, y_pred))

In [None]:
# Hyperparameter candidates for logistic regression, split into two sub-grids so that
# only valid solver/penalty combinations are tried:
#   * 'liblinear' and 'saga' both support 'l1' and 'l2';
#   * 'elasticnet' is supported by 'saga' only and additionally requires `l1_ratio`.
# A single flat grid would pair 'elasticnet' with 'liblinear' and omit `l1_ratio`,
# making every elasticnet candidate fail to fit.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength
iteration_budgets = [100, 200, 300]         # Maximum number of iterations
param_grid = [
    {
        'C': C_values,
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'max_iter': iteration_budgets,
    },
    {
        'C': C_values,
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],      # Mix between L1 (1.0) and L2 (0.0)
        'max_iter': iteration_budgets,
    },
]


# Set up GridSearchCV: 5-fold CV on the training split, scored by accuracy
grid_search = GridSearchCV(estimator=log_model, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)
# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Evaluate the refitted best model on the held-out rows
best_log_model = grid_search.best_estimator_
y_pred = best_log_model.predict(X_test)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours model with k = 5 (tuned in a later step),
# trained on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Keep the untuned accuracy in its own variable and report it.
accuracy_before_tuning = accuracy_score(y_test, y_pred)
print("Accuracy before tuning:", accuracy_before_tuning)
print("\nClassification Report (Before Tuning):")
print(classification_report(y_test, y_pred))

In [None]:
# Hyperparameter candidates for KNN.
param_grid = {
    # Neighbour counts 1 through 20
    'n_neighbors': [k for k in range(1, 21)],
    # Uniform voting vs. distance-weighted voting
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated exhaustive search on the training split, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Take the refitted best KNN from the search and predict the held-out rows.
best_knn = grid_search.best_estimator_
y_pred_tuned = best_knn.predict(X_test)
accuracy_after_tuning = accuracy_score(y_test, y_pred_tuned)

# Report the tuned model's accuracy and per-class metrics.
print("Accuracy after tuning:", accuracy_after_tuning)
print("\nClassification Report (After Tuning):")
print(classification_report(y_test, y_pred_tuned))