In [None]:
# Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Data Operations
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")


### Importing libraries

### Import data

In [None]:
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

### Data Understanding

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train.sample(5)

### Data Wrangling

#### Missing Values

In [None]:
df_train.isnull().sum()

##### Median imputation for Age

In [None]:
df_train["Age"].fillna(df_train["Age"].median(), inplace=True)

#### Outlier Detection and Removal

##### Plots before removing outliers

In [None]:
fig, axes = plt.subplots(1,3, figsize=(18,5))

# Fare vs Age
axes[0].scatter(df_train["Age"], df_train["Fare"])
axes[0].set_title("Fare vs Age")
axes[0].set_xlabel("Age")
axes[0].set_ylabel("Fare")

# Fare Boxplot
axes[1].boxplot(df_train["Fare"])
axes[1].set_title("Fare")

# Age Boxplot
axes[2].boxplot(df_train["Age"])
axes[2].set_title("Age")

plt.show()

#### IQR Method (fare)

In [None]:
q1 = df_train["Fare"].quantile(0.25)
q3 = df_train["Fare"].quantile(0.75)
iqr = q3 - q1

low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr

df_train = df_train.loc[(df_train["Fare"] > low) & (df_train["Fare"] < high)].reset_index(drop=True)

#### IQR Method (age)

In [None]:
q1 = df_train["Age"].quantile(0.25)
q3 = df_train["Age"].quantile(0.75)
iqr = q3 - q1

low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr

df_train = df_train.loc[(df_train["Age"] > low) & (df_train["Age"] < high)].reset_index(drop=True)

##### Plots after removing outliers

In [None]:
fig, axes = plt.subplots(1,3, figsize=(18,5))

# Fare vs Age
axes[0].scatter(df_train["Age"], df_train["Fare"])
axes[0].set_title("Fare vs Age")
axes[0].set_xlabel("Age")
axes[0].set_ylabel("Fare")

# Fare Boxplot
axes[1].boxplot(df_train["Fare"])
axes[1].set_title("Fare")

# Age Boxplot
axes[2].boxplot(df_train["Age"])
axes[2].set_title("Age")

plt.show()


#### Drop Features


In [None]:
# Cabin contains high amount of missing values, >50%
df_train.drop(columns = ["Cabin"], inplace=True)

# Ticket is a high cardinality feature
df_train.drop(columns="Ticket", inplace=True)

#### Converting data types

In [None]:
# category
df_train[["Embarked", "Sex"]] = df_train[["Embarked", "Sex"]].astype("category")

# uint8
df_train[["Pclass", "SibSp", "Age", "Parch", "Survived", "Fare"]] = df_train[["Pclass", "SibSp", "Age", "Parch", "Survived", "Fare"]].astype("uint8")

# unit16
df_train[["PassengerId"]] = df_train[["PassengerId"]].astype("uint16")

### Exploratory Data Analysis

#### Collinearity

In [None]:
corr = df_train.select_dtypes("number").drop(columns=["Survived", "PassengerId"]).corr()
sns.heatmap(corr, annot=True)

In [None]:
df_train.info()

#### Data visualization

##### Survivors by Passenger Type

In [None]:
survivors_count = df_train[df_train["Survived"] == True].groupby("Sex")["Survived"].count()
non_survivors_count = df_train[df_train["Survived"] == False].groupby("Sex")["Survived"].count()

combined_counts = pd.DataFrame({
    "Survivors": survivors_count,
    "Non-Survivors": non_survivors_count
})

combined_counts.plot(kind='bar')
plt.title("Survivors and Non-Survivors by Passenger Type")
plt.xlabel("Passenger Type")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()
print(combined_counts)

##### Survivors & Non-Survivor by Embark Town


In [None]:
total_count = df_train["Embarked"].value_counts()
survived_count = df_train[df_train["Survived"] == 1]["Embarked"].value_counts()
non_survivors_count = df_train[df_train["Survived"] == 0]["Embarked"].value_counts()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Plot total distribution
total_count.plot(kind="bar", ax=axes[0], color="skyblue", title="Embark Town Distribution")
axes[0].set_xlabel("Embarked")
axes[0].set_ylabel("Count")
axes[0].set_xticks(range(len(total_count.index)))
axes[0].set_xticklabels(total_count.index, rotation=0)

# Plot survivors distribution
survived_count.plot(kind="bar", ax=axes[1], color="green", title="Survivors from Embark Town")
axes[1].set_xlabel("Embarked")
axes[1].set_ylabel("Count")
axes[1].set_xticks(range(len(total_count.index)))
axes[1].set_xticklabels(total_count.index, rotation=0)

# Plot non-survivors distribution
non_survivors_count.plot(kind="bar", ax=axes[2], color="red", title="Non-Survivors from Embark Town")
axes[2].set_xlabel("Embarked")
axes[2].set_ylabel("Count")
axes[2].set_xticks(range(len(total_count.index)))
axes[2].set_xticklabels(total_count.index, rotation=0)

plt.show()

### Feature Engineering

#### Family Size

In [None]:
df_train["Family_size"] = df_train["SibSp"] + df_train["Parch"] + 1
plt.hist(df_train["Family_size"])
plt.xlabel("Family Size")
plt.ylim(0, 500)
plt.ylabel("Count")
plt.title("Family Size Distribution")
plt.show()

### Predictive Modeling

#### Features & Target Selection

In [None]:
df_train.columns

In [178]:
features = df_train[["Age", "Fare", "Sex", "SibSp", "Parch"]]

In [179]:
target = df_train["Survived"]

#### Model Selection

In [180]:
X_train = features
y_train = target

In [181]:
X_test = df_test[["Age","Fare", "Sex", "SibSp", "Parch"]]

In [182]:
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

In [183]:
model = make_pipeline(OneHotEncoder(), SimpleImputer(), RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [184]:
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [185]:
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

ValueError: Found input variables with inconsistent numbers of samples: [140, 418]

#### Multiple models

In [166]:
# models = {
#     "Logistic Regression": make_pipeline(OneHotEncoder(), SimpleImputer(), LogisticRegression()),
#     "Random Forest": make_pipeline(OneHotEncoder(), SimpleImputer(), RandomForestClassifier()),
#     "Support Vector Machine": make_pipeline(OneHotEncoder(), SimpleImputer(), SVC(kernel="linear")),
#     "Naive Bayes": make_pipeline(OneHotEncoder(), SimpleImputer(), GaussianNB()),
# }

In [167]:
# results = {}
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred, average="macro")
#     recall = recall_score(y_test, y_pred, average="macro")
#     f1 = f1_score(y_test, y_pred, average="macro")
#     results[name] = {
#         "Accuracy": accuracy,
#         "Precision": precision,
#         "Recall": recall,
#         "F1": f1
#     }

#### Model Metrics

In [168]:
# for name, metrics in results.items():
#     print(f"{name}:")
#     print(f"  Accuracy = {metrics['Accuracy'] * 100:.2f}%")
#     print(f"  Precision = {metrics['Precision'] * 100:.2f}%")
#     print(f"  Recall = {metrics['Recall'] * 100:.2f}%")
#     print(f"  F1 Score = {metrics['F1'] * 100:.2f}%")


#### Comparing model accuracy

In [169]:
# model_names = list(results.keys())
# accuracies = [metrics["Accuracy"] for metrics in results.values()]
# precisions = [metrics["Precision"] for metrics in results.values()]
# recalls = [metrics["Recall"] for metrics in results.values()]
# f1_scores = [metrics["F1"] for metrics in results.values()]
#
# fig, axes = plt.subplots(2, 2, figsize=(12, 8))
#
# # Plot accuracy
# axes[0, 0].bar(model_names, accuracies, width=0.5, align="center")
# axes[0, 0].set_title("Accuracy Comparison")
# axes[0, 0].set_ylabel("Accuracy")
# axes[0, 0].set_ylim(0, 1)
# axes[0, 0].tick_params(axis="x", labelrotation=45)
#
# # Plot precision
# axes[0, 1].bar(model_names, precisions, width=0.5, align="center")
# axes[0, 1].set_title("Precision")
# axes[0, 1].set_ylabel("Precision")
# axes[0, 1].set_ylim(0, 1)
# axes[0, 1].tick_params(axis="x", labelrotation=45)
#
# # Plot recall
# axes[1, 0].bar(model_names, recalls, width=0.5, align="center")
# axes[1, 0].set_title("Recall")
# axes[1, 0].set_ylabel("Recall")
# axes[1, 0].set_ylim(0, 1)
# axes[1, 0].tick_params(axis="x", labelrotation=45)
#
# # Plot F1 score
# axes[1, 1].bar(model_names, f1_scores, width=0.5, align="center")
# axes[1, 1].set_title("F1 Score")
# axes[1, 1].set_ylabel("F1 Score")
# axes[1, 1].set_ylim(0, 1)
# axes[1, 1].tick_params(axis="x", labelrotation=45)
#
# plt.tight_layout()
# plt.show()