### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import data

In [None]:
# Load the "titanic" example dataset through seaborn; every later cell
# reads from (and several rebind) this `df`.
df = sns.load_dataset("titanic")

### Data Understanding

In [None]:
# Size of the freshly loaded frame, shown as a (rows, columns) tuple.
df.shape

In [None]:
# Column-by-column overview: dtype and non-null count for each column.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame.
df.describe()

In [None]:
# Peek at ten random rows. The seed is fixed so the displayed rows are the
# same on every Restart & Run All.
df.sample(10, random_state=42)

### Data Wrangling

#### Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

##### Understanding age to choose the most appropriate fill value

In [None]:
# Rows where the passenger's age was not recorded.
df.loc[df["age"].isna()]

In [None]:
# How the rows without an age split across passenger type and port:
# non-null `sex` entries are counted per (who, embark_town) pair.
(
    df.loc[df["age"].isna()]
    .groupby(["who", "embark_town"])["sex"]
    .count()
)

In [None]:
# Distribution of the known ages within each passenger type.
(
    df.groupby(by=["who"])["age"]
    .describe()
)

In [None]:
# Overall median age - the value used below to fill the gaps.
df["age"].median()

In [None]:
# Replace missing ages with the overall median age.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the `df["age"]` selection: the chained
# in-place form operates on an intermediate object, triggers a FutureWarning
# in pandas 2.x, and leaves `df` unchanged under Copy-on-Write.
df["age"] = df["age"].fillna(df["age"].median())

#### Outlier Detection and Removal

In [None]:
# Age against fare, to eyeball extreme fares before filtering outliers.
plt.scatter(df["age"], df["fare"])
plt.title("Fare by Age")
plt.xlabel("Age")
plt.ylabel("Fare")
# plt.show() renders the figure and keeps the artist repr out of the output.
plt.show()

In [None]:
# One box per column (age, then fare) to visualise the spread and outliers.
plt.boxplot(df[["age", "fare"]])
# Boxes are drawn at positions 1 and 2 by default; name them after the
# columns they represent instead of leaving the numeric positions.
plt.xticks([1, 2], ["age", "fare"])
plt.title("Distribution of Age and Fare")
plt.ylabel("Value")
plt.show()

In [None]:
# Tukey fences on `fare`: keep only rows strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
# NOTE: `df` is rebound to the filtered frame, so re-running this cell
# recomputes the quartiles on already-filtered data.
q1, q3 = df["fare"].quantile([0.25, 0.75])
iqr = q3 - q1

fence_low = q1 - 1.5 * iqr
fence_high = q3 + 1.5 * iqr

df = df.loc[
    df["fare"].gt(fence_low) & df["fare"].lt(fence_high)
].reset_index(drop=True)


#### Unique values

In [None]:
# Distinct values of the two embarkation columns, followed by a few rows
# side by side to check whether they carry the same information.
print(df["embarked"].unique(), df["embark_town"].unique())
# Fixed seed so the displayed rows are the same on every run.
df[["embarked", "embark_town"]].sample(5, random_state=42)

In [None]:
# Distinct values of the two ticket-class columns, followed by a few rows
# side by side to check whether they carry the same information.
print(df["pclass"].unique(), df["class"].unique())
# Fixed seed so the displayed rows are the same on every run.
df[["pclass", "class"]].sample(5, random_state=42)

In [None]:
# Distinct values of `sibsp` and `parch`, plus a few example rows.
print(df["sibsp"].unique(), df["parch"].unique())
# Fixed seed so the displayed rows are the same on every run.
df[["sibsp", "parch"]].sample(5, random_state=42)

In [None]:
# Distinct values of the two survival columns, followed by a few rows
# side by side to check whether they carry the same information.
print(df["survived"].unique(), df["alive"].unique())
# Fixed seed so the displayed rows are the same on every run.
df[["survived", "alive"]].sample(5, random_state=42)

#### Drop Features


In [None]:
# Columns removed, with the author's rationale:
#   - alone, adult_male : the same information is available from `parch`
#                         and `who` respectively
#   - deck              : insignificant, with a lot of missing values
#   - embarked          : same as `embark_town`
#   - class             : same as `pclass`
#   - alive             : same as `survived`
redundant_columns = ["alone", "deck", "embarked", "adult_male", "class", "alive"]
df = df.drop(columns=redundant_columns)
del redundant_columns  # keep the notebook namespace as it was

#### Converting data types

In [None]:
# Downcast columns to compact dtypes in a single pass.
#   - low-cardinality text columns -> category
#   - small whole-number columns   -> uint8
#   - age and fare                 -> float32
# `age` and `fare` are kept as floats on purpose: casting them to unsigned
# integers truncates the fractional part (e.g. a fare of 7.25 becomes 7 and
# an age below 1 becomes 0), which silently alters the data used by the
# correlation and the plots that follow.
df = df.astype({
    "embark_town": "category",
    "sex": "category",
    "who": "category",
    "pclass": "uint8",
    "sibsp": "uint8",
    "parch": "uint8",
    "survived": "uint8",
    "age": "float32",
    "fare": "float32",
})

### Exploratory Data Analysis

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(data=corr, annot=True)

In [None]:
# Re-inspect the frame now that columns have been dropped and dtypes converted.
df.info()

#### Survivors by Passenger Type

In [None]:
# Count survivors and non-survivors for each passenger type (`who`).
# `survived` is an integer 0/1 flag, so it is compared against 1/0 rather
# than True/False. `who` is categorical at this point; `observed=False` is
# passed explicitly so every category is reported (even with a zero count)
# regardless of the pandas version's default.
survivors_count = (
    df[df["survived"] == 1].groupby("who", observed=False)["survived"].count()
)
non_survivors_count = (
    df[df["survived"] == 0].groupby("who", observed=False)["survived"].count()
)

# Align both counts on the passenger type so they plot as grouped bars.
combined_counts = pd.DataFrame({
    "Survivors": survivors_count,
    "Non-Survivors": non_survivors_count
})

combined_counts.plot(kind='bar')
plt.title("Survivors and Non-Survivors by Passenger Type")
plt.xlabel("Passenger Type")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()
print(combined_counts)

In [None]:
# Passenger counts per embarkation town: overall, survivors, non-survivors.
total_count = df["embark_town"].value_counts()

# value_counts() orders each result by its own frequencies, so the three
# series are not guaranteed to list the towns in the same order. Reindex the
# two subsets to the order of the overall counts so that every panel shows
# the towns in the same position and each bar carries its own label.
town_order = total_count.index
survived_count = (
    df.loc[df["survived"] == 1, "embark_town"]
    .value_counts()
    .reindex(town_order, fill_value=0)
)
non_survivors_count = (
    df.loc[df["survived"] == 0, "embark_town"]
    .value_counts()
    .reindex(town_order, fill_value=0)
)

# One row, three panels: (series, bar colour, panel title).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    (total_count, "skyblue", "Embark Town Distribution"),
    (survived_count, "green", "Survivors from Embark Town"),
    (non_survivors_count, "red", "Non-Survivors from Embark Town"),
]

for ax, (counts, color, title) in zip(axes, panels):
    # pandas labels the ticks from the series index; rot=0 keeps them horizontal.
    counts.plot(kind="bar", ax=ax, color=color, title=title, rot=0)
    ax.set_xlabel("Embark Town")
    ax.set_ylabel("Count")

plt.show()