### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import data

In [None]:
# Load seaborn's "titanic" example dataset; every later cell works on `df`.
df = sns.load_dataset("titanic")

### Data Understanding

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Per-column dtype and non-null count, to see which columns have gaps.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the dataset's columns.
df.describe()

In [None]:
# Peek at 10 random rows. The seed is fixed so that the displayed sample is
# the same on every "Restart Kernel -> Run All".
df.sample(10, random_state=42)

### Data Wrangling

#### Missing Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

##### Understanding age to choose the most appropriate fill value

In [None]:
# Rows whose age is missing.
df.loc[df["age"].isna()]

In [None]:
# Among rows with a missing age: row count per (who, embark_town) group.
(
    df.loc[df["age"].isna()]
    .groupby(["who", "embark_town"])["sex"]
    .count()
)

In [None]:
# Among rows with a missing age: summary of `sex` within each `who` group.
(
    df.loc[df["age"].isna()]
    .groupby("who")["sex"]
    .describe()
)

In [None]:
# Age distribution within each `who` group (over the known ages).
df.groupby("who")["age"].describe()

In [None]:
# Overall median age (missing values are skipped).
df["age"].median()

In [None]:
# Impute missing ages with the overall median age.
# The result is assigned back instead of calling fillna(inplace=True) on
# df["age"]: the in-place form operates on a temporary column selection,
# which recent pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is enabled.
df["age"] = df["age"].fillna(df["age"].median())

#### Outlier Detection and Removal

In [None]:
# Fare against age, to eyeball extreme fares before removing outliers.
# Axis labels and a title let the figure stand on its own; the trailing
# semicolon hides the artist repr in the cell output.
plt.scatter(df["age"], df["fare"])
plt.xlabel("age")
plt.ylabel("fare")
plt.title("Fare vs. age");

In [None]:
# Box plots of age and fare to spot outliers.
# A plain 2-D array (one column per variable) is passed rather than the
# DataFrame itself, so each column is drawn as its own box without relying
# on how matplotlib unpacks pandas objects. The boxes sit at the default
# positions 1 and 2, so those ticks are renamed to the column names.
plt.boxplot(df[["age", "fare"]].to_numpy())
plt.xticks([1, 2], ["age", "fare"])
plt.ylabel("value");

In [None]:
# Remove fare outliers with the 1.5 * IQR rule.
# First and third quartile of fare, and the interquartile range between them.
q1, q3 = df["fare"].quantile([0.25, 0.75])
iqr = q3 - q1

# Fences sit 1.5 IQRs outside the quartiles.
fence_low, fence_high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Keep only rows whose fare lies strictly between the fences, then renumber the index.
df = df[df["fare"].gt(fence_low) & df["fare"].lt(fence_high)].reset_index(drop=True)


#### Unique values

In [None]:
# Distinct values of `embarked`, in order of first appearance.
pd.unique(df["embarked"])

In [None]:
# Distinct values of `embark_town`, in order of first appearance.
pd.unique(df["embark_town"])

#### Drop Features


In [None]:
# Drop columns that add nothing for this analysis:
#   - "alone" and "adult_male": the same information is available from
#     parch and who respectively
#   - "deck": insignificant, with a lot of missing values
#   - "embarked": same as embark_town
# Reassigning the result (rather than inplace=True) keeps the cell a plain
# transformation of `df`.
df = df.drop(columns=["alone", "deck", "embarked", "adult_male"])

#### Converting data types

In [None]:
# Convert every column to a compact dtype in one pass.
# NOTE(review): casting age and fare from float to unsigned integers drops the
# fractional part (e.g. an age of 0.42 becomes 0) - confirm this loss of
# precision is intended. Both columns must be free of NaN at this point,
# because NaN cannot be cast to an integer dtype.
df = df.astype(
    {
        # low-cardinality labels
        "class": "category",
        "embark_town": "category",
        "sex": "category",
        "who": "category",
        # small non-negative numbers
        "pclass": "uint8",
        "sibsp": "uint8",
        "age": "uint8",
        "parch": "uint8",
        "fare": "uint16",
        # 0/1 flag
        "survived": "bool",
    }
)

# "alive" holds the strings "yes"/"no"; map them straight to booleans.
# Anything else is reported instead of being silently coerced.
alive_as_bool = df["alive"].map({"yes": True, "no": False})
if alive_as_bool.isna().any():
    raise ValueError("'alive' contains values other than 'yes'/'no'")
df["alive"] = alive_as_bool.astype("bool")