# Titanic EDA Notebook

This notebook is dedicated to exploratory data analysis (EDA) on the Titanic dataset.  
Note: Preprocessing functions (scaling, encoding, etc.) are defined in `data_processing.py` and `processed.py`.  
Here we mainly focus on data exploration, visualization, and feature understanding.


In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom preprocessing functions (if needed)
from data_processing import min_max_scale, standardize, one_hot_encode


In [None]:
# 2. Load Raw Dataset
# Read the CSV into the notebook-wide frame `df` that every later cell uses.
df = pd.read_csv(
    filepath_or_buffer="titanic.csv",  # adjust path if needed
)
# Preview the first five rows as the cell's rendered output.
df.head(n=5)

In [None]:
# 3. Data Overview
# A notebook cell only renders its *final* bare expression. The summary
# statistics table sits mid-cell, so it must be sent to the output explicitly
# with display(); as a bare expression it would be computed and then discarded.
print("Shape of dataset:", df.shape)

# info() writes its report (dtypes, non-null counts) straight to stdout.
df.info()

# Descriptive statistics for the numeric columns.
display(df.describe())

# Missing-value count per column; left as the last expression so it remains
# the cell's Out[] value.
df.isnull().sum()


In [None]:
# 4. Target Variable Distribution
# Bar chart of row counts for each distinct value of the "Survived" column.
# countplot returns the Axes it drew on, so the title is attached directly.
sns.countplot(data=df, x="Survived").set_title("Survival Distribution")
plt.show()


In [None]:
# 5. Numerical Feature Exploration
# Side-by-side histograms (30 bins, with a KDE overlay) of Age and Fare.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: Age, with missing values dropped before plotting.
sns.histplot(
    df["Age"].dropna(), ax=axs[0], kde=True, bins=30
).set_title("Age Distribution")

# Right panel: Fare, plotted as-is.
sns.histplot(
    df["Fare"], ax=axs[1], kde=True, bins=30
).set_title("Fare Distribution")

fig.tight_layout()
plt.show()


In [None]:
# 6. Categorical Feature Exploration
# Side-by-side bar charts of row counts per category for Sex and Pclass.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: counts per value of "Sex".
sns.countplot(data=df, x="Sex", ax=axs[0]).set_title(
    "Passenger Gender Distribution"
)

# Right panel: counts per value of "Pclass".
sns.countplot(data=df, x="Pclass", ax=axs[1]).set_title(
    "Passenger Class Distribution"
)

fig.tight_layout()
plt.show()


In [None]:
# 7. Survival by Category
# One count plot per grouping column, with bars split by the "Survived" hue.
# catplot is a figure-level function: each call creates its own figure and
# returns a FacetGrid, so the title is applied through the grid's set() helper
# (which forwards to every Axes in the grid).
for column in ("Sex", "Pclass"):
    grid = sns.catplot(x=column, hue="Survived", data=df, kind="count")
    grid.set(title=f"Survival by {column}")

# Render explicitly, matching the other plotting cells, so the cell does not
# end on a bare FacetGrid whose repr would be echoed as the cell output.
plt.show()


In [None]:
# 8. Correlation Heatmap (numeric features only)
# Pairwise correlations of the numeric columns, annotated to two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(
    df.corr(numeric_only=True),
    cmap="coolwarm",
    fmt=".2f",
    annot=True,
).set_title("Correlation Heatmap")  # heatmap returns the Axes it drew on
plt.show()
