<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day3_titanic_viz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Day 3 – Titanic Visualization & EDA
# -----------------------------------
# Goal: Visualize the cleaned Titanic dataset (from Day 2)
# and save plots for analysis + future posts.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# -----------------------------------
# Setup
# -----------------------------------
sns.set(style="whitegrid")
ASSETS = Path("assets")
ASSETS.mkdir(exist_ok=True)

# Load cleaned data from Day 2
df = pd.read_csv("../day02/day02_titanic_clean.csv")
print("Shape:", df.shape)
print(df.info())
print("Missing values:\n", df.isna().sum())

# -----------------------------------
# Univariate plots (single features)
# -----------------------------------
plt.figure(figsize=(8,4))
sns.histplot(df['Age'], bins=30, kde=True)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.savefig(ASSETS/"age_dist.png")
plt.show()

plt.figure(figsize=(8,4))
sns.histplot(df['Fare'], bins=40, kde=True)
plt.title("Fare Distribution (raw)")
plt.xlabel("Fare")
plt.ylabel("Count")
plt.savefig(ASSETS/"fare_dist.png")
plt.show()

plt.figure(figsize=(8,4))
sns.histplot(np.log1p(df['Fare']), bins=40, kde=True)
plt.title("Fare Distribution (log1p)")
plt.xlabel("log(Fare)")
plt.ylabel("Count")
plt.savefig(ASSETS/"fare_log_dist.png")
plt.show()

# -----------------------------------
# Categorical plots
# -----------------------------------
plt.figure(figsize=(6,4))
sns.barplot(x='Sex', y='Survived', data=df)
plt.title("Survival Rate by Sex")
plt.savefig(ASSETS/"survival_by_sex.png")
plt.show()

plt.figure(figsize=(6,4))
sns.barplot(x='Pclass', y='Survived', data=df, order=sorted(df['Pclass'].unique()))
plt.title("Survival Rate by Pclass")
plt.savefig(ASSETS/"survival_by_pclass.png")
plt.show()

plt.figure(figsize=(8,4))
sns.countplot(x='Pclass', hue='Survived', data=df)
plt.title("Passenger Class Counts (with Survival)")
plt.savefig(ASSETS/"count_pclass_survived.png")
plt.show()

# -----------------------------------
# Boxplots (numerical vs survival)
# -----------------------------------
plt.figure(figsize=(8,5))
sns.boxplot(x='Survived', y='Age', data=df)
plt.title("Age by Survival")
plt.savefig(ASSETS/"age_by_survival.png")
plt.show()

plt.figure(figsize=(8,5))
sns.boxplot(x='Survived', y='Fare', data=df)
plt.title("Fare by Survival")
plt.savefig(ASSETS/"fare_by_survival.png")
plt.show()

# -----------------------------------
# Correlation heatmap
# -----------------------------------
plt.figure(figsize=(10,8))
corr = df.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Correlation Heatmap")
plt.savefig(ASSETS/"corr_heatmap.png")
plt.show()

# -----------------------------------
# Pairplot (subset of features)
# -----------------------------------
subset = df[['Survived','Age','Fare','Pclass','FamilySize']]
sns.pairplot(subset, hue='Survived', diag_kind='kde', plot_kws={'alpha':0.6})
plt.savefig(ASSETS/"pairplot_subset.png", bbox_inches='tight')
plt.show()

# -----------------------------------
# Optional: Feature importance with RandomForest
# -----------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Survived'])
y = df['Survived']
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(Xtr, ytr)

imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature Importances:\n", imp)

plt.figure(figsize=(8,4))
sns.barplot(x=imp.values, y=imp.index)
plt.title("Feature Importances (RandomForest)")
plt.savefig(ASSETS/"feature_importances.png")
plt.show()

# -----------------------------------
# Notes section (fill manually)
# -----------------------------------
# Example observations:
# - Females survived much more often than males.
# - 1st class passengers had higher survival rates.
# - Fare is highly skewed; log-transform may help.
# - Age doesn’t separate survival strongly alone.
# - FamilySize has some effect (IsAlone matters).


FileNotFoundError: [Errno 2] No such file or directory: '../day02/day02_titanic_clean.csv'