In [2]:
# 1_EDA_Template.ipynb

# ==========================
# Exploratory Data Analysis
# ==========================
# WHEN TO USE:
# Always your first step after getting any dataset.
# Goal: understand structure, missing values, distributions, correlations,
# and patterns that guide cleaning + modeling later.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset into a DataFrame (point the path at your own CSV file)
df = pd.read_csv(filepath_or_buffer="your_dataset.csv")

In [None]:
# --------------------------
# Dataset Overview
# --------------------------
# Quick sanity check: dimensions, column names, a preview, and per-column null counts
overview_items = [
    ("Shape:", df.shape),
    ("Columns:", df.columns.to_list()),
    ("\nFirst 5 rows:\n", df.head(5)),
    ("\nMissing values:\n", df.isna().sum()),
]
for label, value in overview_items:
    print(label, value)

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike
# (scan these for anomalies/outliers)
summary_stats = df.describe(include="all")
display(summary_stats)


In [None]:
# --------------------------
# Univariate Analysis
# --------------------------
# WHEN TO USE:
# To study distributions of single variables before looking at relationships.
# Helps find skew, outliers, and dominant categories.

# Numeric distributions
# DataFrame.hist only plots numeric/datetime columns and raises ValueError when
# there are none, so check for plottable columns first instead of crashing the
# cell on an all-text dataset.
hist_columns = df.select_dtypes(
    include=["number", "datetime64", "datetimetz"], exclude=["timedelta"]
).columns
if len(hist_columns) > 0:
    df.hist(figsize=(12,8), bins=30, edgecolor="black")
    plt.suptitle("Numeric Distributions", fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    print("No numeric columns found - skipping histograms.")

In [None]:
# Frequency plot for each text/categorical column, bars ordered by descending count
categorical_cols = df.select_dtypes(include=["object","category"]).columns
for col in categorical_cols:
    level_order = df[col].value_counts().index
    plt.figure(figsize=(6,4))
    sns.countplot(data=df, x=col, order=level_order)
    plt.title(f"Countplot of {col}")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# --------------------------
# Bivariate Analysis
# --------------------------
# WHEN TO USE:
# To check how variables interact with each other.
# Use correlation for numeric vs numeric, plots for categorical vs numeric.

# Correlation heatmap
corr_matrix = df.corr(numeric_only=True)
# Without any numeric columns the correlation matrix is empty and there is
# nothing to draw, so skip the plot rather than let the cell fail.
if corr_matrix.empty:
    print("No numeric columns found - skipping correlation heatmap.")
else:
    plt.figure(figsize=(8,6))
    # Pin the color scale to [-1, 1] so the neutral midpoint of the diverging
    # "coolwarm" palette always corresponds to zero correlation.
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title("Correlation Heatmap")
    plt.show()

In [None]:
# Sample scatterplot of two numeric columns (swap in columns from your dataset)
if all(name in df.columns for name in ("Age", "Fare")):
    sns.scatterplot(data=df, x="Age", y="Fare")
    plt.title("Age vs Fare")
    plt.show()

In [None]:

# --------------------------
# Group Analysis
# --------------------------
# WHEN TO USE:
# To see how a categorical feature (e.g., Gender, Region) relates to the outcome/target.

if all(name in df.columns for name in ("Survived", "Sex")):
    sns.barplot(data=df, x="Sex", y="Survived")
    plt.title("Survival Rate by Gender")
    plt.show()

print("✅ EDA finished. Modify plots/columns for your dataset.")