In [19]:
import nbformat as nbf
from nbformat.v4 import new_markdown_cell, new_code_cell, new_notebook
from pathlib import Path


In [20]:
# Ordered list of cells that will make up the generated notebook.
notebook_cells = []

# Title and Introduction
notebook_cells.append(new_markdown_cell("# Task 2 – Exploratory Data Analysis (EDA)\n\nThis notebook performs EDA on the Titanic dataset to identify patterns, trends, and anomalies using statistics and visualizations."))

# Import Libraries
# plotly is intentionally not imported in the generated notebook: none of the
# generated cells use it, and importing it would make the notebook's first
# code cell fail wherever plotly is not installed.
code_imports = """\
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
"""
notebook_cells.append(new_code_cell(code_imports))


In [21]:
# Load Dataset
notebook_cells.append(new_markdown_cell("## Step 1: Load the Dataset"))
code_load_data = """\
# Load the dataset
df = pd.read_csv('../dataset/titanic.csv')  # Update path as needed
df.head()
"""
notebook_cells.append(new_code_cell(code_load_data))

# Summary statistics
# The generated headings continue at "Step 4", so Steps 2 and 3 are emitted
# here to keep the numbering contiguous and to provide the statistics that the
# notebook's introduction promises.
notebook_cells.append(new_markdown_cell("## Step 2: Summary Statistics"))
code_summary = """\
# Column dtypes and non-null counts, followed by descriptive statistics
df.info()
df.describe(include='all')
"""
notebook_cells.append(new_code_cell(code_summary))

# Missing values
notebook_cells.append(new_markdown_cell("## Step 3: Check for Missing Values"))
code_missing = """\
# Number of missing values per column
df.isnull().sum()
"""
notebook_cells.append(new_code_cell(code_missing))

In [22]:
# Step 4 of the generated notebook: a heading cell followed by a code cell
# that draws one histogram per numerical column of `df`.
code_histograms = "\n".join([
    "# Histograms for numerical columns",
    "df.hist(bins=20, figsize=(15, 10), color='skyblue', edgecolor='black')",
    'plt.suptitle("Histograms of Numerical Features")',
    "plt.show()",
    "",  # keeps the trailing newline of the cell source
])
notebook_cells.extend([
    new_markdown_cell("## Step 4: Visualize Distributions (Histograms)"),
    new_code_cell(code_histograms),
])

In [23]:
# Step 5 of the generated notebook: heading plus a code cell that draws
# boxplots of the 'Age' and 'Fare' columns.
code_boxplots = (
    "# Boxplots for numeric features\n"
    "plt.figure(figsize=(12,6))\n"
    "sns.boxplot(data=df[['Age', 'Fare']])\n"
    'plt.title("Boxplots of Age and Fare")\n'
    "plt.show()\n"
)
boxplot_heading = new_markdown_cell("## Step 5: Boxplots for Outlier Detection")
notebook_cells.append(boxplot_heading)
notebook_cells.append(new_code_cell(code_boxplots))

In [24]:
# Step 6 of the generated notebook: heading plus a code cell that renders an
# annotated heatmap of the correlations between numeric columns.
code_corr = "\n".join((
    "# Correlation heatmap",
    "plt.figure(figsize=(10, 6))",
    "sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')",
    'plt.title("Correlation Matrix")',
    "plt.show()",
)) + "\n"
for corr_cell in (
    new_markdown_cell("## Step 6: Correlation Matrix"),
    new_code_cell(code_corr),
):
    notebook_cells.append(corr_cell)

In [25]:
# Step 7 of the generated notebook: heading plus a code cell that draws a
# seaborn pairplot of four selected columns, coloured by 'Survived'.
code_pairplot = (
    "# Pairplot of selected features\n"
    "sns.pairplot(df[['Age', 'Fare', 'Survived', 'Pclass']], hue='Survived')\n"
    "plt.show()\n"
)
notebook_cells += [
    new_markdown_cell("## Step 7: Pairplot for Feature Relationships"),
    new_code_cell(code_pairplot),
]

In [26]:
# Closing markdown cell of the generated notebook: prompts for interpreting
# the plots produced by the earlier cells.
inference_text = (
    "## 📌 Inferences\n\n"
    "- Use the visuals above to draw conclusions.\n"
    "- Look for skewness, outliers, and correlations.\n"
    "- EDA helps decide which features are most informative."
)
notebook_cells.append(new_markdown_cell(inference_text))

# Assemble the collected cells into a notebook, serialise it, and write it
# to disk as UTF-8 text in the current working directory.
notebook = new_notebook(cells=notebook_cells)
output_path = Path("task2_EDA_complete.ipynb")
output_path.write_text(nbf.writes(notebook), encoding="utf-8")

# Last expression of the cell: displays the file name that was written.
output_path.name

'task2_EDA_complete.ipynb'