<a href="https://colab.research.google.com/github/kumaramar5526/Task_2_DA/blob/main/Task_2_DA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the packages listed as requirements of the EDA script into the running kernel's environment.
%pip install pandas numpy matplotlib seaborn nbformat reportlab

Collecting reportlab
  Downloading reportlab-4.4.3-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.3-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.3


In [3]:
# Complete EDA script for "Stores.csv"
# Save as eda_stores_full.py and run in the folder containing Stores.csv
# Requirements: pandas, numpy, matplotlib, seaborn, nbformat, reportlab

import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas

# ---------- Configuration ----------
# Input file, output folder and the names of the generated artefacts.
INPUT = "Stores.csv"
OUTDIR = "eda_output"
NOTEBOOK_FILE = "EDA_Stores_notebook.ipynb"
PDF_FILE = "EDA_Report.pdf"
os.makedirs(OUTDIR, exist_ok=True)
# Many figures are created below; a threshold of 0 silences the open-figure warning.
plt.rcParams["figure.max_open_warning"] = 0
# -----------------------------------

# ---------- Load ----------
df = pd.read_csv(INPUT)
# Partition the columns by dtype: numeric on one side, everything else on the other.
numeric_part = df.select_dtypes(include=[np.number])
other_part = df.select_dtypes(exclude=[np.number])
num_cols = numeric_part.columns.tolist()
cat_cols = other_part.columns.tolist()

# ---------- Quick summaries (save CSVs) ----------
# One row per column: dtype name, number of nulls, number of distinct values.
null_counts = df.isnull().sum()
distinct_counts = df.nunique()
col_summary = pd.DataFrame({
    "dtype": df.dtypes.astype(str),
    "missing_count": null_counts,
    "unique_count": distinct_counts,
})
col_summary.to_csv(os.path.join(OUTDIR, "column_summary.csv"))

# describe() tables are only written for column groups that actually exist.
for group, target in (
    (num_cols, "numeric_describe.csv"),
    (cat_cols, "categorical_describe.csv"),
):
    if group:
        df[group].describe().T.to_csv(os.path.join(OUTDIR, target))

# ---------- Utility functions ----------
def iqr_outlier_counts(frame, columns, whisker=1.5):
    """Count, per column, the values lying outside the IQR fences.

    A value is counted when it is strictly below ``Q1 - whisker * IQR`` or
    strictly above ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column's non-null values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect.
    columns : iterable
        Labels of the columns to evaluate.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    dict
        Maps each column label to its outlier count as a plain ``int``;
        a column with no non-null values maps to 0.
    """
    counts = {}
    for col in columns:
        values = frame[col].dropna()
        if values.empty:
            # Nothing to compute quantiles on.
            counts[col] = 0
            continue
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        reach = whisker * (q3 - q1)
        low = q1 - reach
        high = q3 + reach
        counts[col] = int(((values < low) | (values > high)).sum())
    return counts

# Skewness of every numeric column, largest first; an empty float Series when there are none.
if num_cols:
    skewness = df[num_cols].skew().sort_values(ascending=False)
else:
    skewness = pd.Series(dtype=float)
# IQR-based outlier count for each numeric column.
iqr_counts = iqr_outlier_counts(df, num_cols)

# ---------- Plotting & PDF (report) ----------
pdf_path = os.path.join(OUTDIR, PDF_FILE)
# Paths of the saved PNG figures, in creation order, for embedding in the PDF.
fig_paths = []


def _safe_filename(label):
    """Return ``label`` as text with characters that cannot appear in a file name replaced by ``_``."""
    return "".join(
        "_" if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch
        for ch in str(label)
    )


def _save_current_figure(filename):
    """Tighten, save (150 dpi) and close the current figure, then record its path in ``fig_paths``."""
    path = os.path.join(OUTDIR, filename)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
    fig_paths.append(path)
    return path


# 1. Missing values heatmap (png)
plt.figure(figsize=(10, max(2, len(df.columns) * 0.25)))
# vmin/vmax pin 0 -> white and 1 -> dark; without them a frame whose cells are
# all missing (or all present) would be normalised onto a single colour.
sns.heatmap(df.isnull().T.astype(int), cbar=False, cmap=["#ffffff", "#444444"], vmin=0, vmax=1)
plt.title("Missing values heatmap (columns x rows)")
_save_current_figure("missing_matrix.png")

# 2. Correlation heatmap (if numeric)
if num_cols:
    plt.figure(figsize=(8, max(3, len(num_cols) * 0.4)))
    sns.heatmap(df[num_cols].corr(), annot=True, fmt=".2f", cmap="RdYlBu", vmin=-1, vmax=1)
    plt.title("Correlation matrix (numeric)")
    _save_current_figure("corr_matrix.png")

# 3. Histogram + boxplot per numeric column
for col in num_cols:
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    sns.histplot(df[col].dropna(), bins=30, kde=True, ax=axes[0])
    axes[0].set_title(f"Histogram: {col}")
    sns.boxplot(x=df[col], ax=axes[1])
    axes[1].set_title(f"Boxplot: {col}")
    # Column labels may contain characters that are illegal in file names.
    _save_current_figure(f"hist_box_{_safe_filename(col)}.png")

# 4. Pairplot / scatter matrix if small number of numeric cols
if 2 <= len(num_cols) <= 6:
    complete_rows = df[num_cols].dropna()
    # Skip the plot when no row has every numeric value present.
    if not complete_rows.empty:
        scatter_matrix(complete_rows, diagonal='hist', figsize=(8, 8))
        plt.suptitle("Scatter matrix (numeric)")
        _save_current_figure("scatter_matrix.png")

# 5. Categorical top counts (png)
for col in cat_cols:
    top_counts = df[col].value_counts(dropna=False).head(20)
    plt.figure(figsize=(8, max(3, len(top_counts) * 0.25)))
    # Labels are converted to text so that the NaN bucket kept by dropna=False
    # (and non-string categories) still get their own bar.
    labels = [str(label) for label in top_counts.index]
    sns.barplot(x=top_counts.values, y=labels)
    plt.title(f"Top categories for {col} (top 20)")
    _save_current_figure(f"bar_{_safe_filename(col)}.png")

# ---------- Build PDF report with text + embedded pngs ----------
pdf = canvas.Canvas(pdf_path, pagesize=A4)
W, H = A4
margin = 36
max_w = W - 2 * margin  # printable width
max_h = H - 2 * margin  # printable height


def _wrap_text(text, font_name, font_size):
    """Split ``text`` at spaces into lines that fit within the printable width.

    A single word wider than the page is kept on its own line.
    """
    lines = []
    current = None
    for word in text.split(" "):
        candidate = word if current is None else f"{current} {word}"
        if current is not None and pdf.stringWidth(candidate, font_name, font_size) > max_w:
            lines.append(current)
            current = word
        else:
            current = candidate
    lines.append(current)
    return lines


def _draw_text(text, y_pos, font_name, font_size, gap_after):
    """Draw ``text`` at the left margin, wrapping if needed, and return the next baseline.

    Continuation lines are spaced ``font_size + 3`` points apart; ``gap_after``
    is the distance left below the last line.
    """
    pdf.setFont(font_name, font_size)
    wrapped = _wrap_text(text, font_name, font_size)
    last = len(wrapped) - 1
    for idx, line in enumerate(wrapped):
        if y_pos < margin:
            pdf.showPage()
            # A new page starts with the default font, so select it again.
            pdf.setFont(font_name, font_size)
            y_pos = H - margin
        pdf.drawString(margin, y_pos, line)
        y_pos -= gap_after if idx == last else font_size + 3
    return y_pos


y = H - margin
y = _draw_text("Exploratory Data Analysis Report", y, "Helvetica-Bold", 16, 20)
y = _draw_text(
    f"Source: {INPUT}    Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    y, "Helvetica", 9, 18,
)

# Basic facts
y = _draw_text("Dataset facts:", y, "Helvetica-Bold", 11, 14)
y = _draw_text(
    f"Shape: {df.shape}    Duplicate rows: {int(df.duplicated().sum())}",
    y, "Helvetica", 9, 12,
)
y = _draw_text(
    f"Numeric columns: {len(num_cols)}    Categorical columns: {len(cat_cols)}",
    y, "Helvetica", 9, 14,
)

# Missing and skewness & outliers
y = _draw_text("Top findings:", y, "Helvetica-Bold", 11, 12)
missing_cols = [col for col in df.columns if df[col].isnull().any()]
y = _draw_text(
    "Columns with missing values: " + (", ".join(missing_cols) if missing_cols else "None"),
    y, "Helvetica", 9, 12,
)
top_skew = ", ".join([f"{k}({v:.2f})" for k, v in skewness.head(5).items()]) if not skewness.empty else "None"
y = _draw_text("Top skewness (numeric): " + top_skew, y, "Helvetica", 9, 12)
iqr_txt = ", ".join([f"{k}:{v}" for k, v in iqr_counts.items()]) if iqr_counts else "None"
y = _draw_text("IQR outlier counts (numeric): " + iqr_txt, y, "Helvetica", 9, 18)

# Insert images (two per page approx)
for p in fig_paths:
    try:
        im = ImageReader(p)
        iw, ih = im.getSize()
        aspect = ih / iw
        # Fill the printable width, but shrink images that would be taller than a page.
        draw_w = max_w
        draw_h = draw_w * aspect
        if draw_h > max_h:
            draw_h = max_h
            draw_w = draw_h / aspect
        # Break the page when the image does not fit below the cursor; never on a
        # fresh page, where the image is guaranteed to fit after the scaling above.
        if y - draw_h < margin and y < H - margin:
            pdf.showPage()
            y = H - margin
        pdf.drawImage(im, margin, y - draw_h, width=draw_w, height=draw_h)
        y -= (draw_h + 8)
    except Exception as exc:
        # Best effort: a figure that cannot be embedded is reported and skipped.
        print(f"Skipping figure {p!r} in PDF report: {exc}")

pdf.save()

# ---------- Create reproducible Jupyter notebook (.ipynb) ----------
# The generated cells take the data file and output folder from INPUT / OUTDIR,
# so they stay in sync with the configuration instead of repeating literals.
# NOTE(review): the paths embedded below are relative to the directory this script
# runs in, while the notebook itself is saved inside OUTDIR — confirm which working
# directory the notebook is expected to be opened from.
nb = new_notebook()
cells = []
cells.append(new_markdown_cell(
    f"# EDA Notebook for {INPUT}\nThis notebook reproduces the EDA. Run cells to inspect outputs."
))
# Imports + data load.
cells.append(new_code_cell("\n".join([
    "import pandas as pd",
    "import numpy as np",
    "import matplotlib.pyplot as plt",
    "import seaborn as sns",
    "from pandas.plotting import scatter_matrix",
    "",
    f"INPUT = {INPUT!r}",
    "df = pd.read_csv(INPUT)",
    "df.head()",
])))
# Shape, dtypes, missing values, duplicates ("\\n" stays an escape in the generated source).
cells.append(new_code_cell("\n".join([
    "print('Shape:', df.shape)",
    "print('\\nDtypes:')",
    "print(df.dtypes)",
    "print('\\nMissing values:')",
    "print(df.isnull().sum())",
    "print('\\nDuplicate rows:', df.duplicated().sum())",
])))
# Numeric / categorical column split.
cells.append(new_code_cell("\n".join([
    "num_cols = df.select_dtypes(include=[np.number]).columns.tolist()",
    "cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()",
    "print('Numeric cols:', num_cols)",
    "print('Categorical cols:', cat_cols)",
])))
# describe() tables.
cells.append(new_code_cell("\n".join([
    "# Numeric description",
    "if num_cols:",
    "    display(df[num_cols].describe().T)",
    "# Categorical description",
    "if cat_cols:",
    "    display(df[cat_cols].describe().T)",
])))
# Listing of the files produced by this script.
cells.append(new_code_cell(
    f"import os\nos.listdir({OUTDIR!r}) if os.path.exists({OUTDIR!r}) else None"
))
# Example plots.
cells.append(new_code_cell("\n".join([
    "# Example plots: missing heatmap and correlation",
    "plt.figure(figsize=(10,3)); sns.heatmap(df.isnull().T, cbar=False); plt.title('Missing values heatmap'); plt.show()",
    "if num_cols:",
    "    plt.figure(figsize=(8,6)); sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f'); plt.title('Correlation'); plt.show()",
])))
nb['cells'] = cells
nbf_path = os.path.join(OUTDIR, NOTEBOOK_FILE)
with open(nbf_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

# ---------- Short summary file ----------
def _listing(names):
    """Comma-join ``names``; an empty collection is rendered as the text "None"."""
    if names:
        return ", ".join(names)
    return "None"


# Plain-text digest: headline facts, then skewness and IQR outlier counts.
summary = [
    f"Shape: {df.shape}",
    f"Duplicate rows: {int(df.duplicated().sum())}",
    "Columns with missing values: " + _listing(missing_cols),
    "Numeric columns: " + _listing(num_cols),
    "Categorical columns: " + _listing(cat_cols),
    "",
    "Top skewness (numeric):",
]
# At most the ten most skewed columns (head() of an empty Series yields nothing).
summary.extend(f" - {name}: {value:.3f}" for name, value in skewness.head(10).items())
summary.append("")
summary.append("IQR outlier counts (numeric):")
summary.extend(f" - {name}: {count}" for name, count in iqr_counts.items())

summary_path = os.path.join(OUTDIR, "short_summary.txt")
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("\n".join(summary))

# ---------- Light cleaning export (optional) ----------
# Only exact duplicate rows are removed; nothing else is altered.
deduplicated = df.drop_duplicates()
deduplicated.to_csv(os.path.join(OUTDIR, "Stores_clean_shallow.csv"), index=False)

# ---------- Print outputs ----------
print("EDA completed.")
for label, location in (
    ("Outputs in:", OUTDIR),
    ("Notebook:", nbf_path),
    ("PDF report:", pdf_path),
    ("Column summary:", os.path.join(OUTDIR, "column_summary.csv")),
    ("Short summary:", summary_path),
):
    print(label, location)

EDA completed.
Outputs in: eda_output
Notebook: eda_output/EDA_Stores_notebook.ipynb
PDF report: eda_output/EDA_Report.pdf
Column summary: eda_output/column_summary.csv
Short summary: eda_output/short_summary.txt
