# Automated EDA Notebook

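This notebook was auto-generated on 2025-09-04 12:16:45.  
It loads the dataset and performs a standard exploratory data analysis (EDA) with Pandas and Matplotlib: a structural overview, missing-value summaries, distributions of numeric columns, top categories, correlations, and skewness/outlier counts.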

**Dataset**: `sample_retail_dataset.csv`


In [None]:
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Path of the CSV file this notebook analyses.
csv_path = r'/mnt/data/sample_retail_dataset.csv'

# Read the full file into a DataFrame; the cells below all work from `df`.
df = pd.read_csv(filepath_or_buffer=csv_path)


In [None]:
# Overview
# df.info() writes its summary to a stream rather than returning it, so the
# text is captured in an in-memory buffer and printed from there.
buf = io.StringIO()
df.info(buf=buf)
print(buf.getvalue())

display(df.head(5))

# Split the columns by dtype: NumPy-numeric columns versus everything else.
# Later cells use these two lists to decide what to plot and summarise.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# Summary statistics for each group. A bare expression is only rendered when it
# is the last statement of a cell, so the tables are passed to display()
# explicitly; otherwise they would be computed and silently discarded.
if numeric_cols:
    display(df[numeric_cols].describe().T)
else:
    print("No numeric columns")

if categorical_cols:
    display(df[categorical_cols].describe().T)
else:
    print("No categorical columns")

print("\nMissing values by column:")
print(df.isnull().sum().sort_values(ascending=False))


In [None]:
# Missingness visuals
# Heatmap of the null mask: each cell of the image is one (row, column) entry
# of the frame, coloured by whether the value is missing.
null_mask = df.isnull()
fig, ax = plt.subplots()
image = ax.imshow(null_mask, aspect='auto', interpolation='nearest')
ax.set_title('Missing Values Heatmap (True=missing)')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
fig.colorbar(image, ax=ax)
plt.show()

# Share of missing values per column, as a percentage, worst columns first.
missing_pct = null_mask.mean().mul(100).sort_values(ascending=False)
fig, ax = plt.subplots()
missing_pct.head(20).plot.bar(ax=ax)
ax.set_title('Missingness by Column (Top 20) - %')
ax.set_ylabel('% Missing')
plt.show()


In [None]:
# Distributions
# Only the first eight numeric columns are plotted to keep the output short.
# Missing values are dropped per column before plotting.

# Pass 1: one histogram per column.
for col in numeric_cols[:8]:
    fig, ax = plt.subplots()
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f'Histogram: {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

# Pass 2: one vertical boxplot per column, shown after all histograms.
for col in numeric_cols[:8]:
    fig, ax = plt.subplots()
    ax.boxplot(df[col].dropna(), vert=True)
    ax.set_title(f'Boxplot: {col}')
    ax.set_ylabel(col)
    plt.show()


In [None]:
# Top categories
# Bar chart of the most frequent values for the first two non-numeric columns,
# limited to the 15 most common values in each.
for col in categorical_cols[:2]:
    # value_counts() ignores missing values and sorts by frequency, descending.
    vc = df[col].value_counts().head(15)
    if vc.empty:
        # Nothing to count (e.g. the frame has no rows). pandas' bar plot
        # fails on an empty Series, so skip the column instead of aborting
        # the whole cell.
        print(f"Skipping {col}: no non-null values to plot")
        continue
    plt.figure()
    vc.plot(kind='bar')
    plt.title(f'Top 15 categories: {col}')
    plt.ylabel('Count')
    plt.show()


In [None]:
# Correlations and scatter matrix
# Pairwise scatter plots need at least two numeric columns; at most the first
# five are used so the grid stays readable.
if len(numeric_cols) >= 2:
    subset = numeric_cols[:5]
    # Only rows with a value in every selected column are plotted.
    complete_rows = df[subset].dropna()
    if complete_rows.empty:
        # scatter_matrix computes per-column min/max for the axis limits and
        # fails on zero rows, so report and skip instead of crashing the cell.
        print("Scatter matrix skipped: no rows with values in all of", subset)
    else:
        scatter_matrix(complete_rows, diagonal='hist', figsize=(12, 12))
        # Use the real column count: there may be fewer than five.
        plt.suptitle(f'Scatter Matrix (first {len(subset)} numeric columns)')
        plt.show()

# Correlation matrix over all numeric columns (pandas' default method).
corr = df[numeric_cols].corr() if numeric_cols else None
if corr is not None:
    plt.figure()
    # Pin the colour scale to the full correlation range [-1, 1] with a
    # diverging colormap. With autoscaling the colours span only the observed
    # min..max, so the same colour would mean different correlations from one
    # dataset to the next.
    plt.imshow(corr, interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Heatmap (Numeric)')
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.index)), corr.index)
    plt.colorbar()
    plt.show()


In [None]:
# Skewness and outliers (IQR)
if numeric_cols:
    print("Skewness:")
    print(df[numeric_cols].skew(numeric_only=True))

    def iqr_outlier_count(s):
        """Count the values of `s` that fall outside the 1.5 * IQR fences.

        Args:
            s: numeric pandas Series; the caller below passes it with missing
               values already dropped.

        Returns:
            Plain Python int: the number of values strictly below
            Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR.
        """
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
        # .sum() on a boolean Series yields a NumPy integer scalar; convert it
        # so the printed dict shows bare numbers instead of np.int64(...)
        # wrappers under NumPy 2.
        return int(((s < lower) | (s > upper)).sum())

    # One count per numeric column, computed on its non-missing values.
    outlier_counts = {col: iqr_outlier_count(df[col].dropna()) for col in numeric_cols}
    print("\nOutlier counts (IQR):")
    print(outlier_counts)
