<a href="https://colab.research.google.com/github/jmcconne100/Pandas_Notebook_Project/blob/main/my_data_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files, drive

def load_csv(method="upload", source=None, concat=False, **read_csv_kwargs):
    """
    Load CSVs in Colab using one of three methods:
      - "upload": upload from local machine
      - "drive": read from Google Drive
      - "web": read from URL(s)

    Args:
        method: "upload" | "drive" | "web"
        source: file path(s) or URL(s); not needed for upload
        concat: if True, combine all CSVs into one DataFrame
        **read_csv_kwargs: passed to pandas.read_csv()

    Returns:
        A DataFrame (if concat=True or one file) or dict of {name: DataFrame}

    Examples:
        df1 = load_csv("upload")

        path = "/content/drive/MyDrive/data/UScomments.csv"
        df2 = load_csv("drive", path)

        url = "https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv"
        df3 = load_csv("web", url)
    """
    defaults = {"on_bad_lines": "skip"}
    kwargs = {**defaults, **(read_csv_kwargs or {})}

    method = method.lower()
    dfs = {}

    if method == "upload":
        uploaded = files.upload()
        for name in uploaded.keys():
            dfs[name] = pd.read_csv(name, **kwargs)

    elif method == "drive":
        drive.mount("/content/drive", force_remount=False)
        if isinstance(source, str):
            source = [source]
        for path in source:
            dfs[path.split("/")[-1]] = pd.read_csv(path, **kwargs)

    elif method == "web":
        if isinstance(source, str):
            source = [source]
        for url in source:
            dfs[url.split("/")[-1]] = pd.read_csv(url, **kwargs)

    else:
        raise ValueError("method must be one of: 'upload', 'drive', 'web'")

    if concat:
        return pd.concat(list(dfs.values()), ignore_index=True)
    return dfs if len(dfs) > 1 else next(iter(dfs.values()))

In [None]:
method = "upload" # can put in upload, drive, or web
# Note if picking drive specify a path and if picking web specify a URL

df1 = load_csv(method)
df1.head()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

text = "I love Python and data analysis but I hate debugging errors sometimes."
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_text = ' '.join([word for word in text.split() if word.lower() not in stop_words])

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(filtered_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# pip installs (run once)
!pip install emoji regex plotly pandas

import pandas as pd
import emoji
import regex as re
from collections import Counter
import plotly.express as px

# Robust grapheme splitter so flags / family sequences stay intact
GRAPHEME = re.compile(r'\X', re.UNICODE)

def extract_emojis(text: str) -> list[str]:
    # Keep grapheme clusters that contain at least one emoji codepoint
    return [g for g in GRAPHEME.findall(text) if any(ch in emoji.EMOJI_DATA for ch in g)]

# Example corpus (replace with yours)
messages = [
    "Love this! 😍🔥",
    "Hahaha 😂😂",
    "Ok 👍🏽👍🏽 meeting at 3pm 🕒",
    "New PR merged ✅🚀🚀🚀🚀🚀",
    "Ugh… Mondays 😒☕",
    "Flags work too 🇺🇸🇨🇦 😍😍😍",
    "🙂🙂🙂",
    "🤣😔😔😔😔"
]

# Flatten all emojis
all_emojis = [e for msg in messages for e in extract_emojis(msg)]

freq = Counter(all_emojis)
df_freq = pd.DataFrame(freq.items(), columns=["emoji", "count"]).sort_values("count", ascending=False)

# Bar chart of the top 20 emojis
fig = px.bar(df_freq.head(20), x="emoji", y="count", text="count",
             title="Top Emojis")
fig.update_traces(textposition="outside")
fig.update_layout(xaxis_title="Emoji", yaxis_title="Count")
fig.show()

In [None]:
# Upload and combine a series of csv's

from google.colab import files
import pandas as pd
import os

uploaded = files.upload()  # opens a file picker dialog

csv_files = [f for f in os.listdir() if f.endswith('.csv')]
print("Found CSVs:", csv_files)

dfs = [pd.read_csv(f, on_bad_lines='skip', low_memory=False) for f in csv_files]

df_all = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(dfs)} CSVs, combined shape: {df_all.shape}")
df_all.head()

In [None]:
# Box Plot Script

# Install if needed
!pip install plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# ----- Generate Example Data -----
np.random.seed(42)

departments = ['Sales', 'Marketing', 'Engineering', 'HR']
n_per_dept = 30

data = {
    'department': np.repeat(departments, n_per_dept),
    'score': np.concatenate([
        np.random.normal(75, 8, n_per_dept),   # Sales
        np.random.normal(70, 10, n_per_dept),  # Marketing
        np.random.normal(85, 5, n_per_dept),   # Engineering
        np.random.normal(65, 7, n_per_dept)    # HR
    ])
}

df = pd.DataFrame(data)
print(df.head())

# ----- Matplotlib Box Plot -----
plt.figure(figsize=(8, 5))
df.boxplot(column='score', by='department', grid=False, patch_artist=True)
plt.title('Box Plot of Scores by Department')
plt.suptitle('')
plt.xlabel('Department')
plt.ylabel('Score')
plt.show()

# ----- Interactive Plotly Box Plot -----
fig = px.box(df, x='department', y='score', color='department',
             title='Interactive Box Plot of Scores by Department',
             points='all')  # 'all' adds jittered individual points
fig.show()


In [None]:
# Install seaborn & plotly if needed
# !pip install seaborn plotly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# ----- 1️⃣ Generate Example Data -----
np.random.seed(42)
n = 100

df = pd.DataFrame({
    'age': np.random.randint(20, 60, n),
    'experience': np.random.randint(0, 30, n),
    'hours_per_week': np.random.randint(30, 60, n),
    'projects_completed': np.random.randint(1, 10, n),
    'score': np.random.normal(75, 10, n)
})

# Add a correlated feature (score slightly depends on hours + projects)
df['performance_index'] = (
    0.4 * df['hours_per_week'] +
    0.3 * df['projects_completed'] +
    np.random.normal(0, 5, n)
)

print(df.head())

# ----- Compute Correlation Matrix -----
corr = df.corr(numeric_only=True)
print("\nCorrelation Matrix:\n", corr)

# ----- Seaborn Heatmap (Static) -----
plt.figure(figsize=(8,6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap (Seaborn)")
plt.show()

# ----- 4Plotly Heatmap (Interactive) -----
fig = px.imshow(
    corr,
    text_auto=".2f",
    color_continuous_scale='RdBu_r',
    title="Interactive Correlation Heatmap (Plotly)"
)
fig.update_layout(xaxis_title="Features", yaxis_title="Features")
fig.show()


In [None]:
# Install if needed
!pip install seaborn plotly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# ----- Generate Synthetic Data -----
np.random.seed(42)
n = 100

# Create a linear relationship with noise
x = np.random.uniform(1, 100, n)
y = 2.5 * x + np.random.normal(0, 25, n)

df = pd.DataFrame({"hours_studied": x, "exam_score": y})

print(df.head())

# ----- Static Regression Plot (Seaborn) -----
plt.figure(figsize=(7,5))
sns.regplot(
    data=df,
    x="hours_studied",
    y="exam_score",
    scatter_kws={'alpha':0.7},
    line_kws={'color':'red'}
)
plt.title("Regression Plot: Hours Studied vs Exam Score")
plt.xlabel("Hours Studied")
plt.ylabel("Exam Score")
plt.show()

# ----- Optional Faceted Plot (Seaborn lmplot) -----
# Example: if you had a categorical variable like 'class'
df['class'] = np.random.choice(['A','B'], size=n)
sns.lmplot(data=df, x="hours_studied", y="exam_score", hue="class", aspect=1.2)
plt.title("Regression Plot by Class")
plt.show()

# ----- Interactive Plotly Version -----
fig = px.scatter(
    df,
    x="hours_studied",
    y="exam_score",
    color="class",
    trendline="ols",  # adds regression line automatically
    title="Interactive Regression Plot (Plotly)"
)
fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [None]:
# Install if needed
!pip install seaborn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# ----- Example Data -----
np.random.seed(42)
departments = ['Sales', 'Marketing', 'Engineering', 'HR', 'Finance']
avg_scores = np.random.randint(60, 95, len(departments))

df = pd.DataFrame({
    'department': departments,
    'average_score': avg_scores
})

print(df)

# ----- Vertical Bar Chart (Matplotlib) -----
plt.figure(figsize=(7,5))
plt.bar(df['department'], df['average_score'], color='skyblue')
plt.title("Average Score by Department (Vertical)")
plt.xlabel("Department")
plt.ylabel("Average Score")
plt.show()

# ----- Horizontal Bar Chart (Matplotlib) -----
plt.figure(figsize=(7,5))
plt.barh(df['department'], df['average_score'], color='lightcoral')
plt.title("Average Score by Department (Horizontal)")
plt.xlabel("Average Score")
plt.ylabel("Department")
plt.show()

# ----- Vertical Bar Chart (Seaborn) -----
plt.figure(figsize=(7,5))
sns.barplot(data=df, x='department', y='average_score', palette='Blues_d')
plt.title("Seaborn Vertical Bar Chart")
plt.show()

# ----- Horizontal Bar Chart (Seaborn) -----
plt.figure(figsize=(7,5))
sns.barplot(data=df, y='department', x='average_score', palette='Reds_d')
plt.title("Seaborn Horizontal Bar Chart")
plt.show()

# ----- Interactive Plotly Bar Charts -----
# Vertical
fig_v = px.bar(df, x='department', y='average_score',
               title='Interactive Vertical Bar Chart (Plotly)',
               color='department', text='average_score')
fig_v.update_traces(textposition='outside')
fig_v.show()

# Horizontal
fig_h = px.bar(df, x='average_score', y='department', orientation='h',
               title='Interactive Horizontal Bar Chart (Plotly)',
               color='department', text='average_score')
fig_h.update_traces(textposition='outside')
fig_h.show()


In [None]:
import pandas as pd
import numpy as np
import re

# ---------- Example Raw Data ----------
raw = pd.DataFrame({
    "order_id": [101, 101, 102, 103, 104, 105],
    "order_date": ["2025-10-01", "10/01/2025", "10/02/2025", "2025/10/03", "Oct 04, 2025", None],
    "category": ["  Mobile ", "mobile", "Phones", "Accessories", "ACCESSORIES ", " tablets "],
    "unit_price": ["$1,299.99", "$1,299.99", "$899", "  $29.99", "$19,999.99", "$250"],
    "qty": [1, 1, None, 2, 1, 1],
})

print("Raw:\n", raw, "\n")

# ---------- Cleaning ----------
df = raw.copy()

# Dates → datetime (coerce errors, then backfill if helpful)
df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce", infer_datetime_format=True)
df["order_date"] = df["order_date"].fillna(df["order_date"].bfill())

# Categories → stripped, lowercased, standardized
df["category"] = df["category"].str.strip().str.lower()
cat_map = {"phones": "mobile", "tablets": "tablet", "accessories": "accessories", "mobile": "mobile"}
df["category"] = df["category"].map(lambda c: cat_map.get(c, c))

# Prices → numeric (remove currency and commas)
df["unit_price"] = (
    df["unit_price"].astype(str)
    .str.replace(r"[^0-9.\-]", "", regex=True)
    .replace("", np.nan)
    .astype(float)
)

# qty → fill missing with 1
df["qty"] = df["qty"].fillna(1).astype(int)

# Duplicate rows (same order_id, category, unit_price, qty, date) → drop
df = df.drop_duplicates(subset=["order_id", "order_date", "category", "unit_price", "qty"])

# Derived total
df["line_total"] = df["unit_price"] * df["qty"]

# Simple outlier guard on price using IQR
q1, q3 = df["unit_price"].quantile([0.25, 0.75])
iqr = q3 - q1
upper = q3 + 1.5 * iqr
df = df[df["unit_price"] <= upper]  # remove extreme outlier row

print("Cleaned dtypes:\n", df.dtypes, "\n")
print("Cleaned:\n", df, "\n")

# (Optional) Save
# df.to_csv("retail_transactions_clean.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
import re

# ---------- Example Raw Data ----------
raw = pd.DataFrame({
    "respondent_id": [1, 2, 3, 4],
    "consent": ["Yes", "y", "NO", "true"],
    "age": [" 29 ", "N/A", "35", None],
    "country": ["U.S.", "United States", "usa", "Canada"],
    "q1_satisfaction": [5, 4, np.nan, 2],     # 1..5
    "q2_rev": [1, 2, 5, 3],                   # reverse-coded 1..5
    "tools": ["pandas; numpy ; SQL", "Python;sql", "", "NumPy;   Pandas;  seaborn "],
    "notes": ["  great product!!  ", "too $$$  ", "fine 👍", None]
})

print("Raw:\n", raw, "\n")

# ---------- Cleaning ----------
df = raw.copy()

# consent → boolean
true_set = {"yes", "y", "true", "1"}
df["consent"] = df["consent"].astype(str).str.strip().str.lower().isin(true_set)

# age → numeric, coerce, impute median
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"] = df["age"].fillna(df["age"].median())

# country → standardized
def norm_country(x):
    x = str(x).strip().lower().replace(".", "")
    if x in {"us", "u s", "u.s", "u.s", "usa", "united states"}:
        return "United States"
    if x in {"canada", "ca"}:
        return "Canada"
    return x.title()

df["country"] = df["country"].apply(norm_country)

# Likert reverse-code q2_rev (1..5 → 5..1)
df["q2_rev_rc"] = 6 - df["q2_rev"]

# Impute missing for Likert with median (per column)
for col in ["q1_satisfaction", "q2_rev_rc"]:
    df[col] = df[col].fillna(df[col].median())

# notes → trim whitespace; remove simple emojis/non-ASCII safely
df["notes"] = df["notes"].fillna("").str.strip()
df["notes_clean"] = df["notes"].str.encode("ascii", "ignore").str.decode("ascii")

# tools (multi-select, delimiter “;”) → one-hot bools
tools_clean = (
    df["tools"]
    .fillna("")
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
    .str.replace(" ;", ";", regex=False)
    .str.replace("; ", ";", regex=False)
)
df["_tools_list"] = tools_clean.apply(lambda s: [t.strip() for t in s.split(";") if t.strip()])

# Collect all unique tools
unique_tools = sorted({t for lst in df["_tools_list"] for t in lst})
for t in unique_tools:
    df[f"tool__{t.replace(' ', '_')}"] = df["_tools_list"].apply(lambda lst: t in lst)

# Overall score (example composite)
df["overall_score"] = df["q1_satisfaction"] + df["q2_rev_rc"]

# Drop helper columns
df = df.drop(columns=["_tools_list"])

print("Cleaned dtypes:\n", df.dtypes, "\n")
print("Cleaned:\n", df, "\n")

# (Optional) Save
# df.to_csv("survey_clean.csv", index=False)


In [None]:
import requests
import pandas as pd

def test_api_to_dataframe():
    """
    Test fetching JSON data from a public API, validating the response,
    and converting it into a pandas DataFrame with clear print checkpoints.
    """
    url = "https://jsonplaceholder.typicode.com/posts"
    print(f"Testing API endpoint: {url}")

    try:
        # Step 1: Send request
        print("Sending request...")
        response = requests.get(url, timeout=10)
        print(f"Response status code: {response.status_code}")
        response.raise_for_status()

        # Step 2: Validate content type
        content_type = response.headers.get("Content-Type", "")
        print(f"Content-Type: {content_type}")
        if "application/json" not in content_type:
            raise ValueError(f"Unexpected content type: {content_type}")

        # Step 3: Parse JSON
        print("Parsing JSON response...")
        data = response.json()
        print(f"JSON parsed successfully. Type: {type(data)}")

        if not isinstance(data, (list, dict)):
            raise TypeError("API response is not a valid JSON structure")

        # Step 4: Normalize for DataFrame conversion
        if isinstance(data, dict):
            data = [data]  # wrap single object in a list

        # Step 5: Convert to DataFrame
        print("Converting to pandas DataFrame...")
        df = pd.DataFrame(data)
        print("Conversion successful.")
        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # Step 6: Preview data
        print("\nData preview:")
        print(df.head())

        print("\nTest completed successfully.")
        return df

    except requests.exceptions.Timeout:
        print("Request timed out.")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}")
    except ValueError as e:
        print(f"Value error: {e}")
    except TypeError as e:
        print(f"Type error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

# Run the test
df = test_api_to_dataframe()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_line(
    df: pd.DataFrame,
    x: str,
    y: list | str,
    title: str | None = None,
    xlabel: str | None = None,
    ylabel: str | None = None,
    rolling: int | None = None,
    save_path: str | None = None,
    figsize=(10, 5),
):
    """
    Plot a line chart from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Source data.
    x : str
        Column name for x-axis. If dtype is not datetime, will try to parse as datetime.
    y : list | str
        One or more column names for y-series.
    rolling : int | None
        Window size for optional rolling mean (applied to each y series).
    save_path : str | None
        If provided, saves the figure (e.g., 'figure.png').
    """

    # Ensure y is a list
    y_cols = [y] if isinstance(y, str) else list(y)

    # Coerce datetime x if possible
    if not pd.api.types.is_datetime64_any_dtype(df[x]):
        try:
            df = df.copy()
            df[x] = pd.to_datetime(df[x], errors="coerce")
        except Exception:
            pass

    # Sort by x for nicer lines
    df = df.sort_values(x)

    plt.figure(figsize=figsize)

    # Plot each series
    for col in y_cols:
        series = df[col]
        if rolling and rolling > 1:
            series = series.rolling(rolling, min_periods=max(1, rolling // 2)).mean()
        plt.plot(df[x], series, label=col)

    # Labels & grid
    plt.title(title or "Line Chart")
    plt.xlabel(xlabel or x)
    plt.ylabel(ylabel or (", ".join(y_cols) if len(y_cols) == 1 else "Values"))
    plt.grid(True, alpha=0.3)

    # Legend for multiple series
    if len(y_cols) > 1:
        plt.legend()

    # Improve date formatting if x is datetime
    if pd.api.types.is_datetime64_any_dtype(df[x]):
        plt.gcf().autofmt_xdate()

    if save_path:
        plt.savefig(save_path, bbox_inches="tight")

    plt.show()

# Example DataFrame
dates = pd.date_range("2025-01-01", periods=30, freq="D")
df_example = pd.DataFrame({
    "date": dates,
    "sales": (100 + pd.Series(range(30))).astype(float),
    "visits": (200 + pd.Series(range(30)) * 1.5).astype(float),
})

# Single series
plot_line(df_example, x="date", y="sales", title="Daily Sales", rolling=3)

# Multiple series
plot_line(df_example, x="date", y=["sales", "visits"], title="Sales vs Visits", rolling=None)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_histogram(
    df: pd.DataFrame,
    column: str,
    bins: int = 20,
    title: str | None = None,
    xlabel: str | None = None,
    ylabel: str = "Frequency",
    density: bool = False,
    figsize=(8, 5),
    save_path: str | None = None,
):
    """
    Plot a histogram (frequency distribution) for a numeric column.

    Parameters
    ----------
    df : DataFrame
        Source data.
    column : str
        Column name to plot.
    bins : int
        Number of histogram bins.
    title : str | None
        Optional chart title.
    xlabel : str | None
        Optional x-axis label (defaults to column name).
    ylabel : str
        Label for y-axis.
    density : bool
        If True, shows probability density instead of raw counts.
    save_path : str | None
        If provided, saves the figure (e.g., 'histogram.png').
    """

    # Drop NaN values for cleaner plot
    data = df[column].dropna()

    plt.figure(figsize=figsize)
    plt.hist(data, bins=bins, edgecolor="black", alpha=0.7, density=density)

    plt.title(title or f"Distribution of {column}")
    plt.xlabel(xlabel or column)
    plt.ylabel(ylabel if not density else "Density")
    plt.grid(axis="y", linestyle="--", alpha=0.6)

    # Show key stats in console
    print(f"Column: {column}")
    print(f"Count: {len(data)} | Mean: {data.mean():.2f} | Std: {data.std():.2f} | Min: {data.min():.2f} | Max: {data.max():.2f}")

    if save_path:
        plt.savefig(save_path, bbox_inches="tight")

    plt.show()

np.random.seed(42)
df_example = pd.DataFrame({
    "age": np.random.normal(35, 10, 500).clip(0, 80)  # ages roughly 0–80
})

# Plot histogram
plot_histogram(df_example, column="age", bins=15, title="Age Distribution", xlabel="Age (years)")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# --- 5 functions from eda_core.py ---

def summarize_dataframe(df):
    rows = []
    for col in df.columns:
        s = df[col]
        row = {
            "column": col,
            "dtype": str(s.dtype),
            "null_%": s.isna().mean() * 100,
            "unique": s.nunique(dropna=True),
        }
        if pd.api.types.is_numeric_dtype(s):
            row.update({
                "min": s.min(),
                "max": s.max(),
                "mean": s.mean(),
                "std": s.std()
            })
        rows.append(row)
    summary_df = pd.DataFrame(rows)
    return summary_df.round(2)


def detect_outliers(df, z_thresh=3):
    numeric = df.select_dtypes(include=np.number)
    report = {}
    for col in numeric.columns:
        z = np.abs((numeric[col] - numeric[col].mean()) / numeric[col].std(ddof=0))
        outliers = (z > z_thresh).sum()
        report[col] = {"outlier_count": int(outliers),
                       "outlier_%": round(100*outliers/len(numeric), 2)}
    return pd.DataFrame(report).T


def compare_before_after(df1, df2):
    print(f"Rows before: {len(df1)} | after: {len(df2)}")
    print(f"Columns before: {len(df1.columns)} | after: {len(df2.columns)}")

    new_cols = set(df2.columns) - set(df1.columns)
    removed_cols = set(df1.columns) - set(df2.columns)
    print(f"Added columns: {new_cols}")
    print(f"Removed columns: {removed_cols}")

    null_diff = (df2.isna().sum() - df1.isna().sum())
    print("\nChange in null counts:")
    print(null_diff[null_diff != 0])


def profile_categories(df, top_n=10):
    cat_cols = df.select_dtypes(include="object").columns
    for col in cat_cols:
        print(f"\nColumn: {col}")
        vc = df[col].value_counts(dropna=False).head(top_n)
        pct = (vc / len(df) * 100).round(2)
        print(pd.DataFrame({"count": vc, "percent": pct}))


def save_clean_data(df, path):
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = f"{path.replace('.csv','')}_{ts}.csv"
    df.to_csv(out_path, index=False)
    print(f"Saved cleaned file: {out_path}")
    return out_path

df_summary = pd.DataFrame({
    "age": [25, 30, 40, np.nan, 50],
    "income": [50000, 52000, 51000, 200000, np.nan],
    "signed_up": [True, True, False, True, False],
    "city": ["NY", "LA", "NY", None, "SF"]
})

print("DataFrame:")
display(df_summary)

print("\nSummary:")
display(summarize_dataframe(df_summary))

np.random.seed(42)
x = np.concatenate([np.random.normal(10, 1, 99), np.array([1000])])
y = np.random.normal(0, 1, 100)
df_out = pd.DataFrame({"x": x, "y": y})

print("Outlier Report:")
display(detect_outliers(df_out))

df_before = pd.DataFrame({
    "id": [1, 2, 2, 3],
    "score": [10, 20, 20, np.nan],
    "group": ["A", "A", "A", "B"]
})

df_after = df_before.drop_duplicates().copy()
df_after["score"] = df_after["score"].fillna(0)
df_after["score2"] = df_after["score"] * 2

compare_before_after(df_before, df_after)

df_cat = pd.DataFrame({
    "color": ["red", "red", "blue", "green", "red", "blue", None],
    "segment": ["pro", "basic", "basic", "pro", "pro", "pro", "basic"]
})

profile_categories(df_cat)

# Reuse the df_after from earlier example
path = save_clean_data(df_after, "cleaned_data.csv")

# Confirm it worked
import os
print("File exists:", os.path.exists(path))
pd.read_csv(path).head()



In [14]:
import pandas as pd
import numpy as np
from scipy import stats

def analyze_numerical_stats(df: pd.DataFrame, normal_test: bool = True) -> pd.DataFrame:
    """
    Provide a detailed statistical analysis of all numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    normal_test : bool
        If True, runs Shapiro-Wilk normality test (up to 5000 samples).

    Returns
    -------
    pd.DataFrame
        Summary statistics per numeric column.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    results = []

    for col in numeric_cols:
        s = df[col].dropna()

        if s.empty:
            continue

        desc = {
            "column": col,
            "count": s.count(),
            "mean": s.mean(),
            "std": s.std(),
            "var": s.var(),
            "min": s.min(),
            "25%": s.quantile(0.25),
            "50% (median)": s.median(),
            "75%": s.quantile(0.75),
            "max": s.max(),
            "iqr": s.quantile(0.75) - s.quantile(0.25),
            "skew": s.skew(),
            "kurtosis": s.kurt(),
        }

        if normal_test and len(s) >= 3:
            stat, p = stats.shapiro(s.sample(min(len(s), 5000), random_state=0))
            desc.update({"shapiro_stat": stat, "shapiro_p": p})
        results.append(desc)

    df_stats = pd.DataFrame(results)
    df_stats = df_stats.round(4)
    return df_stats

np.random.seed(0)
df_stats_demo = pd.DataFrame({
    "age": np.random.normal(35, 10, 1000),
    "income": np.random.lognormal(mean=10, sigma=0.4, size=1000),
    "score": np.random.uniform(50, 100, 1000)
})

print("Numeric Summary:")
display(analyze_numerical_stats(df_stats_demo))


Numeric Summary:


Unnamed: 0,column,count,mean,std,var,min,25%,50% (median),75%,max,iqr,skew,kurtosis,shapiro_stat,shapiro_p
0,age,1000,34.5474,9.8753,97.521,4.5386,28.0158,34.4197,41.0695,62.5936,13.0537,0.0339,-0.041,0.9986,0.5912
1,income,1000,23881.5762,9663.4611,93382480.0,6648.5553,16957.4096,22257.2891,28287.9742,78306.8145,11330.5646,1.2319,2.3161,0.9249,0.0
2,score,1000,75.0226,14.403,207.4471,50.0869,63.0415,74.39,87.9329,99.9982,24.8914,0.0258,-1.2161,0.9536,0.0


In [15]:
# eda_quality.py
from __future__ import annotations
import os
import time
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# ---------- Data Quality & Structure ----------

def check_missing_values(df: pd.DataFrame, threshold: float = 0.30) -> pd.Series:
    """
    Report columns where the proportion of missing values exceeds `threshold`.
    Returns a Series indexed by column with the null ratio (descending).
    """
    null_ratio = df.isna().mean()
    flagged = null_ratio[null_ratio > threshold].sort_values(ascending=False)
    print(f"Columns > {threshold*100:.0f}% missing:")
    print(flagged if not flagged.empty else "None")
    return flagged


def check_constant_columns(df: pd.DataFrame) -> List[str]:
    """
    Return a list of columns with a single unique value (including NaN-only columns).
    """
    constants = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
    print("Constant columns:", constants if constants else "None")
    return constants


def check_high_cardinality(df: pd.DataFrame, limit: int = 50) -> Dict[str, int]:
    """
    Find object/category columns with unique count > `limit`.
    Returns {column: nunique}.
    """
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    high = {c: int(df[c].nunique(dropna=False)) for c in cat_cols if df[c].nunique(dropna=False) > limit}
    print("High-cardinality columns:")
    if high:
        print(pd.Series(high).sort_values(ascending=False))
    else:
        print("None")
    return high


# ---------- Correlation & Relationships ----------

def correlation_matrix(
    df: pd.DataFrame,
    method: str = "pearson",
    plot: bool = True,
    figsize: Tuple[int, int] = (7, 6),
) -> pd.DataFrame:
    """
    Compute numeric correlation matrix and (optionally) plot a heatmap.
    method: 'pearson' | 'spearman' | 'kendall'
    """
    num = df.select_dtypes(include=np.number)
    corr = num.corr(method=method)
    if plot and not corr.empty:
        plt.figure(figsize=figsize)
        plt.imshow(corr.values, interpolation="none")
        plt.title(f"Correlation ({method})")
        plt.colorbar()
        ticks = range(len(corr.columns))
        plt.xticks(ticks, corr.columns, rotation=45, ha="right")
        plt.yticks(ticks, corr.columns)
        plt.tight_layout()
        plt.show()
    return corr


def cramers_v(df: pd.DataFrame, col1: str, col2: str) -> float:
    """
    Compute Cramér's V for association between two categorical columns.
    Uses chi-square with expected frequencies; no external dependencies.
    """
    table = pd.crosstab(df[col1], df[col2]).values.astype(float)
    n = table.sum()
    row_sums = table.sum(axis=1, keepdims=True)
    col_sums = table.sum(axis=0, keepdims=True)
    expected = row_sums @ col_sums / n
    # Avoid division by zero
    with np.errstate(divide="ignore", invalid="ignore"):
        chi2 = np.nansum((table - expected) ** 2 / np.where(expected == 0, np.nan, expected))
    k = min(table.shape)  # smaller dimension
    if n == 0 or k <= 1:
        return 0.0
    v = np.sqrt(chi2 / (n * (k - 1)))
    return float(v)


# ---------- Visualization ----------

def pairwise_scatter(df: pd.DataFrame, cols: Optional[List[str]] = None, figsize: Tuple[int, int] = (8, 8)):
    """
    Quick scatter-matrix for numeric relationships. `cols` limits which numeric columns.
    """
    from pandas.plotting import scatter_matrix
    num = df.select_dtypes(include=np.number)
    if cols:
        num = num[[c for c in cols if c in num.columns]]
    if num.shape[1] < 2:
        print("Not enough numeric columns to plot.")
        return
    axarr = scatter_matrix(num, figsize=figsize, diagonal="hist")
    plt.tight_layout()
    plt.show()
    return axarr


def plot_boxplots(df: pd.DataFrame, by: Optional[str] = None, figsize: Tuple[int, int] = (10, 6)):
    """
    Boxplots for all numeric columns; if `by` is provided, group by that categorical column.
    """
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    if not num_cols:
        print("No numeric columns to plot.")
        return
    plt.figure(figsize=figsize)
    if by and by in df.columns:
        df.boxplot(column=num_cols, by=by, grid=True)
        plt.suptitle("")  # cleaner title
        plt.title(f"Boxplots grouped by '{by}'")
    else:
        df[num_cols].plot(kind="box")
        plt.title("Boxplots")
    plt.tight_layout()
    plt.show()


# ---------- Utility & Safety ----------

def memory_usage_report(df: pd.DataFrame) -> pd.DataFrame:
    """
    Show memory use by column (MB) and total. Returns a DataFrame.
    """
    mem = (df.memory_usage(deep=True) / 1e6).to_frame("MB").sort_values("MB", ascending=False)
    print(mem)
    print(f"Total: {mem['MB'].sum():.2f} MB")
    return mem


def convert_dtypes_safely(
    df: pd.DataFrame,
    to_category_max_ratio: float = 0.5,
    downcast_int: bool = True,
    downcast_float: bool = True,
    try_datetime: bool = True,
) -> pd.DataFrame:
    """
    Attempt safe dtype conversions:
    - objects with unique_ratio <= to_category_max_ratio -> category
    - downcast ints/floats to smaller types
    - try parsing obvious datetime-like object columns
    Returns a new DataFrame copy.
    """
    out = df.copy()

    # Try datetime parse for object columns
    if try_datetime:
        obj_cols = out.select_dtypes(include=["object"]).columns
        for c in obj_cols:
            sample = out[c].dropna().astype(str).head(50)
            if sample.empty:
                continue
            # heuristic: presence of '-' or '/' or ':' often indicates date/time
            if sample.str.contains(r"[-/:]").mean() > 0.3:
                try:
                    parsed = pd.to_datetime(out[c], errors="raise", utc=False, infer_datetime_format=True)
                    out[c] = parsed
                except Exception:
                    pass

    # Convert suitable objects to category
    obj_cols = out.select_dtypes(include=["object"]).columns
    n = len(out)
    for c in obj_cols:
        uniq = out[c].nunique(dropna=False)
        if n > 0 and (uniq / n) <= to_category_max_ratio:
            out[c] = out[c].astype("category")

    # Downcast numerics
    if downcast_int:
        for c in out.select_dtypes(include=["int", "int64", "Int64"]).columns:
            out[c] = pd.to_numeric(out[c], downcast="integer")
    if downcast_float:
        for c in out.select_dtypes(include=["float", "float64"]).columns:
            out[c] = pd.to_numeric(out[c], downcast="float")

    return out


def log_dataframe_shape(df: pd.DataFrame, label: str = "") -> pd.DataFrame:
    """
    Print '<label>: rows x cols' and return df (so it can be chained).
    """
    print(f"{label}: {df.shape[0]} × {df.shape[1]}")
    return df


@contextmanager
def timeit_context(label: str = "elapsed"):
    """
    Context manager to time a code block.
    Usage:
        with timeit_context("cleaning"):
            ... your code ...
    """
    t0 = time.perf_counter()
    try:
        yield
    finally:
        dt = time.perf_counter() - t0
        print(f"{label}: {dt:.4f} s")


def timeit_decorator(label: Optional[str] = None):
    """
    Decorator version of timeit_context for functions.
    Usage:
        @timeit_decorator("impute")
        def impute(...):
            ...
    """
    def _wrap(fn):
        def _inner(*args, **kwargs):
            l = label or fn.__name__
            t0 = time.perf_counter()
            try:
                return fn(*args, **kwargs)
            finally:
                dt = time.perf_counter() - t0
                print(f"{l}: {dt:.4f} s")
        return _inner
    return _wrap


In [None]:
# -------------------------------------------------------------------
# 1️⃣  Generate a mixed dataset for demonstration
# -------------------------------------------------------------------
np.random.seed(42)
df = pd.DataFrame({
    "id": range(1, 201),
    "age": np.random.normal(35, 10, 200).round(1),
    "income": np.random.lognormal(mean=10, sigma=0.4, size=200).round(2),
    "group": np.random.choice(["A", "B", "C"], 200),
    "gender": np.random.choice(["M", "F"], 200),
    "city": np.random.choice(["NY", "LA", "SF", "TX"], 200),
})

# Add missing values and outlier
df.loc[5:10, "income"] = np.nan
df.loc[0, "age"] = 100
df["constant_col"] = 1
df["category_high"] = [f"user_{i}" for i in range(200)]

print("\n=== SAMPLE DATA ===")
display(df.head())

# -------------------------------------------------------------------
# 2️⃣  Core 6
# -------------------------------------------------------------------
print("\n=== summarize_dataframe ===")
display(summarize_dataframe(df))

print("\n=== detect_outliers ===")
display(detect_outliers(df))

print("\n=== compare_before_after ===")
df_clean = df.drop(columns=["constant_col"])
compare_before_after(df, df_clean)

print("\n=== profile_categories ===")
profile_categories(df)

print("\n=== save_clean_data ===")
out_path = save_clean_data(df_clean, "eda_demo_clean.csv")
print("File saved to:", out_path)

print("\n=== analyze_numerical_stats ===")
display(analyze_numerical_stats(df))

# -------------------------------------------------------------------
# 3️⃣  Tier-2 utilities
# -------------------------------------------------------------------
print("\n=== check_missing_values ===")
check_missing_values(df, threshold=0.05)

print("\n=== check_constant_columns ===")
check_constant_columns(df)

print("\n=== check_high_cardinality ===")
check_high_cardinality(df, limit=10)

print("\n=== correlation_matrix ===")
corr = correlation_matrix(df, method="pearson", plot=True)
display(corr)

print("\n=== cramers_v ===")
print("Cramér’s V (group vs gender):", round(cramers_v(df, "group", "gender"), 4))

print("\n=== pairwise_scatter ===")
pairwise_scatter(df[["age", "income"]])

print("\n=== plot_boxplots ===")
plot_boxplots(df, by="group")

print("\n=== memory_usage_report (before) ===")
memory_usage_report(df)

print("\n=== convert_dtypes_safely & memory_usage_report (after) ===")
df_opt = convert_dtypes_safely(df)
memory_usage_report(df_opt)

print("\n=== log_dataframe_shape ===")
log_dataframe_shape(df_opt, "Optimized DataFrame")

# Timing examples
print("\n=== timeit_context ===")
with timeit_context("groupby mean"):
    _ = df.groupby("group")["income"].mean()

@timeit_decorator("demo_function")
def demo_fn(x):
    return x ** 2

print("\n=== timeit_decorator ===")
demo_fn(10)

print("\n✅ All EDA functions demonstrated successfully.")

In [17]:
# common_joins.py
from __future__ import annotations
import pandas as pd


def join_datasets(
    left: pd.DataFrame,
    right: pd.DataFrame,
    on: list[str] | str,
    how: str = "inner",
    validate: bool = True,
    suffixes: tuple[str, str] = ("_x", "_y"),
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Safe, logged join operation with optional key validation and row-change summary.

    Parameters
    ----------
    left, right : DataFrame
        Input DataFrames.
    on : list[str] or str
        Column(s) to join on.
    how : str
        Type of join ('inner', 'left', 'right', 'outer', 'cross').
    validate : bool
        If True, check for duplicate keys and print diagnostics.
    suffixes : tuple(str, str)
        Suffixes for overlapping column names.
    verbose : bool
        Print row counts and key diagnostics.

    Returns
    -------
    DataFrame
        Result of the join.
    """
    if isinstance(on, str):
        on = [on]

    if validate:
        # Check for duplicate keys
        left_dupes = left.duplicated(subset=on, keep=False).sum()
        right_dupes = right.duplicated(subset=on, keep=False).sum()
        if verbose:
            print(f"Left duplicates on {on}: {left_dupes}")
            print(f"Right duplicates on {on}: {right_dupes}")
        if left_dupes > 0 or right_dupes > 0:
            print("⚠️  Warning: join keys are not unique — may cause row multiplication.")

    left_rows, right_rows = len(left), len(right)
    result = pd.merge(left, right, on=on, how=how, suffixes=suffixes)
    joined_rows = len(result)

    if verbose:
        print(f"\nJoin type: {how}")
        print(f"Left rows: {left_rows}, Right rows: {right_rows}, Result rows: {joined_rows}")
        if how in ("inner", "left", "right"):
            unmatched_left = (
                left_rows
                - result[on].drop_duplicates().merge(left[on].drop_duplicates(), on=on, how="inner").shape[0]
            )
            print(f"Unmatched (approx): {unmatched_left}")
        overlap = set(left.columns) & set(right.columns) - set(on)
        if overlap:
            print(f"Overlapping columns renamed with suffixes {suffixes}: {overlap}")

    return result


def join_summary(df_left: pd.DataFrame, df_right: pd.DataFrame, on: list[str] | str) -> pd.DataFrame:
    """
    Quick visual summary of key overlap before joining.

    Returns a DataFrame with counts:
        - only_in_left
        - only_in_right
        - in_both
    """
    if isinstance(on, str):
        on = [on]

    left_keys = pd.DataFrame(df_left[on].drop_duplicates())
    right_keys = pd.DataFrame(df_right[on].drop_duplicates())

    both = left_keys.merge(right_keys, on=on, how="inner").shape[0]
    only_left = left_keys.shape[0] - both
    only_right = right_keys.shape[0] - both

    summary = pd.DataFrame(
        {"only_in_left": [only_left], "only_in_right": [only_right], "in_both": [both]},
        index=["key_overlap"],
    )
    print(summary)
    return summary


In [19]:
import pandas as pd

# Example DataFrames
left = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "name": ["A", "B", "C", "D"],
    "value": [10, 20, 30, 40],
})

right = pd.DataFrame({
    "id": [3, 4, 4, 5],
    "region": ["West", "East", "East", "North"],
    "sales": [300, 400, 401, 500],
})

# Preview key overlap
join_summary(left, right, on="id")

# Safe join
merged = join_datasets(left, right, on="id", how="left")
print("\nMerged Result:")
display(merged)


             only_in_left  only_in_right  in_both
key_overlap             2              1        2
Left duplicates on ['id']: 0
Right duplicates on ['id']: 2

Join type: left
Left rows: 4, Right rows: 4, Result rows: 5
Unmatched (approx): 0

Merged Result:


Unnamed: 0,id,name,value,region,sales
0,1,A,10,,
1,2,B,20,,
2,3,C,30,West,300.0
3,4,D,40,East,400.0
4,4,D,40,East,401.0


In [None]:
# common_splits.py
from __future__ import annotations
import pandas as pd
from typing import List, Dict, Tuple, Optional


def split_by_columns(
    df: pd.DataFrame,
    include_cols: List[str],
    *,
    suffix_left: str = "_part1",
    suffix_right: str = "_part2",
    verbose: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into two parts based on a list of included columns.

    Parameters
    ----------
    df : DataFrame
        The full dataset.
    include_cols : list of str
        Columns to keep in the first output DataFrame.
    suffix_left, suffix_right : str
        Labels for printing.
    verbose : bool
        If True, print information about the split.

    Returns
    -------
    Tuple of DataFrames: (df_selected, df_remaining)
    """
    include_cols = [c for c in include_cols if c in df.columns]
    exclude_cols = [c for c in df.columns if c not in include_cols]

    left = df[include_cols].copy()
    right = df[exclude_cols].copy()

    if verbose:
        print(f"Split by columns:")
        print(f"  {suffix_left}: {len(left.columns)} columns → {include_cols[:6]}{'...' if len(include_cols) > 6 else ''}")
        print(f"  {suffix_right}: {len(right.columns)} columns → {exclude_cols[:6]}{'...' if len(exclude_cols) > 6 else ''}")

    return left, right


def split_by_condition(
    df: pd.DataFrame,
    condition: pd.Series | pd.Index | list | Tuple,
    *,
    suffix_true: str = "_match",
    suffix_false: str = "_nonmatch",
    verbose: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into two subsets based on a Boolean condition.

    Example:
        active, inactive = split_by_condition(df, df["status"] == "active")

    Returns
    -------
    (df_true, df_false)
    """
    df_true = df[condition].copy()
    df_false = df[~condition].copy()

    if verbose:
        print(f"Split by condition:")
        print(f"  {suffix_true}: {len(df_true)} rows matched")
        print(f"  {suffix_false}: {len(df_false)} rows did not match")

    return df_true, df_false


def split_by_dtype(
    df: pd.DataFrame,
    *,
    verbose: bool = True
) -> Dict[str, pd.DataFrame]:
    """
    Split a DataFrame into numeric, categorical, datetime, and other dtype subsets.

    Returns
    -------
    dict of {dtype_group: DataFrame}
    """
    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    datetime_cols = df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

    other_cols = [c for c in df.columns if c not in numeric_cols + categorical_cols + datetime_cols]

    splits = {
        "numeric": df[numeric_cols],
        "categorical": df[categorical_cols],
        "datetime": df[datetime_cols],
        "other": df[other_cols],
    }

    if verbose:
        print(f"Split by dtype:")
        for k, v in splits.items():
            print(f"  {k:<12} → {len(v.columns)} columns")

    return splits


def split_by_unique_keys(
    df: pd.DataFrame,
    key_cols: List[str],
    *,
    verbose: bool = True
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into unique and duplicate records based on key columns.

    Example:
        unique, dupes = split_by_unique_keys(df, ["id"])

    Returns
    -------
    (unique_records_df, duplicate_records_df)
    """
    dup_mask = df.duplicated(subset=key_cols, keep=False)
    unique_df = df[~dup_mask].copy()
    dup_df = df[dup_mask].copy()

    if verbose:
        print(f"Split by uniqueness on {key_cols}:")
        print(f"  unique rows → {len(unique_df)}")
        print(f"  duplicate rows → {len(dup_df)}")

    return unique_df, dup_df

# Create sample data
df = pd.DataFrame({
    "id": [1, 2, 2, 3, 4],
    "name": ["A", "B", "B", "C", "D"],
    "age": [23, 45, 45, 31, 52],
    "salary": [50000, 70000, 70000, 62000, 80000],
    "status": ["active", "inactive", "inactive", "active", "active"],
    "joined": pd.date_range("2023-01-01", periods=5)
})

# Split by columns
df_personal, df_job = split_by_columns(df, ["id", "name", "age"])

# Split by condition
active, inactive = split_by_condition(df, df["status"] == "active")

# Split by dtype
splits = split_by_dtype(df)

# Split by unique keys
unique_rows, dupes = split_by_unique_keys(df, ["id"])


In [None]:
# fake_table_generator.py
!pip install Faker
from __future__ import annotations
import numpy as np
import pandas as pd
import random
from faker import Faker
from typing import Dict, Any, Optional

fake = Faker()


def create_fake_table(
    schema: Dict[str, Dict[str, Any]],
    rows: int = 100,
    seed: Optional[int] = 42,
    save_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    Create a fake table (DataFrame) from a schema definition.

    Parameters
    ----------
    schema : dict
        Format:
        {
          "id": {"type": "int", "min": 1, "max": 1000},
          "name": {"type": "name"},
          "email": {"type": "email"},
          "age": {"type": "int", "min": 18, "max": 65},
          "salary": {"type": "float", "mean": 70000, "std": 15000},
          "country": {"type": "choice", "values": ["US", "UK", "CA", "AU"]},
          "join_date": {"type": "date", "start": "2020-01-01", "end": "2024-12-31"},
        }

    rows : int
        Number of rows to generate.
    seed : int | None
        Random seed for reproducibility.
    save_path : str | None
        If provided, saves output to CSV or Parquet depending on extension.

    Returns
    -------
    pd.DataFrame
        The generated fake dataset.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        Faker.seed(seed)

    data = {}

    for col, spec in schema.items():
        t = spec.get("type", "str").lower()

        if t in ("int", "integer"):
            data[col] = np.random.randint(spec.get("min", 0), spec.get("max", 100), size=rows)
        elif t in ("float", "double"):
            data[col] = np.random.normal(spec.get("mean", 0.0), spec.get("std", 1.0), size=rows).round(2)
        elif t in ("str", "string", "text"):
            length = spec.get("length", 8)
            data[col] = ["".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=length)) for _ in range(rows)]
        elif t == "choice":
            values = spec.get("values", ["A", "B", "C"])
            probs = spec.get("probs", None)
            data[col] = np.random.choice(values, size=rows, p=probs)
        elif t == "bool":
            p_true = spec.get("p_true", 0.5)
            data[col] = np.random.choice([True, False], size=rows, p=[p_true, 1 - p_true])
        elif t == "date":
            start = pd.to_datetime(spec.get("start", "2020-01-01"))
            end = pd.to_datetime(spec.get("end", "2024-12-31"))
            delta = (end - start).days
            data[col] = [start + pd.Timedelta(days=random.randint(0, delta)) for _ in range(rows)]
        elif t == "name":
            data[col] = [fake.name() for _ in range(rows)]
        elif t == "email":
            data[col] = [fake.email() for _ in range(rows)]
        elif t == "address":
            data[col] = [fake.city() for _ in range(rows)]
        elif t == "uuid":
            data[col] = [fake.uuid4() for _ in range(rows)]
        else:
            raise ValueError(f"Unsupported type '{t}' for column '{col}'")

    df = pd.DataFrame(data)

    if save_path:
        if save_path.endswith(".csv"):
            df.to_csv(save_path, index=False)
            print(f"Saved CSV → {save_path}")
        elif save_path.endswith(".parquet"):
            df.to_parquet(save_path, index=False)
            print(f"Saved Parquet → {save_path}")
        else:
            print("Unknown file extension; not saved.")

    return df

schema = {
    "id": {"type": "int", "min": 1, "max": 5000},
    "name": {"type": "name"},
    "email": {"type": "email"},
    "age": {"type": "int", "min": 18, "max": 70},
    "salary": {"type": "float", "mean": 75000, "std": 12000},
    "country": {"type": "choice", "values": ["US", "UK", "CA", "AU"], "probs": [0.4, 0.3, 0.2, 0.1]},
    "is_active": {"type": "bool", "p_true": 0.7},
    "join_date": {"type": "date", "start": "2021-01-01", "end": "2024-12-31"},
}

df_fake = create_fake_table(schema, rows=10)
print(df_fake.head())

In [None]:
# visual_piechart.py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_pie_chart(
    df: pd.DataFrame,
    column: str,
    *,
    title: str | None = None,
    top_n: int | None = None,
    autopct: bool = True,
    figsize=(6, 6),
    startangle: int = 90,
    colors: list[str] | None = None,
    save_path: str | None = None,
):
    """
    Plot a pie chart from a categorical column.

    Parameters
    ----------
    df : DataFrame
        Input data.
    column : str
        Column name to visualize (categorical or small number of unique values).
    title : str
        Optional chart title.
    top_n : int | None
        If provided, show only top N categories (grouping others as 'Other').
    autopct : bool
        Whether to show percentages on the chart.
    figsize : tuple
        Figure size in inches.
    startangle : int
        Starting rotation for pie slices.
    colors : list[str]
        Optional color list.
    save_path : str | None
        Optional file path to save figure (PNG, JPG, etc.).
    """
    # Compute value counts
    counts = df[column].value_counts()

    # Collapse smaller categories into "Other" if top_n specified
    if top_n and len(counts) > top_n:
        others_sum = counts.iloc[top_n:].sum()
        counts = counts.iloc[:top_n]
        counts["Other"] = others_sum

    labels = counts.index.tolist()
    values = counts.values

    plt.figure(figsize=figsize)
    plt.pie(
        values,
        labels=labels,
        autopct="%1.1f%%" if autopct else None,
        startangle=startangle,
        colors=colors,
    )

    plt.title(title or f"Distribution of {column}")
    plt.axis("equal")  # Equal aspect ratio ensures a perfect circle.

    if save_path:
        plt.savefig(save_path, bbox_inches="tight")
        print(f"Saved pie chart → {save_path}")

    plt.show()

    # Print raw value summary for inspection
    print("=== Category Counts ===")
    print(counts)

# --- Create test data ---
np.random.seed(42)
df_test = pd.DataFrame({
    "region": np.random.choice(
        ["North", "South", "East", "West", "Central"],
        size=200,
        p=[0.25, 0.25, 0.2, 0.2, 0.1]
    )
})

# --- Plot pie chart ---
plot_pie_chart(
    df_test,
    column="region",
    title="Regional Distribution (Sample Data)",
    top_n=4,
)