<a href="https://colab.research.google.com/github/jmcconne100/Pandas_Notebook_Project/blob/main/my_data_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files, drive

def load_csv(method="upload", source=None, concat=False, **read_csv_kwargs):
    """
    Load CSVs in Colab using one of three methods:
      - "upload": upload from local machine
      - "drive": read from Google Drive
      - "web": read from URL(s)

    Args:
        method: "upload" | "drive" | "web" (case-insensitive)
        source: one path/URL string, or an iterable of them; required for
            "drive" and "web", ignored for "upload"
        concat: if True, combine all CSVs into one DataFrame
        **read_csv_kwargs: passed to pandas.read_csv(); these override the
            built-in default on_bad_lines="skip"

    Returns:
        A DataFrame (if concat=True or one file) or dict of {name: DataFrame}.
        Names are the last "/"-separated component of each path/URL; when two
        sources share that component, the later one is keyed by its full
        path/URL so it does not overwrite the earlier one.

    Raises:
        ValueError: if method is not recognised, if "drive"/"web" is used
            without a source, or if no CSV was loaded at all.

    Examples:
        df1 = load_csv("upload")

        path = "/content/drive/MyDrive/data/UScomments.csv"
        df2 = load_csv("drive", path)

        url = "https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv"
        df3 = load_csv("web", url)
    """
    # Caller-supplied options win over the default.
    kwargs = {"on_bad_lines": "skip", **read_csv_kwargs}

    method = method.lower()
    if method not in ("upload", "drive", "web"):
        raise ValueError("method must be one of: 'upload', 'drive', 'web'")

    dfs = {}

    if method == "upload":
        uploaded = files.upload()
        # Each uploaded file is read back by its name, exactly as before.
        for name in uploaded:
            dfs[name] = pd.read_csv(name, **kwargs)
    else:
        if source is None:
            # Previously this surfaced as "'NoneType' object is not iterable".
            raise ValueError(f"source is required when method is '{method}'")
        locations = [source] if isinstance(source, str) else list(source)

        if method == "drive":
            drive.mount("/content/drive", force_remount=False)

        for location in locations:
            name = location.split("/")[-1]
            # Avoid silently overwriting an earlier file with the same basename.
            key = location if name in dfs else name
            dfs[key] = pd.read_csv(location, **kwargs)

    if not dfs:
        # Covers a cancelled upload dialog and an empty source list.
        raise ValueError("No CSV files were loaded")

    if concat:
        return pd.concat(list(dfs.values()), ignore_index=True)
    return dfs if len(dfs) > 1 else next(iter(dfs.values()))

In [None]:
method = "upload"  # one of "upload", "drive", or "web"
# "drive" needs a Drive file path and "web" needs a URL (a list of either also
# works); "upload" ignores `source`.
source = None

# NOTE: when several files are loaded, load_csv returns a dict of DataFrames
# unless concat=True is passed, so .head() below assumes a single file.
df1 = load_csv(method, source)
df1.head()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

text = "I love Python and data analysis but I hate debugging errors sometimes."
stop_words = set(stopwords.words('english'))

# Keep only the whitespace-separated tokens whose lowercase form is not a stop word
filtered_text = ' '.join(
    filter(lambda token: token.lower() not in stop_words, text.split())
)

# Build the word cloud from the remaining words
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(filtered_text)

# Show the cloud as an image with the axes hidden
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# pip installs (run once)
!pip install emoji regex plotly pandas

import pandas as pd
import emoji
import regex as re
from collections import Counter
import plotly.express as px

# Robust grapheme splitter so flags / family sequences stay intact
# (\X matches one extended grapheme cluster in the `regex` module).
GRAPHEME = re.compile(r'\X', re.UNICODE)

def extract_emojis(text: str) -> list[str]:
    """Return the emoji grapheme clusters found in *text*, in order of appearance.

    A cluster is kept when the cluster as a whole is a key of
    ``emoji.EMOJI_DATA`` or when at least one of its code points is. The
    whole-cluster check is what catches sequences such as flags, whose
    individual code points are presumably not listed on their own in
    ``EMOJI_DATA`` (verify against the installed emoji version).

    Non-string input (e.g. NaN from a DataFrame column) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    known = emoji.EMOJI_DATA
    return [
        g for g in GRAPHEME.findall(text)
        if g in known or any(ch in known for ch in g)
    ]

# Sample messages (swap in your own corpus)
messages = [
    "Love this! 😍🔥",
    "Hahaha 😂😂",
    "Ok 👍🏽👍🏽 meeting at 3pm 🕒",
    "New PR merged ✅🚀🚀🚀🚀🚀",
    "Ugh… Mondays 😒☕",
    "Flags work too 🇺🇸🇨🇦 😍😍😍",
    "🙂🙂🙂",
    "🤣😔😔😔😔"
]

# One flat list with every emoji occurrence across all messages
all_emojis = [e for found in map(extract_emojis, messages) for e in found]

# Occurrence count per emoji, most frequent first
freq = Counter(all_emojis)
df_freq = pd.DataFrame(list(freq.items()), columns=["emoji", "count"])
df_freq = df_freq.sort_values("count", ascending=False)

# Bar chart of the 20 most frequent emojis, with the count printed above each bar
top_emojis = df_freq.iloc[:20]
fig = px.bar(top_emojis, x="emoji", y="count", text="count", title="Top Emojis")
fig.update_traces(textposition="outside")
axis_titles = dict(xaxis_title="Emoji", yaxis_title="Count")
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Upload several CSV files and combine them into one DataFrame

from google.colab import files
import pandas as pd
import os

uploaded = files.upload()  # opens a file picker dialog; keyed by file name

# Combine only the files picked in this upload. Listing the working directory
# instead would also pull in CSVs left over from earlier uploads or other cells.
csv_files = [name for name in uploaded if name.lower().endswith('.csv')]
print("Found CSVs:", csv_files)

if not csv_files:
    # pd.concat([]) would otherwise fail with a less helpful message.
    raise ValueError("No CSV files were uploaded; nothing to combine.")

# Files are read back by name from the working directory.
dfs = [pd.read_csv(f, on_bad_lines='skip', low_memory=False) for f in csv_files]

df_all = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(dfs)} CSVs, combined shape: {df_all.shape}")
df_all.head()

In [None]:
# Box Plot Script

# Install if needed
!pip install plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# ----- Generate Example Data -----
np.random.seed(42)

departments = ['Sales', 'Marketing', 'Engineering', 'HR']
n_per_dept = 30

# 30 normally distributed scores per department, each with its own mean/spread
data = {
    'department': np.repeat(departments, n_per_dept),
    'score': np.concatenate([
        np.random.normal(75, 8, n_per_dept),   # Sales
        np.random.normal(70, 10, n_per_dept),  # Marketing
        np.random.normal(85, 5, n_per_dept),   # Engineering
        np.random.normal(65, 7, n_per_dept)    # HR
    ])
}

df = pd.DataFrame(data)
print(df.head())

# ----- Matplotlib Box Plot -----
# DataFrame.boxplot(by=...) opens its own figure, so the size is passed to it
# directly; a preceding plt.figure() call would only leave an empty extra figure.
df.boxplot(column='score', by='department', grid=False, patch_artist=True,
           figsize=(8, 5))
plt.title('Box Plot of Scores by Department')
plt.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
plt.xlabel('Department')
plt.ylabel('Score')
plt.show()

# ----- Interactive Plotly Box Plot -----
fig = px.box(df, x='department', y='score', color='department',
             title='Interactive Box Plot of Scores by Department',
             points='all')  # 'all' adds jittered individual points
fig.show()


In [None]:
# Install seaborn & plotly if needed
# !pip install seaborn plotly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# ----- Generate Example Data -----
np.random.seed(42)
n = 100

# Columns are drawn in this order so the seeded values stay reproducible
df = pd.DataFrame(dict(
    age=np.random.randint(20, 60, n),
    experience=np.random.randint(0, 30, n),
    hours_per_week=np.random.randint(30, 60, n),
    projects_completed=np.random.randint(1, 10, n),
    score=np.random.normal(75, 10, n),
))

# Add a correlated feature: performance_index depends on hours + projects, plus noise
weighted_effort = 0.4 * df['hours_per_week'] + 0.3 * df['projects_completed']
df['performance_index'] = weighted_effort + np.random.normal(0, 5, n)

print(df.head())

# ----- Compute Correlation Matrix -----
corr = df.corr(numeric_only=True)
print("\nCorrelation Matrix:\n", corr)

# ----- Seaborn Heatmap (Static) -----
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5,
            ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap (Seaborn)")
plt.show()

# ----- Plotly Heatmap (Interactive) -----
imshow_options = dict(
    text_auto=".2f",
    color_continuous_scale='RdBu_r',
    title="Interactive Correlation Heatmap (Plotly)",
)
fig = px.imshow(corr, **imshow_options)
fig.update_layout(xaxis_title="Features", yaxis_title="Features")
fig.show()


In [None]:
# Install if needed
!pip install seaborn plotly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# ----- Generate Synthetic Data -----
np.random.seed(42)
n = 100

# Linear trend (slope 2.5) with Gaussian noise added on top
x = np.random.uniform(1, 100, n)
noise = np.random.normal(0, 25, n)
y = 2.5 * x + noise

df = pd.DataFrame({"hours_studied": x, "exam_score": y})

print(df.head())

# ----- Static Regression Plot (Seaborn) -----
plt.figure(figsize=(7,5))
reg_ax = sns.regplot(
    data=df,
    x="hours_studied",
    y="exam_score",
    scatter_kws=dict(alpha=0.7),
    line_kws=dict(color='red'),
)
reg_ax.set_title("Regression Plot: Hours Studied vs Exam Score")
reg_ax.set_xlabel("Hours Studied")
reg_ax.set_ylabel("Exam Score")
plt.show()

# ----- Optional Faceted Plot (Seaborn lmplot) -----
# A random two-level 'class' column stands in for a real categorical variable
df['class'] = np.random.choice(['A','B'], size=n)
lm_options = dict(x="hours_studied", y="exam_score", hue="class", aspect=1.2)
sns.lmplot(data=df, **lm_options)
plt.title("Regression Plot by Class")
plt.show()

# ----- Interactive Plotly Version -----
scatter_options = dict(
    x="hours_studied",
    y="exam_score",
    color="class",
    trendline="ols",  # draws a fitted regression line per class
    title="Interactive Regression Plot (Plotly)",
)
fig = px.scatter(df, **scatter_options)
fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [None]:
# Install if needed
!pip install seaborn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# ----- Example Data -----
np.random.seed(42)
departments = ['Sales', 'Marketing', 'Engineering', 'HR', 'Finance']
avg_scores = np.random.randint(60, 95, len(departments))

df = pd.DataFrame({
    'department': departments,
    'average_score': avg_scores
})

print(df)

# ----- Vertical Bar Chart (Matplotlib) -----
plt.figure(figsize=(7,5))
plt.bar(df['department'], df['average_score'], color='skyblue')
plt.title("Average Score by Department (Vertical)")
plt.xlabel("Department")
plt.ylabel("Average Score")
plt.show()

# ----- Horizontal Bar Chart (Matplotlib) -----
plt.figure(figsize=(7,5))
plt.barh(df['department'], df['average_score'], color='lightcoral')
plt.title("Average Score by Department (Horizontal)")
plt.xlabel("Average Score")
plt.ylabel("Department")
plt.show()

# ----- Vertical Bar Chart (Seaborn) -----
# seaborn 0.13 deprecates `palette` without `hue`, so the category is also
# mapped to hue. dodge=False keeps one full-width bar per department, and the
# redundant legend (if one was drawn) is removed.
plt.figure(figsize=(7,5))
ax_v = sns.barplot(data=df, x='department', y='average_score',
                   hue='department', palette='Blues_d', dodge=False)
if ax_v.get_legend() is not None:
    ax_v.get_legend().remove()
plt.title("Seaborn Vertical Bar Chart")
plt.show()

# ----- Horizontal Bar Chart (Seaborn) -----
plt.figure(figsize=(7,5))
ax_h = sns.barplot(data=df, y='department', x='average_score',
                   hue='department', palette='Reds_d', dodge=False)
if ax_h.get_legend() is not None:
    ax_h.get_legend().remove()
plt.title("Seaborn Horizontal Bar Chart")
plt.show()

# ----- Interactive Plotly Bar Charts -----
# Vertical
fig_v = px.bar(df, x='department', y='average_score',
               title='Interactive Vertical Bar Chart (Plotly)',
               color='department', text='average_score')
fig_v.update_traces(textposition='outside')
fig_v.show()

# Horizontal
fig_h = px.bar(df, x='average_score', y='department', orientation='h',
               title='Interactive Horizontal Bar Chart (Plotly)',
               color='department', text='average_score')
fig_h.update_traces(textposition='outside')
fig_h.show()


In [None]:
import pandas as pd
import numpy as np
import re

# ---------- Example Raw Data ----------
raw = pd.DataFrame({
    "order_id": [101, 101, 102, 103, 104, 105],
    "order_date": ["2025-10-01", "10/01/2025", "10/02/2025", "2025/10/03", "Oct 04, 2025", None],
    "category": ["  Mobile ", "mobile", "Phones", "Accessories", "ACCESSORIES ", " tablets "],
    "unit_price": ["$1,299.99", "$1,299.99", "$899", "  $29.99", "$19,999.99", "$250"],
    "qty": [1, 1, None, 2, 1, 1],
})

print("Raw:\n", raw, "\n")

# ---------- Cleaning ----------
df = raw.copy()

# Dates → datetime. The column mixes several formats, so each value is parsed
# on its own: parsing the whole column in one call makes pandas 2.x infer a
# single format from the first value and coerce every differently formatted
# date to NaT. The deprecated infer_datetime_format flag is no longer passed.
df["order_date"] = pd.to_datetime(
    df["order_date"].apply(lambda value: pd.to_datetime(value, errors="coerce"))
)
# Back-fill gaps from the next valid date; a missing date in the last row has
# nothing after it and therefore stays NaT.
df["order_date"] = df["order_date"].bfill()

# Categories → stripped, lowercased, standardized
df["category"] = df["category"].str.strip().str.lower()
cat_map = {"phones": "mobile", "tablets": "tablet", "accessories": "accessories", "mobile": "mobile"}
df["category"] = df["category"].map(lambda c: cat_map.get(c, c))  # unmapped labels pass through

# Prices → numeric (remove currency and commas)
df["unit_price"] = (
    df["unit_price"].astype(str)
    .str.replace(r"[^0-9.\-]", "", regex=True)
    .replace("", np.nan)
    .astype(float)
)

# qty → fill missing with 1
df["qty"] = df["qty"].fillna(1).astype(int)

# Duplicate rows (same order_id, category, unit_price, qty, date) → drop
df = df.drop_duplicates(subset=["order_id", "order_date", "category", "unit_price", "qty"])

# Derived total
df["line_total"] = df["unit_price"] * df["qty"]

# Simple outlier guard on price using IQR
q1, q3 = df["unit_price"].quantile([0.25, 0.75])
iqr = q3 - q1
upper = q3 + 1.5 * iqr
# Rows with a NaN price also fail this comparison and are dropped with the outliers.
df = df[df["unit_price"] <= upper].copy()  # remove extreme outlier row

print("Cleaned dtypes:\n", df.dtypes, "\n")
print("Cleaned:\n", df, "\n")

# (Optional) Save
# df.to_csv("retail_transactions_clean.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
import re

# ---------- Example Raw Data ----------
raw = pd.DataFrame({
    "respondent_id": [1, 2, 3, 4],
    "consent": ["Yes", "y", "NO", "true"],
    "age": [" 29 ", "N/A", "35", None],
    "country": ["U.S.", "United States", "usa", "Canada"],
    "q1_satisfaction": [5, 4, np.nan, 2],     # 1..5
    "q2_rev": [1, 2, 5, 3],                   # reverse-coded 1..5
    "tools": ["pandas; numpy ; SQL", "Python;sql", "", "NumPy;   Pandas;  seaborn "],
    "notes": ["  great product!!  ", "too $$$  ", "fine 👍", None]
})

print("Raw:\n", raw, "\n")

# ---------- Cleaning ----------
# Work on a copy so `raw` stays available for comparison
df = raw.copy()

# consent → boolean: True only for the accepted affirmative spellings
true_set = {"yes", "y", "true", "1"}
consent_text = df["consent"].astype(str).str.strip().str.lower()
df["consent"] = consent_text.isin(true_set)

# age → numeric; unparseable entries become NaN and are then replaced by the median
age_numeric = pd.to_numeric(df["age"], errors="coerce")
df["age"] = age_numeric.fillna(age_numeric.median())

# country → standardized
def norm_country(x):
    """Map free-text country spellings onto a canonical name.

    Dots are removed, whitespace is trimmed and collapsed, and the text is
    lowercased before matching. Known US spellings return "United States",
    known Canadian ones return "Canada", and anything else is returned
    title-cased. Missing values (None / NaN / NA) are returned unchanged
    instead of being turned into the string "None" or "Nan".
    """
    if not isinstance(x, str) and pd.api.types.is_scalar(x) and pd.isna(x):
        return x
    # Dots are stripped first, so "U.S." and "u.s" both reduce to "us".
    key = " ".join(str(x).lower().replace(".", "").split())
    if key in {"us", "u s", "usa", "united states"}:
        return "United States"
    if key in {"canada", "ca"}:
        return "Canada"
    return key.title()

# Standardize every country value
df["country"] = df["country"].map(norm_country)

# Reverse-code the 1..5 Likert item so that 1 ↔ 5, 2 ↔ 4, ...
df["q2_rev_rc"] = df["q2_rev"].rsub(6)

# Fill gaps in each Likert column with that column's own median
for col in ("q1_satisfaction", "q2_rev_rc"):
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)

# notes → empty string for missing, trimmed; notes_clean drops every non-ASCII character
trimmed_notes = df["notes"].fillna("").str.strip()
df["notes"] = trimmed_notes
df["notes_clean"] = trimmed_notes.str.encode("ascii", "ignore").str.decode("ascii")

# tools (multi-select, ";"-delimited) → lowercase, whitespace collapsed, no spaces around ";"
tools_clean = df["tools"].fillna("").str.lower()
tools_clean = tools_clean.str.replace(r"\s+", " ", regex=True)
tools_clean = tools_clean.str.replace(" ;", ";", regex=False)
tools_clean = tools_clean.str.replace("; ", ";", regex=False)

def _split_tools(cell):
    """Split one cleaned tools string into its non-empty, stripped entries."""
    stripped = (piece.strip() for piece in cell.split(";"))
    return [piece for piece in stripped if piece]

df["_tools_list"] = tools_clean.apply(_split_tools)

# One boolean column per distinct tool, in alphabetical order
unique_tools = sorted(set().union(*df["_tools_list"]))
for t in unique_tools:
    column_name = f"tool__{t.replace(' ', '_')}"
    df[column_name] = df["_tools_list"].apply(lambda lst: t in lst)

# Overall score (example composite of the two Likert items)
df["overall_score"] = df["q1_satisfaction"] + df["q2_rev_rc"]

# The per-row tool lists were only needed to build the one-hot columns
df = df.drop(columns=["_tools_list"])

print("Cleaned dtypes:\n", df.dtypes, "\n")
print("Cleaned:\n", df, "\n")

# (Optional) Save
# df.to_csv("survey_clean.csv", index=False)


In [None]:
import requests
import pandas as pd

def test_api_to_dataframe(url="https://jsonplaceholder.typicode.com/posts", timeout=10):
    """
    Fetch JSON from an HTTP endpoint, validate the response, and convert it
    into a pandas DataFrame, printing a checkpoint after every step.

    Parameters
    ----------
    url : str
        Endpoint to query; defaults to the public JSONPlaceholder posts API.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pd.DataFrame | None
        The parsed data, or None when any step failed (the reason is printed
        rather than raised, so a notebook run is not interrupted).
    """
    print(f"Testing API endpoint: {url}")

    try:
        # Step 1: Send request
        print("Sending request...")
        response = requests.get(url, timeout=timeout)
        print(f"Response status code: {response.status_code}")
        response.raise_for_status()

        # Step 2: Validate content type
        content_type = response.headers.get("Content-Type", "")
        print(f"Content-Type: {content_type}")
        if "application/json" not in content_type:
            raise ValueError(f"Unexpected content type: {content_type}")

        # Step 3: Parse JSON
        print("Parsing JSON response...")
        data = response.json()
        print(f"JSON parsed successfully. Type: {type(data)}")

        # A bare JSON string/number/null cannot be turned into rows
        if not isinstance(data, (list, dict)):
            raise TypeError("API response is not a valid JSON structure")

        # Step 4: Normalize for DataFrame conversion
        if isinstance(data, dict):
            data = [data]  # wrap single object in a list

        # Step 5: Convert to DataFrame
        print("Converting to pandas DataFrame...")
        df = pd.DataFrame(data)
        print("Conversion successful.")
        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # Step 6: Preview data
        print("\nData preview:")
        print(df.head())

        print("\nTest completed successfully.")
        return df

    except requests.exceptions.Timeout:
        print("Request timed out.")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}")
    except ValueError as e:
        # Also reached for JSON decoding failures
        print(f"Value error: {e}")
    except TypeError as e:
        print(f"Type error: {e}")
    except requests.exceptions.RequestException as e:
        # Remaining transport problems (connection refused, DNS failure, ...).
        # Placed after ValueError so decoding errors keep their message.
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    # Every failure path ends here
    return None

# Run the test
df = test_api_to_dataframe()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_line(
    df: pd.DataFrame,
    x: str,
    y: list | str,
    title: str | None = None,
    xlabel: str | None = None,
    ylabel: str | None = None,
    rolling: int | None = None,
    save_path: str | None = None,
    figsize=(10, 5),
):
    """
    Plot a line chart from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Source data. It is never modified; any conversion happens on a copy.
    x : str
        Column name for x-axis. A text (object/string) column is converted to
        datetime only when every non-missing value parses as a date. Numeric
        and other columns are plotted as they are.
    y : list | str
        One or more column names for y-series.
    title : str | None
        Chart title; defaults to "Line Chart".
    xlabel : str | None
        X-axis label; defaults to the name of the x column.
    ylabel : str | None
        Y-axis label; defaults to the column name for a single series and to
        "Values" for several.
    rolling : int | None
        Window size for optional rolling mean (applied to each y series).
    save_path : str | None
        If provided, saves the figure (e.g., 'figure.png').
    figsize : tuple
        Figure size passed to matplotlib.
    """

    # Ensure y is a list
    y_cols = [y] if isinstance(y, str) else list(y)

    # Parse date-like text on the x-axis. Coercing every non-datetime column
    # would turn numeric x values into epoch timestamps and plain labels into
    # NaT, so the parsed result is adopted only when nothing is lost.
    x_values = df[x]
    looks_textual = (
        pd.api.types.is_object_dtype(x_values)
        or pd.api.types.is_string_dtype(x_values)
    )
    if looks_textual:
        try:
            parsed = pd.to_datetime(x_values, errors="coerce")
        except (ValueError, TypeError):
            parsed = None
        if (
            parsed is not None
            and parsed.notna().any()
            and parsed.notna().sum() == x_values.notna().sum()
        ):
            df = df.copy()
            df[x] = parsed

    # Sort by x for nicer lines (sort_values returns a new frame)
    df = df.sort_values(x)

    plt.figure(figsize=figsize)

    # Plot each series
    for col in y_cols:
        series = df[col]
        if rolling and rolling > 1:
            # Half-window min_periods lets the smoothed line start early
            series = series.rolling(rolling, min_periods=max(1, rolling // 2)).mean()
        plt.plot(df[x], series, label=col)

    # Labels & grid
    plt.title(title or "Line Chart")
    plt.xlabel(xlabel or x)
    plt.ylabel(ylabel or (", ".join(y_cols) if len(y_cols) == 1 else "Values"))
    plt.grid(True, alpha=0.3)

    # Legend for multiple series
    if len(y_cols) > 1:
        plt.legend()

    # Improve date formatting if x is datetime
    if pd.api.types.is_datetime64_any_dtype(df[x]):
        plt.gcf().autofmt_xdate()

    # Save before show()
    if save_path:
        plt.savefig(save_path, bbox_inches="tight")

    plt.show()

# Example DataFrame: 30 consecutive days with two linearly growing series
dates = pd.date_range("2025-01-01", periods=30, freq="D")
day_number = pd.Series(range(30))
df_example = pd.DataFrame({
    "date": dates,
    "sales": (day_number + 100).astype(float),
    "visits": (day_number * 1.5 + 200).astype(float),
})

# Single series, smoothed with a 3-day rolling mean
plot_line(df_example, x="date", y="sales", title="Daily Sales", rolling=3)

# Multiple series on one chart, no smoothing
plot_line(df_example, x="date", y=["sales", "visits"], title="Sales vs Visits", rolling=None)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_histogram(
    df: pd.DataFrame,
    column: str,
    bins: int = 20,
    title: str | None = None,
    xlabel: str | None = None,
    ylabel: str = "Frequency",
    density: bool = False,
    figsize=(8, 5),
    save_path: str | None = None,
):
    """
    Draw a histogram of one numeric column and print its summary statistics.

    Parameters
    ----------
    df : DataFrame
        Source data.
    column : str
        Name of the column to plot; missing values are ignored.
    bins : int
        Number of histogram bins.
    title : str | None
        Chart title; defaults to "Distribution of <column>".
    xlabel : str | None
        X-axis label; defaults to the column name.
    ylabel : str
        Y-axis label, used only when density is False.
    density : bool
        If True, plots probability density instead of raw counts and labels
        the y-axis "Density".
    figsize : tuple
        Figure size passed to matplotlib.
    save_path : str | None
        If provided, the figure is saved there (e.g., 'histogram.png').
    """

    # Missing values are excluded from both the plot and the statistics
    values = df[column].dropna()

    plt.figure(figsize=figsize)
    hist_style = dict(bins=bins, edgecolor="black", alpha=0.7, density=density)
    plt.hist(values, **hist_style)

    y_text = "Density" if density else ylabel
    plt.title(title or f"Distribution of {column}")
    plt.xlabel(xlabel or column)
    plt.ylabel(y_text)
    plt.grid(axis="y", linestyle="--", alpha=0.6)

    # Console summary of the plotted values
    print(f"Column: {column}")
    print(f"Count: {len(values)} | Mean: {values.mean():.2f} | Std: {values.std():.2f} | Min: {values.min():.2f} | Max: {values.max():.2f}")

    # Save before show()
    if save_path:
        plt.savefig(save_path, bbox_inches="tight")

    plt.show()

np.random.seed(42)
# 500 normally distributed ages (mean 35, sd 10), limited to the range 0..80
ages = np.clip(np.random.normal(35, 10, 500), 0, 80)
df_example = pd.DataFrame({"age": ages})

# Plot histogram
plot_histogram(
    df_example,
    column="age",
    bins=15,
    title="Age Distribution",
    xlabel="Age (years)",
)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# --- 5 functions from eda_core.py ---

def summarize_dataframe(df):
    """Return one summary row per column of *df*, rounded to 2 decimals.

    Every row carries the column name, dtype, percentage of nulls and number
    of distinct non-null values; numeric columns additionally get min, max,
    mean and std.
    """
    def describe(name):
        series = df[name]
        info = {
            "column": name,
            "dtype": str(series.dtype),
            "null_%": series.isna().mean() * 100,
            "unique": series.nunique(dropna=True),
        }
        if pd.api.types.is_numeric_dtype(series):
            info["min"] = series.min()
            info["max"] = series.max()
            info["mean"] = series.mean()
            info["std"] = series.std()
        return info

    return pd.DataFrame([describe(name) for name in df.columns]).round(2)


def detect_outliers(df, z_thresh=3):
    """Count z-score outliers in every numeric column of *df*.

    A value is an outlier when its absolute z-score (population standard
    deviation, ddof=0) exceeds *z_thresh*. Missing values are never counted.

    Returns a DataFrame indexed by column name with:
      - "outlier_count": integer number of outliers
      - "outlier_%": share of all rows (including rows with missing values),
        rounded to 2 decimals

    Columns with zero or undefined spread, and frames without rows, report
    0 outliers instead of dividing by zero.
    """
    numeric = df.select_dtypes(include=np.number)
    n_rows = len(numeric)
    names = list(numeric.columns)

    rows = []
    for col in names:
        values = numeric[col]
        spread = values.std(ddof=0)
        if n_rows == 0 or not spread > 0:
            # Constant / all-missing column: z-scores are undefined.
            count = 0
        else:
            z = ((values - values.mean()) / spread).abs()
            count = int((z > z_thresh).sum())
        pct = round(100 * count / n_rows, 2) if n_rows else 0.0
        rows.append({"outlier_count": count, "outlier_%": pct})

    # Building row-wise keeps the counts integer; transposing a dict-of-dicts
    # frame would upcast them to float.
    return pd.DataFrame(rows, index=names, columns=["outlier_count", "outlier_%"])


def compare_before_after(df1, df2):
    """Print a before/after comparison of two DataFrames.

    Reports row and column counts, the columns added to and removed from
    *df2* relative to *df1*, and the per-column change in null counts
    (columns whose null count did not change are omitted).

    A column present in only one frame counts as having 0 nulls in the other,
    so an added column shows its own null count and a removed column shows
    the negative of the nulls it had, rather than NaN.
    """
    print(f"Rows before: {len(df1)} | after: {len(df2)}")
    print(f"Columns before: {len(df1.columns)} | after: {len(df2.columns)}")

    new_cols = set(df2.columns) - set(df1.columns)
    removed_cols = set(df1.columns) - set(df2.columns)
    print(f"Added columns: {new_cols}")
    print(f"Removed columns: {removed_cols}")

    nulls_before = df1.isna().sum()
    nulls_after = df2.isna().sum()
    # fill_value=0 keeps columns that exist on only one side from becoming NaN.
    null_diff = nulls_after.sub(nulls_before, fill_value=0).astype(int)
    print("\nChange in null counts:")
    print(null_diff[null_diff != 0])


def profile_categories(df, top_n=10):
    """Print the *top_n* most frequent values of every object-dtype column.

    For each such column a small table is printed with the count of each
    value (missing values included) and its percentage of all rows, rounded
    to 2 decimals.
    """
    total_rows = len(df)
    for name in df.select_dtypes(include="object").columns:
        print(f"\nColumn: {name}")
        counts = df[name].value_counts(dropna=False).head(top_n)
        share = (counts / total_rows * 100).round(2)
        table = pd.DataFrame({"count": counts, "percent": share})
        print(table)


def save_clean_data(df, path):
    """Write *df* to a timestamped CSV next to *path* and return the new path.

    The output name is "<path without a trailing .csv>_<YYYYmmdd_HHMMSS>.csv",
    using the local time at the moment of the call. Only a trailing ".csv"
    (any letter case) is removed, so a ".csv" elsewhere in the path, for
    example in a directory name, is left alone. The index is not written.
    """
    path = str(path)  # also accepts pathlib.Path
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = path[:-4] if path.lower().endswith(".csv") else path
    out_path = f"{stem}_{ts}.csv"
    df.to_csv(out_path, index=False)
    print(f"Saved cleaned file: {out_path}")
    return out_path

# --- summarize_dataframe: mixed numeric / boolean / text columns with gaps ---
df_summary = pd.DataFrame({
    "age": [25, 30, 40, np.nan, 50],
    "income": [50000, 52000, 51000, 200000, np.nan],
    "signed_up": [True, True, False, True, False],
    "city": ["NY", "LA", "NY", None, "SF"]
})

print("DataFrame:")
display(df_summary)

print("\nSummary:")
summary_table = summarize_dataframe(df_summary)
display(summary_table)

# --- detect_outliers: 99 values around 10 plus a single extreme value ---
np.random.seed(42)
x = np.append(np.random.normal(10, 1, 99), 1000)
y = np.random.normal(0, 1, 100)
df_out = pd.DataFrame({"x": x, "y": y})

print("Outlier Report:")
outlier_table = detect_outliers(df_out)
display(outlier_table)

# --- compare_before_after: de-duplicate, fill nulls, add a derived column ---
df_before = pd.DataFrame({
    "id": [1, 2, 2, 3],
    "score": [10, 20, 20, np.nan],
    "group": ["A", "A", "A", "B"]
})

df_after = df_before.drop_duplicates()
df_after = df_after.assign(score=df_after["score"].fillna(0))
df_after = df_after.assign(score2=df_after["score"] * 2)

compare_before_after(df_before, df_after)

# --- profile_categories: value frequencies of two text columns ---
df_cat = pd.DataFrame({
    "color": ["red", "red", "blue", "green", "red", "blue", None],
    "segment": ["pro", "basic", "basic", "pro", "pro", "pro", "basic"]
})

profile_categories(df_cat)

# --- save_clean_data: write df_after to a timestamped file ---
path = save_clean_data(df_after, "cleaned_data.csv")

# Check that the file was written and read it back
import os
print("File exists:", os.path.exists(path))
pd.read_csv(path).head()



In [14]:
import pandas as pd
import numpy as np
from scipy import stats

def analyze_numerical_stats(df: pd.DataFrame, normal_test: bool = True) -> pd.DataFrame:
    """
    Compute descriptive statistics for every numeric column of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    normal_test : bool
        If True, adds the Shapiro-Wilk statistic and p-value for columns with
        at least 3 non-missing values, computed on at most 5000 sampled values.

    Returns
    -------
    pd.DataFrame
        One row per numeric column that has at least one non-missing value,
        with all figures rounded to 4 decimals.
    """
    def describe(name):
        values = df[name].dropna()
        if values.empty:
            # Nothing to describe for an all-missing column
            return None

        lower_q = values.quantile(0.25)
        upper_q = values.quantile(0.75)
        record = {
            "column": name,
            "count": values.count(),
            "mean": values.mean(),
            "std": values.std(),
            "var": values.var(),
            "min": values.min(),
            "25%": lower_q,
            "50% (median)": values.median(),
            "75%": upper_q,
            "max": values.max(),
            "iqr": upper_q - lower_q,
            "skew": values.skew(),
            "kurtosis": values.kurt(),
        }

        if normal_test and len(values) >= 3:
            # Fixed random_state keeps the sampled subset reproducible
            subset = values.sample(min(len(values), 5000), random_state=0)
            stat, p = stats.shapiro(subset)
            record["shapiro_stat"] = stat
            record["shapiro_p"] = p
        return record

    numeric_cols = df.select_dtypes(include=np.number).columns
    records = [describe(name) for name in numeric_cols]
    kept = [record for record in records if record is not None]
    return pd.DataFrame(kept).round(4)

np.random.seed(0)
# Three differently shaped distributions: normal, log-normal and uniform.
# The columns are drawn in this order so the seeded values stay reproducible.
df_stats_demo = pd.DataFrame(dict(
    age=np.random.normal(35, 10, 1000),
    income=np.random.lognormal(mean=10, sigma=0.4, size=1000),
    score=np.random.uniform(50, 100, 1000),
))

print("Numeric Summary:")
numeric_summary = analyze_numerical_stats(df_stats_demo)
display(numeric_summary)


Numeric Summary:


Unnamed: 0,column,count,mean,std,var,min,25%,50% (median),75%,max,iqr,skew,kurtosis,shapiro_stat,shapiro_p
0,age,1000,34.5474,9.8753,97.521,4.5386,28.0158,34.4197,41.0695,62.5936,13.0537,0.0339,-0.041,0.9986,0.5912
1,income,1000,23881.5762,9663.4611,93382480.0,6648.5553,16957.4096,22257.2891,28287.9742,78306.8145,11330.5646,1.2319,2.3161,0.9249,0.0
2,score,1000,75.0226,14.403,207.4471,50.0869,63.0415,74.39,87.9329,99.9982,24.8914,0.0258,-1.2161,0.9536,0.0
