## 1. Setup + imports

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

# The project root is configurable so the notebook is not tied to one machine:
# set the WEEK2_PROJECT_ROOT environment variable to point at another checkout.
# When the variable is unset, the original local path is used unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "WEEK2_PROJECT_ROOT",
        r"C:\Users\janah\OneDrive\المستندات\SDAIA2\week2-data-work",
    )
)

# Processed analytics table that the loading cell reads with pd.read_parquet.
DATA = PROJECT_ROOT / "data" / "processed" / "analytics_table (1).parquet"

# Folder for exported figures; created eagerly (parents included) so it exists
# before any chart is written.
FIGS = PROJECT_ROOT / "reports" / "figures"
FIGS.mkdir(parents=True, exist_ok=True)

def save_fig(fig, path: Path, *, scale: int = 2) -> None:
    """Export ``fig`` as an image file at ``path``.

    The destination folder is created first (missing parents included), then
    the figure's ``write_image`` method is called with the path as a string.

    Args:
        fig: Object exposing ``write_image(file, scale=...)``; the notebook
            passes the figures it builds with ``plotly.express``.
        path: Target image file.
        scale: Forwarded unchanged as the ``scale`` keyword of ``write_image``.
    """
    out_dir = path.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    destination = str(path)
    fig.write_image(destination, scale=scale)

## 2. Load processed data

In [14]:
# Read the processed analytics table from the path configured in the setup cell.
df = pd.read_parquet(DATA)

# Report the frame's dimensions as a quick sanity check on the load.
n_rows, n_cols = df.shape
print("Rows:", n_rows, "Cols:", n_cols)

Rows: 5 Cols: 18


## 3. Quick audit

In [15]:
# Show the dtypes of the first 15 columns to check the schema after loading.
dtype_preview = df.dtypes.iloc[:15]
print(dtype_preview)

order_id               string[python]
user_id                string[python]
amount                        Float64
quantity                        Int64
created_at        datetime64[ns, UTC]
status                         object
status_clean           string[python]
amount__isna                     bool
quantity__isna                   bool
date                           object
year                          float64
month                  string[python]
dow                            object
hour                          float64
country                        object
dtype: object


In [None]:
# Count missing values per column, then keep the ten columns with the most gaps.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False).head(10)
print(missing)


date                 1
year                 1
amount_winsor        1
hour                 1
dow                  1
month                1
amount_is_outlier    1
created_at           1
quantity             1
amount               1
dtype: int64


## 4. Questions + results

In [5]:
# Per-country order count, total amount and mean amount.
# dropna=False keeps rows with a missing country in a group of their own.
by_country = df.groupby("country", dropna=False)
country_stats = by_country.agg(
    n=("order_id", "size"),
    revenue=("amount", "sum"),
    aov=("amount", "mean"),
)
rev = country_stats.reset_index().sort_values("revenue", ascending=False)

# Bar chart of total revenue per country, highest first (order comes from `rev`).
fig = (
    px.bar(rev, x="country", y="revenue", title="Revenue by country (all data)")
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Country")
    .update_yaxes(title_text="Revenue (sum of amount)")
)
save_fig(fig, FIGS / "revenue_by_country.png")
fig

In [6]:
# Order count and total amount per month; rows with a missing month keep
# their own group because of dropna=False.
monthly = df.groupby("month", dropna=False).agg(
    n=("order_id", "size"),
    revenue=("amount", "sum"),
)
# NOTE(review): `month` is a string column, so this sort is lexicographic;
# it is chronological only if values are zero-padded (e.g. YYYY-MM) - confirm.
trend = monthly.reset_index().sort_values("month")

# Line chart of monthly revenue.
fig = (
    px.line(trend, x="month", y="revenue", title="Revenue over time (monthly)")
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Month")
    .update_yaxes(title_text="Revenue")
)
save_fig(fig, FIGS / "revenue_trend_monthly.png")
fig

In [7]:
# Distribution of the winsorized order amount, 30 bins requested.
fig = (
    px.histogram(
        df,
        x="amount_winsor",
        nbins=30,
        title="Order amount distribution (winsorized)",
    )
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Amount (winsorized)")
    .update_yaxes(title_text="Number of orders")
)
save_fig(fig, FIGS / "amount_hist_winsor.png")
fig

## 5. Bootstrap comparison

In [9]:
def bootstrap_diff_means(
    a: pd.Series,
    b: pd.Series,
    *,
    n_boot: int = 2000,
    seed: int = 0
) -> dict:
    """Bootstrap a 95% percentile interval for ``mean(a) - mean(b)``.

    Both inputs are coerced to numeric (unparseable entries become missing),
    missing values are dropped, and each group is resampled with replacement
    at its own size ``n_boot`` times. The interval bounds are the 2.5% and
    97.5% quantiles of the resampled mean differences.

    Args:
        a: First group of observations.
        b: Second group of observations.
        n_boot: Number of bootstrap replicates; must be at least 1.
        seed: Seed for ``np.random.default_rng``, making the result reproducible.

    Returns:
        Dict with ``diff_mean`` (observed difference of means), ``ci_low`` and
        ``ci_high`` (interval bounds), all plain Python floats.

    Raises:
        AssertionError: If either group is empty after cleaning.
        ValueError: If ``n_boot`` is smaller than 1.
    """
    if n_boot < 1:
        # Without replicates there is nothing to take quantiles of.
        raise ValueError("n_boot must be at least 1")

    rng = np.random.default_rng(seed)

    # Force a float64 ndarray: nullable pandas dtypes (Float64/Int64) may
    # otherwise convert to object arrays depending on the pandas version.
    a = pd.to_numeric(a, errors="coerce").dropna().to_numpy(dtype=float)
    b = pd.to_numeric(b, errors="coerce").dropna().to_numpy(dtype=float)

    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; exception type and message are the same as before.
    if a.size == 0 or b.size == 0:
        raise AssertionError("Empty group after cleaning")

    diffs = np.empty(n_boot, dtype=float)
    for i in range(n_boot):
        # Draw order (a first, then b, per replicate) is kept so that a given
        # seed consumes the random stream exactly as before.
        sa = rng.choice(a, size=a.size, replace=True)
        sb = rng.choice(b, size=b.size, replace=True)
        diffs[i] = sa.mean() - sb.mean()

    ci_low, ci_high = np.quantile(diffs, [0.025, 0.975])

    return {
        "diff_mean": float(a.mean() - b.mean()),
        "ci_low": float(ci_low),
        "ci_high": float(ci_high),
    }


In [10]:
# Flag refunds with a nullable integer. `status_clean` is a pandas string
# column, so the comparison yields <NA> where the status is missing and a plain
# `astype(int)` would raise on those rows. Keeping <NA> lets
# `bootstrap_diff_means` drop unknown statuses instead of the cell crashing.
d = df.assign(is_refund=df["status_clean"].eq("refund").astype("Int64"))

# Refund indicators for the two countries being compared.
a = d.loc[d["country"].eq("SA"), "is_refund"]
b = d.loc[d["country"].eq("AE"), "is_refund"]

print("n_SA:", len(a), "n_AE:", len(b))

# Difference in refund rate (SA minus AE) with a bootstrap interval.
# NOTE: with very small groups the interval is not informative; resampling a
# single observation always returns that observation, so the bounds collapse
# onto the point estimate.
res = bootstrap_diff_means(a, b, n_boot=2000, seed=0)
print(res)


n_SA: 4 n_AE: 1
{'diff_mean': -1.0, 'ci_low': -1.0, 'ci_high': -1.0}
