# 01 — Executive EDA (theLook)

This notebook loads the Step 2 query outputs and summarizes:
- Top Categories (revenue + margin proxy)
- Fulfillment performance (delivery time)
- Revenue by Traffic Source

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' rendering limits so wide query outputs are shown in full.
for option_name, option_value in (("display.max_columns", 200), ("display.width", 140)):
    pd.set_option(option_name, option_value)

# Project root = the nearest directory, starting at the current working
# directory and walking upward, that contains an "analysis" folder.
# If no ancestor contains one, the search ends at the filesystem root.
_start_dir = Path.cwd()
_search_path = [_start_dir, *_start_dir.parents]
ROOT = next(
    (candidate for candidate in _search_path if (candidate / "analysis").exists()),
    _search_path[-1],
)

# Directory the CSV query outputs are read from; echoed as the cell output.
OUT_DIR = ROOT / "analysis" / "outputs"
OUT_DIR

# 02 — Load the three datasets

In [None]:
# Read the three CSV outputs from OUT_DIR, one DataFrame per file.
top_categories, fulfillment, traffic_source_revenue = (
    pd.read_csv(OUT_DIR / filename)
    for filename in (
        "top_categories.csv",
        "fulfillment.csv",
        "traffic_source_revenue.csv",
    )
)

# Report (rows, columns) for each frame as a quick load check.
for label, frame in (
    ("top_categories:", top_categories),
    ("fulfillment:", fulfillment),
    ("traffic_source_revenue:", traffic_source_revenue),
):
    print(label, frame.shape)

# 03 — Quick preview (sanity check)

In [None]:
# Render the first 10 rows of the two ranked tables and the whole
# fulfillment frame, in that order.
display(
    top_categories.head(10),
    fulfillment,
    traffic_source_revenue.head(10),
)


# 04 — Data types + missing values (fast EDA)

In [None]:
def profile_df(df, name):
    """Print a header for ``name`` and display three profile tables for ``df``.

    The tables are shown in this order:
      1. one row ("dtype") with the dtype of every column,
      2. one row ("missing_count") with the number of missing values per column,
      3. ``df.describe(include="all")`` transposed, i.e. one row per column.

    Args:
        df: DataFrame to profile.
        name: Label printed in the ``=== ... ===`` header line.

    Returns:
        None. Output goes to stdout and the notebook's rich display.
    """
    print(f"=== {name} ===")

    # Wide single-row layout: one column of the table per column of df.
    dtype_row = df.dtypes.rename("dtype").to_frame().T
    display(dtype_row)

    missing_row = df.isna().sum().rename("missing_count").to_frame().T
    display(missing_row)

    # include="all" summarizes non-numeric columns alongside numeric ones.
    summary_by_column = df.describe(include="all").T
    display(summary_by_column)

# Profile the two multi-row tables.
# NOTE(review): fulfillment is not profiled here — confirm that is intentional.
for frame_name, frame in (
    ("top_categories", top_categories),
    ("traffic_source_revenue", traffic_source_revenue),
):
    profile_df(frame, frame_name)


# 05 — Chart 1: Top categories by revenue

In [None]:
# Ascending sort: barh draws rows bottom-to-top, so the largest revenue
# ends up at the top of the chart.
categories_by_revenue = top_categories.sort_values("revenue", ascending=True)

fig, ax = plt.subplots()
ax.barh(categories_by_revenue["category"], categories_by_revenue["revenue"])
ax.set_title("Top Categories by Revenue (Last 12 Months)")
ax.set_xlabel("Revenue")
ax.set_ylabel("Category")
fig.tight_layout()
plt.show()


# 06 — Chart 2: Margin rate proxy by category (shows efficiency)

In [None]:
# Ascending sort: barh draws rows bottom-to-top, so the highest margin
# rate proxy ends up at the top of the chart.
categories_by_margin = top_categories.sort_values("margin_rate_proxy", ascending=True)

fig, ax = plt.subplots()
ax.barh(categories_by_margin["category"], categories_by_margin["margin_rate_proxy"])
ax.set_title("Margin Rate Proxy by Category (Last 12 Months)")
ax.set_xlabel("Margin Rate Proxy")
ax.set_ylabel("Category")
fig.tight_layout()
plt.show()


# 07 — Chart 3: Revenue by traffic source

In [None]:
# Maximum number of traffic sources to plot. Defined once so the row
# cut-off and the chart title cannot drift apart.
TOP_N_SOURCES = 12

# Keep the highest-revenue sources, then re-sort ascending: barh draws rows
# bottom-to-top, so the largest revenue ends up at the top of the chart.
top_sources = (
    traffic_source_revenue
    .sort_values("revenue", ascending=False)
    .head(TOP_N_SOURCES)
    .sort_values("revenue", ascending=True)
)

fig, ax = plt.subplots()
ax.barh(top_sources["user_traffic_source"], top_sources["revenue"])
# The title reports the number of sources actually plotted, which is smaller
# than TOP_N_SOURCES when the input has fewer rows.
ax.set_title(f"Revenue by Traffic Source (Top {len(top_sources)})")
ax.set_xlabel("Revenue")
ax.set_ylabel("Traffic Source")
fig.tight_layout()
plt.show()


# 08 — Insights (auto-generated bullets from the data)

In [None]:
# --- Top categories insights ---
# Rank by revenue, highest first. Resetting the index makes the labels
# returned by idxmax/idxmin below refer to rows of this ranked frame.
ranked_categories = (
    top_categories
    .sort_values("revenue", ascending=False)
    .reset_index(drop=True)
)
category_revenue = ranked_categories["revenue"]
lead_category = ranked_categories.iloc[0]
# Share is relative to the categories present in the file, not all revenue.
top3_category_share = category_revenue.head(3).sum() / category_revenue.sum()

margin = ranked_categories["margin_rate_proxy"]
best_margin_category = ranked_categories.loc[margin.idxmax(), "category"]
worst_margin_category = ranked_categories.loc[margin.idxmin(), "category"]

# --- Traffic source insights ---
ranked_sources = (
    traffic_source_revenue
    .sort_values("revenue", ascending=False)
    .reset_index(drop=True)
)
source_revenue = ranked_sources["revenue"]
lead_source = ranked_sources.iloc[0]
top3_source_share = source_revenue.head(3).sum() / source_revenue.sum()

# --- Fulfillment insights ---
# Only the first row of the fulfillment frame is used.
delivery_stats = fulfillment.iloc[0].to_dict()
p50_days = float(delivery_stats["p50_days_to_deliver"])
p90_days = float(delivery_stats["p90_days_to_deliver"])
avg_days = float(delivery_stats["avg_days_to_deliver"])

# NOTE(review): the wording below says "Top 15"; that assumes top_categories
# holds exactly 15 rows — confirm against the query that produced it.
insights = [
    (
        f"Category concentration: Top category is '{lead_category['category']}' "
        f"with revenue {lead_category['revenue']:,.2f}. "
        f"Top 3 categories represent {top3_category_share:.1%} "
        f"of revenue within the Top 15 categories list."
    ),
    (
        f"Profit efficiency: Highest margin_rate_proxy in Top 15 is "
        f"'{best_margin_category}' at {margin.max():.2%}. "
        f"Lowest is '{worst_margin_category}' at {margin.min():.2%}."
    ),
    (
        f"Acquisition impact: Top traffic source is '{lead_source['user_traffic_source']}' "
        f"with revenue {lead_source['revenue']:,.2f} and AOV {lead_source['aov']:,.2f}. "
        f"Top 3 sources represent {top3_source_share:.1%} of total revenue."
    ),
    (
        f"Fulfillment performance: median (p50) delivery time ≈ {p50_days:.2f} days, "
        f"p90 ≈ {p90_days:.2f} days, average ≈ {avg_days:.2f} days."
    ),
]

# Emit the bullets as a numbered list, one per line.
print("\n".join(f"{number}. {text}" for number, text in enumerate(insights, start=1)))


# 09 — Next questions

In [None]:
# Follow-up questions raised by this EDA, printed as a numbered list.
next_questions = [
    "Which products (not just categories) drive the top revenue and how does their margin proxy compare?",
    "Do any traffic sources have high revenue but low AOV (or vice versa), suggesting different acquisition quality?",
    "Is slower fulfillment correlated with higher returns? (requires returns model / delivered vs returned analysis)",
    "How does revenue and margin evolve over time (monthly trend) by category and traffic source?",
]

print("\n".join(f"{number}. {question}" for number, question in enumerate(next_questions, start=1)))
