# stuff 

Notebook verifying DuckDB setup, loading CSV data, and running summary analytics.

In [None]:
import duckdb
import pandas as pd

# Check DuckDB installation
print('DuckDB version:', duckdb.__version__)

# Connect to in-memory database
con = duckdb.connect(database=':memory:')

DuckDB version: 1.4.1


In [2]:
# Load CSV into DuckDB table
con.execute("""
CREATE TABLE sales AS 
SELECT * FROM read_csv_auto("data/sales.csv", header=True);
""")

# Preview table
con.execute("SELECT * FROM sales LIMIT 5;").df()

Unnamed: 0,transaction_id,date,customer_id,gender,age,product_category,quantity,price,total_amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [4]:
# ✅ Total revenue, orders, and unique customers
summary = con.execute("""
SELECT 
  SUM(total_amount) AS total_revenue,
  COUNT(DISTINCT transaction_id) AS total_orders,
  COUNT(DISTINCT customer_id) AS unique_customers
FROM sales;
""").df()
summary

Unnamed: 0,total_revenue,total_orders,unique_customers
0,456000.0,1000,1000


In [4]:
# 📅 Monthly and Daily revenue
monthly = con.execute("""
SELECT strftime(date, '%Y-%m') AS month, SUM(total_amount) AS revenue
FROM sales
GROUP BY month
ORDER BY month
LIMIT 6;
""").df()

daily = con.execute("""
SELECT date, SUM(total_amount) AS revenue
FROM sales
GROUP BY date
ORDER BY date
LIMIT 6;
""").df()

monthly, daily

(     month  revenue
 0  2023-01  35450.0
 1  2023-02  44060.0
 2  2023-03  28990.0
 3  2023-04  33870.0
 4  2023-05  53150.0
 5  2023-06  36715.0,
         date  revenue
 0 2023-01-01   3600.0
 1 2023-01-02   1765.0
 2 2023-01-03    600.0
 3 2023-01-04   1240.0
 4 2023-01-05   1100.0
 5 2023-01-06    620.0)

In [6]:
# 🏷️ Revenue by product category
category_rev = con.execute("""
SELECT product_category, SUM(total_amount) AS revenue
FROM sales
GROUP BY product_category
ORDER BY revenue DESC;
""").df()
category_rev

Unnamed: 0,product_category,revenue
0,Electronics,156905.0
1,Clothing,155580.0
2,Beauty,143515.0


In [7]:
# 👥 Revenue by age group
age_group_rev = con.execute("""
SELECT 
  CASE 
    WHEN age < 25 THEN '<25'
    WHEN age BETWEEN 25 AND 34 THEN '25-34'
    WHEN age BETWEEN 35 AND 44 THEN '35-44'
    WHEN age BETWEEN 45 AND 54 THEN '45-54'
    ELSE '55+'
  END AS age_group,
  SUM(total_amount) AS revenue
FROM sales
GROUP BY age_group
ORDER BY age_group;
""").df()
age_group_rev

Unnamed: 0,age_group,revenue
0,25-34,97090.0
1,35-44,96835.0
2,45-54,97235.0
3,55+,90190.0
4,<25,74650.0


In [None]:
# 💰 Average Order Value (AOV) by month
aov_month = con.execute("""
SELECT strftime(date, '%Y-%m') AS month,
       SUM(total_amount)/COUNT(DISTINCT transaction_id) AS avg_order_value
FROM sales
GROUP BY month
ORDER BY month
LIMIT 6;
""").df()
aov_month

Unnamed: 0,month,avg_order_value
0,2023-01,466.447368
1,2023-02,518.352941
2,2023-03,397.123288
3,2023-04,393.837209
4,2023-05,506.190476
5,2023-06,476.818182
6,2023-07,492.569444
7,2023-08,393.191489
8,2023-09,363.384615
9,2023-10,485.208333
