In [14]:
import pandas as pd
import sqlite3

customers = pd.read_csv("/kaggle/input/day20-datasets/day_20_customers.csv")
orders = pd.read_csv("/kaggle/input/day20-datasets/day_20_orders.csv")

conn = sqlite3.connect(":memory:")
customers.to_sql("customers", conn, index=False, if_exists="replace")
orders.to_sql("orders", conn, index=False, if_exists="replace")



18

In [15]:
pd.read_sql("SELECT * FROM orders", conn)

Unnamed: 0,order_id,customer_id,product,category,amount,order_date
0,2001,1,Mobile,Electronics,18000,2024-01-05
1,2002,1,Earphones,Electronics,1200,2024-02-10
2,2003,2,Shoes,Fashion,2500,2024-01-12
3,2004,2,Watch,Accessories,3200,2024-02-01
4,2005,3,Laptop,Electronics,55000,2024-03-03
5,2006,3,Mouse,Electronics,800,2024-03-15
6,2007,4,Book,Education,600,2024-02-20
7,2008,5,Tablet,Electronics,26000,2024-03-10
8,2009,6,Backpack,Fashion,1800,2024-01-22
9,2010,6,T-shirt,Fashion,900,2024-02-14


In [16]:
pd.read_sql("SELECT * FROM customers", conn)

Unnamed: 0,customer_id,name,city
0,1,Kavya,Delhi
1,2,Ritvik,Mumbai
2,3,Sneha,Bangalore
3,4,Aman,Pune
4,5,Tanya,Delhi
5,6,Varun,Mumbai
6,7,Mehul,Ahmedabad
7,8,Ishita,Bangalore


# ðŸ§  Problem 1 â€” High-Value Customers

Find customers whose total spending is above the average customer spending.

(Tests: GROUP BY + subquery reasoning)

In [17]:
pd.read_sql("""
SELECT customer_id, SUM(amount) AS total_spent
FROM orders
GROUP BY customer_id
HAVING SUM(amount) > (
    SELECT AVG(total_spent) FROM (
        SELECT SUM(amount) AS total_spent FROM orders
        GROUP BY customer_id
    )
);
""", conn)

Unnamed: 0,customer_id,total_spent
0,1,19200
1,3,57900
2,5,28800


# ðŸ§  Problem 2 â€” Best Category per City

For each city, find the category that generates the highest total revenue.

(Tests: JOIN + GROUP BY + window ranking)

In [18]:
pd.read_sql("""
SELECT city, category, total_revenue
FROM (
  SELECT c.city, o.category,
         SUM(o.amount) AS total_revenue,
         RANK() OVER (
           PARTITION BY c.city
           ORDER BY SUM(o.amount) DESC
         ) AS rnk
  FROM customers c
  JOIN orders o ON c.customer_id = o.customer_id
  GROUP BY c.city, o.category
)
WHERE rnk = 1;

""", conn)

Unnamed: 0,city,category,total_revenue
0,Ahmedabad,Home,10700
1,Bangalore,Electronics,57900
2,Delhi,Electronics,48000
3,Mumbai,Fashion,5200
4,Pune,Education,1000


# ðŸ§  Problem 3 â€” Revenue Contribution %

Show each category and its percentage contribution to total revenue.

(Tests: SUM + window function for overall total)

In [19]:
pd.read_sql("""
SELECT category, SUM(amount) AS category_revenue, ROUND(
    SUM(amount) * 100 / SUM(SUM(amount)) OVER (), 2
) AS revenue_percent
FROM ORDERS
GROUP BY category 
;
""", conn)

Unnamed: 0,category,category_revenue,revenue_percent
0,Accessories,4700,3.0
1,Beauty,2900,2.0
2,Education,1000,0.0
3,Electronics,105900,81.0
4,Fashion,5200,3.0
5,Home,10700,8.0


# ðŸ§  Problem 4 â€” Repeat vs One-Time Customers

Classify each customer as:

"Repeat" if they placed more than 1 order

"One-Time" otherwise

(Tests: GROUP BY + CASE)

In [20]:
pd.read_sql("""
SELECT customer_id, COUNT(*) AS order_count, CASE 
    WHEN COUNT(*) > 1 THEN 'Repeat'
    ELSE 'One-Time'
    END AS customer_type
FROM ORDERS
GROUP BY customer_id 
;
""", conn)

Unnamed: 0,customer_id,order_count,customer_type
0,1,2,Repeat
1,2,3,Repeat
2,3,3,Repeat
3,4,2,Repeat
4,5,2,Repeat
5,6,2,Repeat
6,7,2,Repeat
7,8,2,Repeat


# ðŸ§  Problem 5 â€” Monthly Revenue Growth

Show total revenue per month and the difference compared to previous month.

(Tests: GROUP BY month + window LAG)

In [21]:
pd.read_sql("""
SELECT month, total_revenue, total_revenue - LAG(total_revenue) OVER( ORDER BY month) as growth
FROM (
    SELECT SUBSTR(order_date, 1, 7) as month, SUM(amount) AS total_revenue
    FROM orders
    GROUP BY SUBSTR(order_date, 1, 7)
)
;
""", conn)

Unnamed: 0,month,total_revenue,growth
0,2024-01,22300,
1,2024-02,8100,-14200.0
2,2024-03,86400,78300.0
3,2024-04,13600,-72800.0


# ðŸ§  Problem 6 â€” Orders Above Customer Average

Find orders where the order amount is higher than that customerâ€™s average order value.

(Tests: correlated subquery or window AVG OVER PARTITION)

In [22]:
pd.read_sql("""
SELECT * FROM (
    SELECT *, AVG(amount) OVER(
        PARTITION BY customer_id
    ) AS avg_per_customer
    FROM orders
)
WHERE amount > avg_per_customer
;
""", conn)

Unnamed: 0,order_id,customer_id,product,category,amount,order_date,avg_per_customer
0,2001,1,Mobile,Electronics,18000,2024-01-05,9600.0
1,2003,2,Shoes,Fashion,2500,2024-01-12,2400.0
2,2004,2,Watch,Accessories,3200,2024-02-01,2400.0
3,2005,3,Laptop,Electronics,55000,2024-03-03,19300.0
4,2007,4,Book,Education,600,2024-02-20,500.0
5,2008,5,Tablet,Electronics,26000,2024-03-10,14400.0
6,2009,6,Backpack,Fashion,1800,2024-01-22,1350.0
7,2017,7,Vacuum Cleaner,Home,7200,2024-04-12,5350.0
8,2012,8,Skincare Kit,Beauty,2200,2024-02-25,1450.0


# ðŸ§  Problem 7 â€” Top 2 Customers per City

For each city, find the top 2 customers by total spending.

(Tests: GROUP BY + window RANK with PARTITION)

In [23]:
pd.read_sql("""
SELECT *
FROM (
  SELECT c.city, c.name,
         SUM(o.amount) AS total_spent,
         RANK() OVER (
           PARTITION BY c.city
           ORDER BY SUM(o.amount) DESC
         ) AS rnk
  FROM customers c
  JOIN orders o ON c.customer_id = o.customer_id
  GROUP BY c.city, c.name
)
WHERE rnk <= 2;

""", conn)

Unnamed: 0,city,name,total_spent,rnk
0,Ahmedabad,Mehul,10700,1
1,Bangalore,Sneha,57900,1
2,Bangalore,Ishita,2900,2
3,Delhi,Tanya,28800,1
4,Delhi,Kavya,19200,2
5,Mumbai,Ritvik,7200,1
6,Mumbai,Varun,2700,2
7,Pune,Aman,1000,1


# ðŸ§  Problem 8 â€” Category Performance Labeling

For each category, label:

"Strong" if revenue â‰¥ average category revenue

"Weak" otherwise

(Tests: conditional aggregation + subquery)

In [24]:
pd.read_sql("""
SELECT category, total_revenue,
       CASE
         WHEN total_revenue >= avg_revenue THEN 'Strong'
         ELSE 'Weak'
       END AS performance
FROM (
  SELECT category,
         SUM(amount) AS total_revenue,
         AVG(SUM(amount)) OVER () AS avg_revenue
  FROM orders
  GROUP BY category
);

""", conn)

Unnamed: 0,category,total_revenue,performance
0,Accessories,4700,Weak
1,Beauty,2900,Weak
2,Education,1000,Weak
3,Electronics,105900,Strong
4,Fashion,5200,Weak
5,Home,10700,Weak


# ðŸ§  Problem 9 â€” Customer Lifetime Value Running Total

For each customer, show each order and their running total spending over time.

(Tests: window SUM with ORDER BY)

In [25]:
pd.read_sql("""
SELECT customer_id, order_date, amount,
       SUM(amount) OVER (
         PARTITION BY customer_id
         ORDER BY order_date
       ) AS running_total
FROM orders;

""", conn)

Unnamed: 0,customer_id,order_date,amount,running_total
0,1,2024-01-05,18000,18000
1,1,2024-02-10,1200,19200
2,2,2024-01-12,2500,2500
3,2,2024-02-01,3200,5700
4,2,2024-04-08,1500,7200
5,3,2024-03-03,55000,55000
6,3,2024-03-15,800,55800
7,3,2024-04-20,2100,57900
8,4,2024-02-20,600,600
9,4,2024-03-18,400,1000


# ðŸ§  Problem 10 â€” Optimization Thinking

You run this query and itâ€™s slow on large data:

> SELECT *

> FROM orders

> WHERE customer_id = 5;

## Answer in words:


**a) Why might this query be slow?**
-> because without an index, the database would have to scan the entire table. 


**b) What would you do to improve performance?**
-> by creating an index


**c) What would EXPLAIN likely show before and after optimization?**
-> before index, it would show - scan table orders 
after index it would show - search table orders using index idx_customer
