# Import / Config

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
from pathlib import Path
from dotenv import load_dotenv

from IPython.display import Markdown, display

import edurel.utils.dbcon as dbcu
import edurel.utils.db as dbu
import edurel.utils.duckdb as ddbu
import edurel.utils.llm as llmu
import edurel.utils.llmchat as llmc
import edurel.widgets.mermaid_viz as mmw
import edurel.widgets.chatman as cmw
import edurel.utils.misc as mu

# Load variables from a local .env file into the process environment.
load_dotenv()

# Project root as configured in the environment, and the database folder
# underneath it.
# NOTE(review): when BASE_DIR is not set the lookup returns None and DB_DIR
# becomes the literal string "None/databases" -- confirm BASE_DIR is always
# provided before relying on DB_DIR.
BASE_DIR = os.environ.get("BASE_DIR")
DB_DIR = "{}/databases".format(BASE_DIR)


  from pydantic.v1.fields import FieldInfo as FieldInfoV1


# Database

In [4]:
# Open the connection to the Foodmart sample database.
con = dbcu.foodmart()

# Foreign keys of the Sales table that are handed to DbHandler explicitly.
# Each entry is a pipe-separated triple; it presumably reads
# "<referenced table>|<column>|<column>" -- TODO confirm the exact field order
# against DbHandler.
# NOTE(review): the Product entry spells the column "ProductID" while the other
# entries use the "...Id" form; verify whether DbHandler compares names
# case-sensitively.
additional_fks = dict(
    Sales=[
        "Customer|CustomerId|CustomerId",
        "Product|ProductID|ProductID",
        "Store|StoreId|StoreId",
        "TimeByDay|TimeId|TimeId",
    ]
)

db = dbu.DbHandler(con, additional_fks=additional_fks)

# Print the schema as YAML. The list argument appears to name attributes to
# leave out (the printed output carries no nullable/fkname entries) -- confirm
# against schema_yaml_print.
db.schema_yaml_print(["nullable", "fkname"])

tables:
- tablename: Customer
  columns:
  - columnname: Customerid
    type: INTEGER
  - columnname: Lastname
    type: VARCHAR
  - columnname: Firstname
    type: VARCHAR
  - columnname: City
    type: VARCHAR
  - columnname: State
    type: VARCHAR
  - columnname: Country
    type: VARCHAR
  - columnname: Maritalstatus
    type: VARCHAR
  - columnname: Yearlyincome
    type: VARCHAR
  - columnname: Gender
    type: VARCHAR
  - columnname: Education
    type: VARCHAR
  primary_key:
  - Customerid
- tablename: Product
  columns:
  - columnname: Productid
    type: INTEGER
  - columnname: Pname
    type: VARCHAR
  - columnname: Brand
    type: VARCHAR
  - columnname: Subcategory
    type: VARCHAR
  - columnname: Category
    type: VARCHAR
  - columnname: Department
    type: VARCHAR
  - columnname: Family
    type: VARCHAR
  primary_key:
  - Productid
- tablename: Sales
  columns:
  - columnname: Productid
    type: INTEGER
  - columnname: Timeid
    type: INTEGER
  - columnname: Custo

## Year Range

In [6]:
# List the distinct TYear values found in timebyday, in ascending order.
# GROUP BY is used purely for de-duplication here: no aggregate is selected.
sql = """
select TYear
from timebyday
group by TYear
order by TYear;
"""
db.sql_print(sql)

┌───────┐
│ TYear │
│ int32 │
├───────┤
│  2014 │
│  2015 │
└───────┘



# Subqueries

## 01

Find all customers who have made purchases with total store sales greater than the average store sales across all transactions.


In [7]:
# Non-correlated scalar subquery: the inner SELECT produces a single value, the
# average Storesales over every row of Sales. The WHERE clause compares each
# individual Sales row with that value, so a customer is listed as soon as one
# of their rows exceeds it; DISTINCT collapses customers matched by several
# rows into one line.
sql = """
SELECT DISTINCT c.Customerid, c.Firstname, c.Lastname
FROM Customer c
JOIN Sales s ON c.Customerid = s.Customerid
WHERE s.Storesales > (SELECT AVG(Storesales) FROM Sales)
ORDER BY c.Lastname, c.Firstname;
"""
db.sql_print(sql)

┌────────────┬───────────┬────────────┐
│ Customerid │ Firstname │  Lastname  │
│   int32    │  varchar  │  varchar   │
├────────────┼───────────┼────────────┤
│       2724 │ Catherine │ Abahamdeh  │
│       3872 │ John      │ Abalos     │
│       6979 │ Glen      │ Abbassi    │
│       4881 │ Carroll   │ Abbate     │
│       9163 │ Rusty     │ Abbey      │
│       8288 │ Bonnie    │ Abbott     │
│        278 │ Eric      │ Abbott     │
│       8862 │ Patrick   │ Abbott     │
│       6049 │ Sherry    │ Abbott     │
│       1972 │ Muffy     │ Abbruzzese │
│         ·  │   ·       │    ·       │
│         ·  │   ·       │    ·       │
│         ·  │   ·       │    ·       │
│       4490 │ Roland    │ Zumsteg    │
│       5613 │ Joyce     │ Zuniga     │
│       8630 │ Shirley   │ Zuniga     │
│       2049 │ Elizabeth │ Zuvich     │
│       5721 │ Harriett  │ Zvibleman  │
│       6632 │ Ramona    │ Zvibleman  │
│       1469 │ Todd      │ Zweifel    │
│       3855 │ Julieanna │ Zwier      │


## 02

List all products that have never been sold.

In [8]:
# Anti-join: keep the products for which no Sales row exists.
# NOT EXISTS is used rather than NOT IN because `x NOT IN (subquery)` can never
# be TRUE once the subquery returns a NULL; a single Sales row with a NULL
# Productid would therefore make the NOT IN form return no products at all.
# The correlated NOT EXISTS form is unaffected by NULLs and needs no DISTINCT.
sql = """
SELECT p.Productid, p.Pname, p.Brand, p.Category
FROM Product p
WHERE NOT EXISTS (
    SELECT 1
    FROM Sales s
    WHERE s.Productid = p.Productid
)
ORDER BY p.Category, p.Pname;
"""
db.sql_print(sql)

┌───────────┬─────────────────┬─────────┬──────────────────┐
│ Productid │      Pname      │  Brand  │     Category     │
│   int32   │     varchar     │ varchar │     varchar      │
├───────────┼─────────────────┼─────────┼──────────────────┤
│      1560 │ CDR Grape Jelly │ CDR     │ Jams and Jellies │
└───────────┴─────────────────┴─────────┴──────────────────┘



## 03

Using a CTE, find the top 3 countries by total store sales and then list all stores in those countries.

In [10]:
# The TopCountries CTE sums Storesales per store country, sorts the countries
# by that sum in descending order and keeps the first three. The outer query
# then lists every Store row whose Country appears in that set.
# Note: the alias `s` refers to Sales inside the CTE but to Store in the outer
# query.
sql = """
WITH TopCountries AS (
    SELECT st.Country, SUM(s.Storesales) as TotalSales
    FROM Store st
    JOIN Sales s ON st.Storeid = s.Storeid
    GROUP BY st.Country
    ORDER BY TotalSales DESC
    LIMIT 3
)
SELECT s.Storeid, s.Sname, s.City, s.State, s.Country
FROM Store s
WHERE s.Country IN (SELECT Country FROM TopCountries)
ORDER BY s.Country, s.Sname;
"""
db.sql_print(sql)

┌─────────┬──────────┬───────────────┬───────────┬─────────┐
│ Storeid │  Sname   │     City      │   State   │ Country │
│  int32  │ varchar  │    varchar    │  varchar  │ varchar │
├─────────┼──────────┼───────────────┼───────────┼─────────┤
│      19 │ Store 19 │ Vancouver     │ BC        │ Canada  │
│      20 │ Store 20 │ Victoria      │ BC        │ Canada  │
│       1 │ Store 1  │ Acapulco      │ Guerrero  │ Mexico  │
│      10 │ Store 10 │ Orizaba       │ Veracruz  │ Mexico  │
│      12 │ Store 12 │ Hidalgo       │ Zacatecas │ Mexico  │
│      18 │ Store 18 │ Hidalgo       │ Zacatecas │ Mexico  │
│      21 │ Store 21 │ San Andres    │ DF        │ Mexico  │
│       4 │ Store 4  │ Camacho       │ Zacatecas │ Mexico  │
│       5 │ Store 5  │ Guadalajara   │ Jalisco   │ Mexico  │
│       8 │ Store 8  │ Merida        │ Yucatan   │ Mexico  │
│       · │    ·     │   ·           │ ·         │  ·      │
│       · │    ·     │   ·           │ ·         │  ·      │
│       · │    ·     │  

## 04

Using a CTE, calculate the total sales per product category and then find all products in categories where total sales exceed 100000.

In [11]:
# CategorySales holds one row per product category with its summed Storesales.
# Joining it back to Product repeats the category total on every product row of
# that category (visible in the output), and the WHERE clause keeps only the
# categories whose total is above 100000.
# Because the CTE uses an inner join to Sales, a category without any Sales row
# gets no total and its products are not listed.
sql = """
WITH CategorySales AS (
    SELECT p.Category, SUM(s.Storesales) as TotalSales
    FROM Product p
    JOIN Sales s ON p.Productid = s.Productid
    GROUP BY p.Category
)
SELECT p.Productid, p.Pname, p.Category, cs.TotalSales
FROM Product p
JOIN CategorySales cs ON p.Category = cs.Category
WHERE cs.TotalSales > 100000
ORDER BY cs.TotalSales DESC, p.Pname;
"""
db.sql_print(sql)

┌───────────┬────────────────────────────────┬────────────┬───────────────┐
│ Productid │             Pname              │  Category  │  TotalSales   │
│   int32   │            varchar             │  varchar   │ decimal(38,2) │
├───────────┼────────────────────────────────┼────────────┼───────────────┤
│       327 │ Better Canned Beets            │ Vegetables │     206070.44 │
│       339 │ Better Canned Peas             │ Vegetables │     206070.44 │
│       329 │ Better Canned String Beans     │ Vegetables │     206070.44 │
│       333 │ Better Canned Tomatos          │ Vegetables │     206070.44 │
│       331 │ Better Canned Yams             │ Vegetables │     206070.44 │
│       328 │ Better Creamed Corn            │ Vegetables │     206070.44 │
│       400 │ Big Time Fajita French Fries   │ Vegetables │     206070.44 │
│       417 │ Big Time Frozen Broccoli       │ Vegetables │     206070.44 │
│       406 │ Big Time Frozen Carrots        │ Vegetables │     206070.44 │
│       416 

## 05

Find customers whose total purchases (summed over all years in the data) are greater than the average total purchases of the customers in their state.

In [12]:
# Two chained CTEs:
#   CustomerTotals - Storesales summed per customer over every Sales row
#                    (there is no year filter, so all years are combined);
#   StateAverages  - the average of those per-customer totals for each State.
# Customers without any Sales row never enter CustomerTotals, so they do not
# contribute to their state's average.
# NOTE(review): states are matched on State alone, not on State plus Country.
sql = """
WITH CustomerTotals AS (
    SELECT c.Customerid, c.State, SUM(s.Storesales) as TotalPurchases
    FROM Customer c
    JOIN Sales s ON c.Customerid = s.Customerid
    GROUP BY c.Customerid, c.State
),
StateAverages AS (
    SELECT State, AVG(TotalPurchases) as AvgPurchases
    FROM CustomerTotals
    GROUP BY State
)
SELECT c.Customerid, c.Firstname, c.Lastname, c.State, ct.TotalPurchases, sa.AvgPurchases
FROM Customer c
JOIN CustomerTotals ct ON c.Customerid = ct.Customerid
JOIN StateAverages sa ON c.State = sa.State
WHERE ct.TotalPurchases > sa.AvgPurchases
ORDER BY c.State, ct.TotalPurchases DESC;
"""
db.sql_print(sql)

┌────────────┬───────────┬───────────┬───────────┬────────────────┬───────────────────┐
│ Customerid │ Firstname │ Lastname  │   State   │ TotalPurchases │   AvgPurchases    │
│   int32    │  varchar  │  varchar  │  varchar  │ decimal(38,2)  │      double       │
├────────────┼───────────┼───────────┼───────────┼────────────────┼───────────────────┤
│       5465 │ Dorothy   │ Ahmari    │ BC        │         342.21 │ 73.77386004514673 │
│       1769 │ Joyce     │ Minsky    │ BC        │         291.13 │ 73.77386004514673 │
│       6821 │ Katherine │ Worlay    │ BC        │         282.91 │ 73.77386004514673 │
│         59 │ Elizabeth │ Moss      │ BC        │         277.51 │ 73.77386004514673 │
│        943 │ David     │ Chavez    │ BC        │         267.34 │ 73.77386004514673 │
│       5283 │ Jack      │ Moore     │ BC        │         254.28 │ 73.77386004514673 │
│       2730 │ Helen     │ Wright    │ BC        │         250.25 │ 73.77386004514673 │
│       8716 │ Gregory   │ Wood 

# Window Functions

## 01

Rank products within each category by their total unit sales, showing the top 5 products per category.

In [13]:
# Window functions are evaluated after GROUP BY, so RANK() here orders the
# per-(Category, Pname) unit-sales sums within each category.
# The rank is computed inside a CTE because a window function result cannot be
# filtered in the WHERE clause of the same SELECT.
# RANK() gives tied products the same rank, so a category may return more than
# five rows when ties occur at or above rank 5.
sql = """
WITH ProductSales AS (
    SELECT
        p.Category,
        p.Pname,
        SUM(s.Unitsales) as TotalUnits,
        RANK() OVER (PARTITION BY p.Category ORDER BY SUM(s.Unitsales) DESC) as CategoryRank
    FROM Product p
    JOIN Sales s ON p.Productid = s.Productid
    GROUP BY p.Category, p.Pname
)
SELECT Category, Pname, TotalUnits, CategoryRank
FROM ProductSales
WHERE CategoryRank <= 5
ORDER BY Category, CategoryRank;
"""
db.sql_print(sql)

┌───────────────────┬────────────────────────────────────────────┬────────────┬──────────────┐
│     Category      │                   Pname                    │ TotalUnits │ CategoryRank │
│      varchar      │                  varchar                   │   int128   │    int64     │
├───────────────────┼────────────────────────────────────────────┼────────────┼──────────────┤
│ Baking Goods      │ BBB Best White Sugar                       │        561 │            1 │
│ Baking Goods      │ BBB Best Sesame Oil                        │        549 │            2 │
│ Baking Goods      │ BBB Best Pepper                            │        547 │            3 │
│ Baking Goods      │ Super White Sugar                          │        546 │            4 │
│ Baking Goods      │ Super Salt                                 │        542 │            5 │
│ Bathroom Products │ Hilltop Mint Mouthwash                     │        622 │            1 │
│ Bathroom Products │ Hilltop Silky Smooth Hair Co

## 02

Calculate the running total of store sales for each store ordered by time, showing the cumulative sales over time.

In [14]:
# SUM(SUM(...)) OVER (...): the inner SUM is the monthly aggregate produced by
# GROUP BY; the outer SUM is a window function that accumulates those monthly
# values per store, from the first row of the partition up to the current row.
# The window is ordered by TMonthnumber instead of TMonth because TMonth holds
# month names (see the output), which would not sort chronologically.
sql = """
SELECT
    s.Storeid,
    st.Sname,
    t.TYear,
    t.TMonth,
    SUM(s.Storesales) as MonthlySales,
    SUM(SUM(s.Storesales)) OVER (
        PARTITION BY s.Storeid
        ORDER BY t.TYear, t.TMonthnumber
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as RunningTotal
FROM Sales s
JOIN Store st ON s.Storeid = st.Storeid
JOIN TimeByDay t ON s.Timeid = t.Timeid
GROUP BY s.Storeid, st.Sname, t.TYear, t.TMonth, t.TMonthnumber
ORDER BY s.Storeid, t.TYear, t.TMonthnumber;
"""
db.sql_print(sql)

┌─────────┬──────────┬───────┬───────────┬───────────────┬───────────────┐
│ Storeid │  Sname   │ TYear │  TMonth   │ MonthlySales  │ RunningTotal  │
│  int32  │ varchar  │ int32 │  varchar  │ decimal(38,2) │ decimal(38,2) │
├─────────┼──────────┼───────┼───────────┼───────────────┼───────────────┤
│       1 │ Store 1  │  2015 │ January   │       5668.27 │       5668.27 │
│       1 │ Store 1  │  2015 │ February  │       4212.25 │       9880.52 │
│       1 │ Store 1  │  2015 │ March     │       5182.62 │      15063.14 │
│       1 │ Store 1  │  2015 │ April     │       4604.30 │      19667.44 │
│       1 │ Store 1  │  2015 │ May       │       4312.83 │      23980.27 │
│       1 │ Store 1  │  2015 │ June      │       3384.40 │      27364.67 │
│       1 │ Store 1  │  2015 │ July      │       4396.12 │      31760.79 │
│       1 │ Store 1  │  2015 │ August    │       3686.99 │      35447.78 │
│       1 │ Store 1  │  2015 │ September │       4672.65 │      40120.43 │
│       1 │ Store 1  │  2

## 03

For each customer, show their purchase amount alongside the previous and next purchase amounts ordered by date.

In [15]:
# Show every sales row of a customer next to the amounts of the row before and
# after it in the customer's purchase sequence.
# A customer usually has several Sales rows on the same day, so ordering by
# year/month/day alone leaves the order among those rows arbitrary: LAG/LEAD
# could then pick different neighbours on each run, and the final ORDER BY
# could display the rows in a different order than the window used.
# Productid and Storeid are added as tie-breakers, and LAG, LEAD and the final
# ORDER BY all use the same ordering (the two window functions share it through
# the named window), so Previous/Next always match the adjacent displayed rows.
sql = """
SELECT
    c.Customerid,
    c.Firstname,
    c.Lastname,
    t.TYear,
    t.TMonth,
    t.TDayofmonth,
    s.Storesales,
    LAG(s.Storesales) OVER purchase_order as PreviousPurchase,
    LEAD(s.Storesales) OVER purchase_order as NextPurchase
FROM Customer c
JOIN Sales s ON c.Customerid = s.Customerid
JOIN TimeByDay t ON s.Timeid = t.Timeid
WINDOW purchase_order AS (
    PARTITION BY c.Customerid
    ORDER BY t.TYear, t.TMonthnumber, t.TDayofmonth, s.Productid, s.Storeid
)
ORDER BY c.Customerid, t.TYear, t.TMonthnumber, t.TDayofmonth, s.Productid, s.Storeid;
"""
db.sql_print(sql)

┌────────────┬───────────┬──────────┬───────┬──────────┬─────────────┬───────────────┬──────────────────┬───────────────┐
│ Customerid │ Firstname │ Lastname │ TYear │  TMonth  │ TDayofmonth │  Storesales   │ PreviousPurchase │ NextPurchase  │
│   int32    │  varchar  │ varchar  │ int32 │ varchar  │    int32    │ decimal(13,2) │  decimal(13,2)   │ decimal(13,2) │
├────────────┼───────────┼──────────┼───────┼──────────┼─────────────┼───────────────┼──────────────────┼───────────────┤
│          3 │ Jeanne    │ Derry    │  2014 │ April    │          27 │          4.46 │             NULL │          6.84 │
│          3 │ Jeanne    │ Derry    │  2014 │ April    │          27 │          6.84 │             4.46 │          6.81 │
│          3 │ Jeanne    │ Derry    │  2014 │ April    │          27 │          6.81 │             6.84 │          4.26 │
│          3 │ Jeanne    │ Derry    │  2014 │ April    │          27 │          4.26 │             6.81 │          2.79 │
│          3 │ Jeanne   

## 04

Calculate the percentage contribution of each product's sales to its department's total sales.

In [16]:
# The window has PARTITION BY but no ORDER BY, so SUM(SUM(s.Storesales)) OVER
# yields the same department-wide total on every product row of a department.
# Each product's share is its own summed Storesales divided by that total,
# scaled by 100.0 and rounded to two decimals.
sql = """
SELECT
    p.Department,
    p.Pname,
    SUM(s.Storesales) as ProductSales,
    SUM(SUM(s.Storesales)) OVER (PARTITION BY p.Department) as DepartmentSales,
    ROUND(100.0 * SUM(s.Storesales) / SUM(SUM(s.Storesales)) OVER (PARTITION BY p.Department), 2) as PercentageContribution
FROM Product p
JOIN Sales s ON p.Productid = s.Productid
GROUP BY p.Department, p.Pname
ORDER BY p.Department, ProductSales DESC;
"""
db.sql_print(sql)

┌─────────────────────┬────────────────────────────┬───────────────┬─────────────────┬────────────────────────┐
│     Department      │           Pname            │ ProductSales  │ DepartmentSales │ PercentageContribution │
│       varchar       │          varchar           │ decimal(38,2) │  decimal(38,2)  │         double         │
├─────────────────────┼────────────────────────────┼───────────────┼─────────────────┼────────────────────────┤
│ Alcoholic Beverages │ Portsmouth Light Beer      │       1989.75 │        41137.07 │                   4.84 │
│ Alcoholic Beverages │ Good White Zinfandel Wine  │       1951.09 │        41137.07 │                   4.74 │
│ Alcoholic Beverages │ Portsmouth Chardonnay Wine │       1824.76 │        41137.07 │                   4.44 │
│ Alcoholic Beverages │ Top Measure Light Wine     │       1710.24 │        41137.07 │                   4.16 │
│ Alcoholic Beverages │ Walrus Chardonnay          │       1702.40 │        41137.07 │                  

## 05

Assign quartiles to customers based on their total spending (summed store sales) within each education level, splitting each level into four equal-sized groups. The yearly income band is shown for reference only.

In [17]:
# CustomerSales sums Storesales per customer; NTILE(4) then splits the customers
# of each Education level into four equal-sized buckets by that total.
# The window orders ascending, so quartile 1 holds the lowest spenders.
# Yearlyincome is a VARCHAR income band and is only displayed, not ranked on.
# Customerid is added as a tie-breaker: without it, customers with the same
# TotalSpent at a bucket boundary could be assigned to either quartile from one
# run to the next. The final ORDER BY uses it too, so the listing is stable.
sql = """
WITH CustomerSales AS (
    SELECT
        c.Customerid,
        c.Firstname,
        c.Lastname,
        c.Education,
        c.Yearlyincome,
        SUM(s.Storesales) as TotalSpent
    FROM Customer c
    JOIN Sales s ON c.Customerid = s.Customerid
    GROUP BY c.Customerid, c.Firstname, c.Lastname, c.Education, c.Yearlyincome
)
SELECT
    Customerid,
    Firstname,
    Lastname,
    Education,
    Yearlyincome,
    TotalSpent,
    NTILE(4) OVER (PARTITION BY Education ORDER BY TotalSpent, Customerid) as SpendingQuartile
FROM CustomerSales
ORDER BY Education, SpendingQuartile, TotalSpent DESC, Customerid;
"""
db.sql_print(sql)

┌────────────┬───────────┬────────────┬─────────────────────┬───────────────┬───────────────┬──────────────────┐
│ Customerid │ Firstname │  Lastname  │      Education      │ Yearlyincome  │  TotalSpent   │ SpendingQuartile │
│   int32    │  varchar  │  varchar   │       varchar       │    varchar    │ decimal(38,2) │      int64       │
├────────────┼───────────┼────────────┼─────────────────────┼───────────────┼───────────────┼──────────────────┤
│       3425 │ Joseph    │ Fernandez  │ Bachelors Degree    │ $150K +       │         55.31 │                1 │
│       9786 │ Bruce     │ Berg       │ Bachelors Degree    │ $50K - $70K   │         55.11 │                1 │
│       6038 │ Erica     │ Wall       │ Bachelors Degree    │ $30K - $50K   │         55.07 │                1 │
│       1950 │ Kathaleen │ Cohen      │ Bachelors Degree    │ $50K - $70K   │         55.05 │                1 │
│       7555 │ Debbie    │ Smiglewski │ Bachelors Degree    │ $70K - $90K   │         55.01 │   