In [1]:
# SQL Learning Notebook with Synthetic Data
!uv pip install ipython-sql pandas jupysql sqlalchemy faker

import pandas as pd
import sqlite3
from faker import Faker
import random

fake = Faker()

# Generate users
def generate_users(n=100):
    users = []
    for i in range(n):
        users.append({
            'user_id': i + 1,
            'name': fake.name(),
            'email': fake.email(),
            'signup_date': fake.date_between(start_date='-2y', end_date='today')
        })
    return pd.DataFrame(users)

# Generate orders
def generate_orders(users_df, n=300):
    orders = []
    for i in range(n):
        user = users_df.sample(1).iloc[0]
        orders.append({
            'order_id': i + 1,
            'user_id': user['user_id'],
            'amount': round(random.uniform(10.0, 500.0), 2),
            'order_date': fake.date_between(start_date=user['signup_date'], end_date='today')
        })
    return pd.DataFrame(orders)

users_df = generate_users()
orders_df = generate_orders(users_df)

# Create SQLite In-Memory Database
conn = sqlite3.connect(':memory:')

# Load DataFrames into SQL Tables
users_df.to_sql('users', conn, index=False, if_exists='replace')
orders_df.to_sql('orders', conn, index=False, if_exists='replace')

# Run SQL Queries Using Pandas
# Total spend per user
query = """
    SELECT u.name, u.email, COUNT(o.order_id) AS num_orders, 
           SUM(o.amount) AS total_spent
    FROM users u
    JOIN orders o ON u.user_id = o.user_id
    GROUP BY u.user_id
    ORDER BY total_spent DESC
    LIMIT 10
"""

results = pd.read_sql_query(query, conn)
print("Top 10 users by spend:")
print(results)

[2mUsing Python 3.9.6 environment at: flexenv[0m
[2mAudited [1m5 packages[0m [2min 17ms[0m[0m
Top 10 users by spend:
              name                         email  num_orders  total_spent
0     Sherry Avila  brittanyrichards@example.com           5      1889.03
1     Ian Santiago      timothycohen@example.net           4      1781.52
2  Jessica Johnson        brownkeith@example.org           6      1777.83
3   David Lawrence        smithnancy@example.net           5      1627.71
4  Rebecca Johnson         ycalderon@example.org           5      1601.80
5  Kristina Wright        margaret16@example.com           5      1451.33
6   James Mitchell          anntyler@example.com           5      1442.69
7   Steven Hoffman     matthewscasey@example.org           7      1429.70
8      Amanda Webb       rileykendra@example.org           5      1425.89
9      Cindy Lopez          hnichols@example.net           4      1350.63
Top 10 users by spend:
              name                    

In [4]:
# Define the SQL query as a string
query = """
    SELECT * 
    FROM users
    LIMIT 15;
"""

# Execute the query and store the result in a pandas DataFrame
all_users = pd.read_sql_query(query, conn)

# Display the first 5 rows of the result
print("All columns from the users table:")
all_users

All columns from the users table:


Unnamed: 0,user_id,name,email,signup_date
0,1,Cynthia Young,shane79@example.net,2024-04-28
1,2,Meghan Singh,nharding@example.org,2024-08-09
2,3,Arthur Norman,morenocharles@example.org,2025-04-13
3,4,Kyle Taylor,tmartin@example.org,2023-07-26
4,5,Angela Keller,stephanie51@example.net,2024-06-15
5,6,Grace Fisher,trujilloashley@example.org,2024-02-24
6,7,Natalie Fuller,chamberscynthia@example.net,2025-03-08
7,8,Nicole Sanders,jpatterson@example.net,2023-11-10
8,9,Barbara Delacruz,ahamilton@example.com,2025-03-22
9,10,Wanda Smith,charlesjackson@example.com,2023-10-15


In [6]:
query = """
    SELECT 
        order_id, 
        amount 
    FROM 
        orders
    LIMIT 15;
"""

order_amounts = pd.read_sql_query(query, conn)
print("Specific columns from the orders table:")
order_amounts

Specific columns from the orders table:


Unnamed: 0,order_id,amount
0,1,415.53
1,2,201.46
2,3,220.15
3,4,376.96
4,5,64.32
5,6,300.07
6,7,270.92
7,8,60.34
8,9,351.38
9,10,337.79


In [5]:
query = """
    SELECT *
    FROM orders
    WHERE amount > 450;
"""

high_value_orders = pd.read_sql_query(query, conn)
print("High-value orders (> $450):")
high_value_orders

High-value orders (> $450):


Unnamed: 0,order_id,user_id,amount,order_date
0,14,34,499.75,2025-01-27
1,24,47,479.19,2025-04-15
2,30,66,455.78,2025-05-09
3,34,13,479.66,2024-10-08
4,53,9,463.23,2025-07-10
5,74,14,498.95,2023-08-20
6,77,76,465.83,2024-08-26
7,79,95,498.52,2025-04-10
8,122,96,450.01,2024-12-08
9,129,30,462.3,2024-12-24


In [7]:
query = """
    SELECT name, signup_date
    FROM users
    ORDER BY signup_date DESC
    LIMIT 5;
"""

newest_users = pd.read_sql_query(query, conn)
print("The 5 newest users:")
newest_users

The 5 newest users:


Unnamed: 0,name,signup_date
0,James Cooper,2025-07-13
1,Michelle Hernandez DVM,2025-07-05
2,Barbara Jones,2025-06-16
3,Douglas Peterson,2025-06-12
4,Steven Jordan,2025-06-05


In [8]:
query = """
    SELECT order_id, user_id, amount
    FROM orders
    ORDER BY amount DESC
    LIMIT 10;
"""

largest_orders = pd.read_sql_query(query, conn)
print("The 10 largest orders:")
largest_orders

The 10 largest orders:


Unnamed: 0,order_id,user_id,amount
0,14,34,499.75
1,254,77,499.45
2,74,14,498.95
3,79,95,498.52
4,205,26,497.94
5,188,48,497.4
6,221,65,494.33
7,213,65,493.91
8,297,67,493.19
9,286,25,489.74


In [9]:
query = """
    SELECT 
        user_id,
        COUNT(order_id) AS number_of_orders
    FROM 
        orders
    GROUP BY 
        user_id
    ORDER BY
        number_of_orders DESC
    LIMIT 10;
"""

user_order_counts = pd.read_sql_query(query, conn)
print("Top 10 users by number of orders:")
user_order_counts

Top 10 users by number of orders:


Unnamed: 0,user_id,number_of_orders
0,93,7
1,90,6
2,81,6
3,33,6
4,89,5
5,87,5
6,77,5
7,75,5
8,74,5
9,73,5


In [10]:
query = """
    SELECT
        user_id,
        SUM(amount) AS total_spent
    FROM
        orders
    GROUP BY
        user_id
    ORDER BY
        total_spent DESC
    LIMIT 10;
"""

user_total_spent = pd.read_sql_query(query, conn)
print("Top 10 users by total amount spent:")
user_total_spent

Top 10 users by total amount spent:


Unnamed: 0,user_id,total_spent
0,48,1889.03
1,65,1781.52
2,90,1777.83
3,77,1627.71
4,74,1601.8
5,51,1451.33
6,35,1442.69
7,93,1429.7
8,41,1425.89
9,30,1350.63


In [11]:
query = """
    -- Select columns from both tables
    SELECT 
        u.name, 
        u.email, 
        COUNT(o.order_id) AS num_orders, -- Aggregate from orders table
        SUM(o.amount) AS total_spent     -- Aggregate from orders table
    
    -- Start with the users table (aliased as u)
    FROM 
        users u
    
    -- JOIN it to the orders table (aliased as o)
    -- on the key that links them: user_id
    JOIN 
        orders o ON u.user_id = o.user_id
        
    -- Group the joined results by user to perform the aggregations
    GROUP BY 
        u.user_id
    
    -- Order the final results
    ORDER BY 
        total_spent DESC
        
    -- And get the top 10
    LIMIT 10;
"""

final_report = pd.read_sql_query(query, conn)
print("Final Report: Top 10 users by spend with names and order counts:")
final_report

Final Report: Top 10 users by spend with names and order counts:


Unnamed: 0,name,email,num_orders,total_spent
0,Sherry Avila,brittanyrichards@example.com,5,1889.03
1,Ian Santiago,timothycohen@example.net,4,1781.52
2,Jessica Johnson,brownkeith@example.org,6,1777.83
3,David Lawrence,smithnancy@example.net,5,1627.71
4,Rebecca Johnson,ycalderon@example.org,5,1601.8
5,Kristina Wright,margaret16@example.com,5,1451.33
6,James Mitchell,anntyler@example.com,5,1442.69
7,Steven Hoffman,matthewscasey@example.org,7,1429.7
8,Amanda Webb,rileykendra@example.org,5,1425.89
9,Cindy Lopez,hnichols@example.net,4,1350.63


In [12]:
query = """
    SELECT
        u.user_id,
        u.name,
        COUNT(o.order_id) AS num_orders
    FROM
        users u
    JOIN
        orders o ON u.user_id = o.user_id
    GROUP BY
        u.user_id
    HAVING
        COUNT(o.order_id) >= 3 -- We can't use WHERE here!
    ORDER BY
        num_orders DESC;
"""

frequent_shoppers = pd.read_sql_query(query, conn)
print("Frequent Shoppers (3+ orders):")
frequent_shoppers

Frequent Shoppers (3+ orders):


Unnamed: 0,user_id,name,num_orders
0,93,Steven Hoffman,7
1,90,Jessica Johnson,6
2,81,Daniel Preston,6
3,33,Laura Carpenter,6
4,89,Mr. Joshua Bonilla,5
...,...,...,...
56,14,Raven Johnston,3
57,13,Daniel Morris,3
58,11,Donna Wilson,3
59,9,Barbara Delacruz,3


In [13]:
query = """
    -- Start by defining a Common Table Expression (CTE) named 'user_spend'
    WITH user_spend AS (
        SELECT
            u.user_id,
            SUM(o.amount) AS total_spent
        FROM
            users u
        JOIN
            orders o ON u.user_id = o.user_id
        GROUP BY
            u.user_id
    )
    -- Now, select from our temporary CTE as if it were a real table
    SELECT
        AVG(total_spent) AS average_spend_per_user
    FROM
        user_spend;
"""

avg_spend_report = pd.read_sql_query(query, conn)
print("The average total spend across all users:")
avg_spend_report

The average total spend across all users:


Unnamed: 0,average_spend_per_user
0,772.987449


In [14]:
query = """
    -- We can nest our logic inside a CTE for readability
    WITH user_spend AS (
        SELECT
            u.user_id,
            u.name,
            SUM(o.amount) AS total_spent
        FROM
            users u
        JOIN
            orders o ON u.user_id = o.user_id
        GROUP BY
            u.user_id
    )
    -- Now, use a CASE statement on the result of our CTE
    SELECT
        name,
        total_spent,
        CASE
            WHEN total_spent > 1000 THEN 'High Value'
            WHEN total_spent > 300 AND total_spent <= 1000 THEN 'Medium Value'
            ELSE 'Low Value'
        END AS customer_segment -- The result of our CASE logic
    FROM
        user_spend
    ORDER BY
        total_spent DESC
    LIMIT 35;
"""

customer_segments = pd.read_sql_query(query, conn)
print("Customer Segmentation Report:")
customer_segments

Customer Segmentation Report:


Unnamed: 0,name,total_spent,customer_segment
0,Sherry Avila,1889.03,High Value
1,Ian Santiago,1781.52,High Value
2,Jessica Johnson,1777.83,High Value
3,David Lawrence,1627.71,High Value
4,Rebecca Johnson,1601.8,High Value
5,Kristina Wright,1451.33,High Value
6,James Mitchell,1442.69,High Value
7,Steven Hoffman,1429.7,High Value
8,Amanda Webb,1425.89,High Value
9,Cindy Lopez,1350.63,High Value
