In [2]:
# exploration_orders.ipynb

import duckdb
import pandas as pd
import matplotlib.pyplot as plt

# Connect to existing DuckDB database
con = duckdb.connect("instacart.duckdb")

# Preview order structure
df_orders = con.execute("SELECT * FROM orders LIMIT 10").fetchdf()
df_orders
tables = con.execute("SHOW TABLES").fetchdf()["name"].tolist()
for table in tables:
    print(f"--- {table} ---")
    print(con.execute(f"DESCRIBE {table}").fetchdf())



--- aisles ---
  column_name column_type null   key default extra
0    aisle_id      BIGINT  YES  None    None  None
1       aisle     VARCHAR  YES  None    None  None
--- departments ---
     column_name column_type null   key default extra
0  department_id      BIGINT  YES  None    None  None
1     department     VARCHAR  YES  None    None  None
--- order_products_prior ---
         column_name column_type null   key default extra
0           order_id      BIGINT  YES  None    None  None
1         product_id      BIGINT  YES  None    None  None
2  add_to_cart_order      BIGINT  YES  None    None  None
3          reordered      BIGINT  YES  None    None  None
--- order_products_train ---
         column_name column_type null   key default extra
0           order_id      BIGINT  YES  None    None  None
1         product_id      BIGINT  YES  None    None  None
2  add_to_cart_order      BIGINT  YES  None    None  None
3          reordered      BIGINT  YES  None    None  None
--- orders -

In [4]:

# 1. Top Selling Products
df_user_orders2 = con.execute("""
    SELECT 
    p.product_name, COUNT(*) as purchases
    FROM products p
    JOIN order_products_train t ON t.product_id = p.product_id
    GROUP BY p.product_name
    ORDER BY 2 DESC
    LIMIT 10
;""").fetchdf()
print(df_user_orders2)




             product_name  purchases
0                  Banana      18726
1  Bag of Organic Bananas      15480
2    Organic Strawberries      10894
3    Organic Baby Spinach       9784
4             Large Lemon       8135
5         Organic Avocado       7409
6    Organic Hass Avocado       7293
7            Strawberries       6494
8                   Limes       6033
9     Organic Raspberries       5546


In [9]:
# 2. Most Popular Aisles
df_user_orders = con.execute("""
    SELECT a.aisle_id, a.aisle, COUNT(*) as count_times_aisle_used
    FROM aisles a
    JOIN products p on a.aisle_id = p.aisle_id
    JOIN order_products_train t on p.product_id = t.product_id
    GROUP BY a.aisle_id, a.aisle
    ORDER BY 3 DeSC
""").fetchdf()
print(df_user_orders)

     aisle_id                       aisle  count_times_aisle_used
0          83            fresh vegetables                  150609
1          24                fresh fruits                  150473
2         123  packaged vegetables fruits                   78493
3         120                      yogurt                   55240
4          21             packaged cheese                   41699
..        ...                         ...                     ...
129        10            kitchen supplies                     448
130       102         baby bath body care                     328
131        82            baby accessories                     306
132       113                frozen juice                     294
133       132                      beauty                     287

[134 rows x 3 columns]


In [15]:
# 3. Which Day of the Week Do People Order Most?
df_user_orders = con.execute("""
    SELECT 
        order_dow, 
        CASE WHEN order_dow = 0 THEN 'Sunday'
            WHEN order_dow = 1 THEN 'Monday'
            WHEN order_dow = 2 THEN 'Tuesday'
            WHEN order_dow = 3 THEN 'Wednesday'
            WHEN order_dow = 4 THEN 'Thursday'
            WHEN order_dow = 5 THEN 'Friday'
            WHEN order_dow = 6 THEN 'Saturday'
            END AS order_day_name,
        COUNT(*) AS total_orders
    FROM orders
    GROUP by order_dow
    ORDER BY total_orders DESC
""").fetchdf()
print(df_user_orders)

   order_dow order_day_name  total_orders
0          0         Sunday        600905
1          1         Monday        587478
2          2        Tuesday        467260
3          5         Friday        453368
4          6       Saturday        448761
5          3      Wednesday        436972
6          4       Thursday        426339


In [17]:
# 4. What Time of Day Do People Order?
df_user_orders = con.execute("""
    SELECT order_hour_of_day, COUNT(*) AS total_orders
    FROM orders
    GROUP by order_hour_of_day
    ORDER BY 2 DESC
""").fetchdf()
print(df_user_orders)

   order_hour_of_day  total_orders
0                 10        288418
1                 11        284728
2                 15        283639
3                 14        283042
4                 13        277999
5                 12        272841
6                 16        272553
7                 09        257812
8                 17        228795
9                 18        182912
10                08        178201
11                19        140569
12                20        104292
13                07         91868
14                21         78109
15                22         61468
16                23         40043
17                06         30529
18                00         22758
19                01         12398
20                05          9569
21                02          7539
22                04          5527
23                03          5474


In [23]:
# 5. Top Departments by Volume
df_user_orders = con.execute("""
    SELECT d.department, COUNT(*) as department_product_orders_count
    FROM order_products_train t
    JOIN products p ON p.product_id = t.product_id
    JOIN departments d ON p.department_id = d.department_id
    GROUP BY 1
    ORDER BY 2 DESC
""").fetchdf()
print(df_user_orders)



         department  department_product_orders_count
0           produce                           409087
1        dairy eggs                           217051
2            snacks                           118862
3         beverages                           114046
4            frozen                           100426
5            pantry                            81242
6            bakery                            48394
7      canned goods                            46799
8              deli                            44291
9   dry goods pasta                            38713
10        household                            35986
11     meat seafood                            30307
12        breakfast                            29500
13    personal care                            21570
14           babies                            14941
15    international                            11902
16          missing                             8251
17          alcohol                           