In [21]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

engine = create_engine('sqlite://', echo=False)

files = {
    "olist_customers": "olist_customers_dataset.csv",
    "olist_sellers": "olist_sellers_dataset.csv",
    "olist_order_reviews": "olist_order_reviews_dataset.csv",
    "olist_order_items": "olist_order_items_dataset.csv",
    "olist_products_dataset": "olist_products_dataset.csv",
    "olist_geolocation": "olist_geolocation_dataset.csv",
    "product_category_name_translation": "product_category_name_translation.csv",
    "olist_orders": "olist_orders_dataset.csv",
    "olist_order_payments": "olist_order_payments_dataset.csv"
}

for table_name, file_name in files.items():
    try:
        df = pd.read_csv(file_name, on_bad_lines='skip', engine='python')
        df.to_sql(table_name, con=engine, if_exists='replace', index=False)
        print(f"✅ {table_name} loaded")
    except Exception as e:
        print(f"❌ Error {table_name}: {e}")

✅ olist_customers loaded
✅ olist_sellers loaded
✅ olist_order_reviews loaded
✅ olist_order_items loaded
✅ olist_products_dataset loaded
✅ olist_geolocation loaded
✅ product_category_name_translation loaded
✅ olist_orders loaded
✅ olist_order_payments loaded


In [22]:
sql='''

Select * from olist_customers
limit 5


''';


df_sql = pd.read_sql_query(sql,con=engine)
df_sql.index.name = 'index'
df_sql.head()

Unnamed: 0_level_0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [23]:
q1 = """
SELECT
    ROUND(
        COUNT(CASE WHEN r.review_score = 5 THEN 1 END) * 100.0 / COUNT(*),
        2
    ) AS percentage
FROM olist_orders AS o
JOIN olist_order_reviews AS r ON o.order_id = r.order_id
WHERE o.order_purchase_timestamp LIKE '2018-01%';
"""
df_q1 = pd.read_sql_query(q1, engine)
df_q1.index.name = 'index'
df_q1

Unnamed: 0_level_0,percentage
index,Unnamed: 1_level_1
0,56.5


In [24]:
q2 = """
WITH yearly_stats AS (
    SELECT
        STRFTIME('%Y', o.order_purchase_timestamp) AS year,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(DISTINCT c.customer_unique_id) AS unique_customers
    FROM olist_orders o
    JOIN olist_customers c ON c.customer_id = o.customer_id
    GROUP BY 1
)
SELECT
    year,
    total_orders,
    unique_customers,
    ROUND(100.0 * (total_orders - LAG(total_orders) OVER (ORDER BY year)) /
          LAG(total_orders) OVER (ORDER BY year), 2) AS orders_growth_pct,
    ROUND(100.0 * (unique_customers - LAG(unique_customers) OVER (ORDER BY year)) /
          LAG(unique_customers) OVER (ORDER BY year), 2) AS customers_growth_pct
FROM yearly_stats;
"""
df_q2 = pd.read_sql_query(q2, engine)
df_q2.index.name = 'index'
df_q2

Unnamed: 0_level_0,year,total_orders,unique_customers,orders_growth_pct,customers_growth_pct
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2016,75,75,,
1,2017,10705,10615,14173.33,14053.33
2,2018,12839,12742,19.93,20.04


In [25]:
q3 = """
SELECT
    c.customer_unique_id,
    STRFTIME('%Y-%m', o.order_purchase_timestamp) AS year_month,
    ROUND(SUM(p.payment_value) / COUNT(DISTINCT o.order_id), 2) AS average_order_value
FROM olist_customers c
JOIN olist_orders o ON o.customer_id = c.customer_id
JOIN olist_order_payments p ON p.order_id = o.order_id
GROUP BY 1, 2
ORDER BY year_month DESC, average_order_value DESC;
"""
df_q3 = pd.read_sql_query(q3, engine)
df_q3.index.name = 'index'
df_q3

Unnamed: 0_level_0,customer_unique_id,year_month,average_order_value
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,c1ee153508c6b785b491443a95ff364e,2018-09,414.04
1,09687a7b7431a93b5c53b50ba779bf94,2018-09,258.67
2,08642cd329066fe11ec63293f714f2f8,2018-09,191.46
3,ef0103e9602d12594d19c2b666219bc1,2018-09,84.58
4,230a1e9f42924d41f37fd22a1d4a9707,2018-09,69.46
...,...,...,...
23530,8329519e31cb1b89bd44c3c6ae417ad6,2016-10,22.86
23531,3f4f614c632af7fc7508462a7cb55ac2,2016-10,18.62
23532,b7d76e111c89f7ebf14761390f0f7d17,2016-09,136.23
23533,4854e9b3feff728c13ee5fc7d1547e92,2016-09,75.06


In [26]:
q4 = """
SELECT
    c.customer_city,
    ROUND(SUM(p.payment_value), 2) AS revenue
FROM olist_order_payments p
JOIN olist_orders o ON o.order_id = p.order_id
JOIN olist_customers c ON c.customer_id = o.customer_id
WHERE o.order_purchase_timestamp >= '2016-01-01'
  AND o.order_purchase_timestamp <= '2018-12-31'
GROUP BY 1
ORDER BY revenue DESC
LIMIT 5;
"""
df_q4 = pd.read_sql_query(q4, engine)
df_q4.index.name = 'index'
df_q4

Unnamed: 0_level_0,customer_city,revenue
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,sao paulo,531981.93
1,rio de janeiro,297004.78
2,belo horizonte,97655.75
3,brasilia,76086.63
4,curitiba,61950.81


In [27]:
q5 = """
SELECT
    c.customer_state,
    ROUND(SUM(p.payment_value), 2) AS revenue
FROM olist_order_payments p
JOIN olist_orders o ON o.order_id = p.order_id
JOIN olist_customers c ON c.customer_id = o.customer_id
WHERE o.order_purchase_timestamp >= '2016-01-01'
  AND o.order_purchase_timestamp <= '2018-12-31'
GROUP BY 1
ORDER BY revenue DESC;
"""
df_q5 = pd.read_sql_query(q5, engine)
df_q5.index.name = 'index'
df_q5

Unnamed: 0_level_0,customer_state,revenue
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,SP,1457839.12
1,RJ,531221.23
2,MG,432560.88
3,RS,213335.1
4,PR,194428.36
5,SC,152734.95
6,BA,140761.48
7,GO,89312.64
8,DF,76086.63
9,PE,75616.28


In [28]:
q6 = """
SELECT
    oi.seller_id,
    COUNT(oi.order_id) AS total_goods_sold,
    ROUND(SUM(oi.price), 2) AS total_revenue,
    COUNT(DISTINCT o.customer_id) AS total_customers,
    SUM(CASE WHEN r.review_score = 5 THEN 1 ELSE 0 END) AS count_5_star_ratings,
    ROUND(AVG(r.review_score), 2) AS avg_rating
FROM olist_order_items oi
JOIN olist_orders o ON oi.order_id = o.order_id
JOIN olist_order_reviews r ON oi.order_id = r.order_id
GROUP BY oi.seller_id
HAVING count_5_star_ratings > 50
ORDER BY total_revenue DESC
LIMIT 10;
"""
df_q6 = pd.read_sql_query(q6, engine)
df_q6.index.name = 'index'
df_q6

Unnamed: 0_level_0,seller_id,total_goods_sold,total_revenue,total_customers,count_5_star_ratings,avg_rating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4869f7a5dfa277a7dca6462dcf3b52b2,107,21839.8,105,60,4.01
1,4a3ca9315b744ce9f8e9374361493884,190,19015.6,175,94,3.92
2,da8622b14eb17ae2831f4ac5b9dab84a,154,16624.78,131,88,4.06
3,7a67c85e85bb2ce8582c35f2203ad736,131,15261.5,131,88,4.33
4,1025f0e2d44d7041d6cf58b6550e0bfa,129,14093.28,88,62,3.77
5,6560211a19b47992c3666cc44a7e94c0,206,13597.97,188,107,3.93
6,955fee9216a65b617aa5c0531780ce60,138,10355.15,106,69,3.95
7,1f50f920176fa81dab994f9023523100,177,9812.34,128,117,4.21
8,cc419e0650a3c5ba77189a1882b7556a,158,9088.44,149,94,4.18
9,3d871de0142ce09b7081e2b9d1733cb1,98,8079.6,94,52,4.14


In [29]:
q7 = """
SELECT
    c.customer_state,
    COUNT(o.order_id) AS total_orders_in_state,
    SUM(CASE WHEN o.order_status = 'delivered' THEN 1 ELSE 0 END) AS delivered_in_state,
    ROUND(
        SUM(CASE WHEN o.order_status = 'delivered' THEN 1.0 ELSE 0.0 END) * 100.0 / COUNT(o.order_id),
        2
    ) AS delivery_success_rate
FROM olist_customers c
JOIN olist_orders o ON o.customer_id = c.customer_id
GROUP BY 1
ORDER BY delivery_success_rate DESC;
"""
df_q7 = pd.read_sql_query(q7, engine)
df_q7.index.name = 'index'
df_q7

Unnamed: 0_level_0,customer_state,total_orders_in_state,delivered_in_state,delivery_success_rate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AP,11,11,100.0
1,AC,25,25,100.0
2,RN,110,109,99.09
3,MT,190,188,98.95
4,SE,94,93,98.94
5,ES,455,450,98.9
6,PB,135,133,98.52
7,PR,1154,1136,98.44
8,PI,110,108,98.18
9,MS,162,159,98.15


In [30]:
q8 = """
SELECT
    t.product_category_name_english AS product_category_name,
    p.payment_type,
    COUNT(*) AS payments_for_products
FROM olist_order_payments p
JOIN olist_order_items i ON p.order_id = i.order_id
JOIN olist_products_dataset prod ON i.product_id = prod.product_id
JOIN product_category_name_translation t ON prod.product_category_name = t.product_category_name
GROUP BY 1, 2 ORDER BY payments_for_products DESC;
"""
df_q8 = pd.read_sql_query(q8, engine)
df_q8.index.name = 'index'
df_q8

Unnamed: 0_level_0,product_category_name,payment_type,payments_for_products
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,bed_bath_table,credit_card,4806
1,health_beauty,credit_card,4011
2,furniture_decor,credit_card,3528
3,sports_leisure,credit_card,3499
4,computers_accessories,credit_card,2874
...,...,...,...
246,furniture_bedroom,debit_card,1
247,home_appliances_2,debit_card,1
248,industry_commerce_and_business,voucher,1
249,la_cuisine,voucher,1


In [31]:
q9 = """
WITH city_coords AS (
    SELECT
        geolocation_city,
        AVG(geolocation_lat) AS lat,
        AVG(geolocation_lng) AS lng
    FROM olist_geolocation
    GROUP BY 1
)
SELECT
    o.order_id,
    c.customer_city,
    s.seller_city,
    ROUND(SQRT(
        (c_coord.lat - s_coord.lat)*(c_coord.lat - s_coord.lat) +
        (c_coord.lng - s_coord.lng)*(c_coord.lng - s_coord.lng)
    ) * 111, 2) AS distance_km
FROM olist_orders o
JOIN olist_customers c ON o.customer_id = c.customer_id
JOIN olist_order_items i ON o.order_id = i.order_id
JOIN olist_sellers s ON i.seller_id = s.seller_id
JOIN city_coords c_coord ON c.customer_city = c_coord.geolocation_city
JOIN city_coords s_coord ON s.seller_city = s_coord.geolocation_city
LIMIT 10;
"""
df_q9 = pd.read_sql_query(q9, engine)
df_q9.index.name = 'index'
df_q9

Unnamed: 0_level_0,order_id,customer_city,seller_city,distance_km
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,203096f03d82e0dffbc41ebc2e2bcfb7,sao paulo,sao paulo,0.0
1,2807d0e504d6d4894d41672727bc139f,sao paulo,sao paulo,0.0
2,25f4376934e13d3508486352e11a5db0,sao jose dos campos,sao paulo,94.89
3,68873cf91053cd11e6b49a766db5af1a,sao paulo,sao paulo,0.0
4,6d25592267349b322799e2beb687871e,sao paulo,sao paulo,0.0
5,6d25592267349b322799e2beb687871e,sao paulo,sao paulo,0.0
6,95cf9f239f724799131f7ca949209bd9,sao paulo,sao paulo,0.0
7,5561adcb0fd46da4cad3048fa4e7fc00,sao paulo,sao paulo,0.0
8,75351e48296ef42211a0b80c427aae57,sao paulo,sao paulo,0.0
9,37d6c8f1b209eab8d1b6522a1a3e4d88,sao paulo,sao paulo,0.0
