In [2]:
import pandas as pd
# Read the six raw Olist CSV exports from the working directory.
# The file names are matched positionally to the names on the left-hand side.
customers, geolocation, orders, order_items, products, sellers = [
    pd.read_csv(csv_name)
    for csv_name in (
        "olist_customers_dataset.csv",
        "olist_geolocation_dataset.csv",
        "olist_orders_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_products_dataset.csv",
        "olist_sellers_dataset.csv",
    )
]

# Show each table's column index so the schemas can be checked before joining.
for heading, frame in (
    ("Customers dataset columns:\n", customers),
    ("Geolocation dataset columns:\n", geolocation),
    ("Orders dataset columns:\n", orders),
    ("Order Items dataset columns:\n", order_items),
    ("Products dataset columns:\n", products),
    ("Sellers dataset columns:\n", sellers),
):
    print(heading, frame.columns, "\n")

Customers dataset columns:
 Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object') 

Geolocation dataset columns:
 Index(['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng',
       'geolocation_city', 'geolocation_state'],
      dtype='object') 

Orders dataset columns:
 Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object') 

Order Items dataset columns:
 Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object') 

Products dataset columns:
 Index(['product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_

In [3]:
# Build the merged table: orders joined to their items, then enriched with
# customer, seller, product and zip-level coordinate columns, plus two
# lateness features. The result is written to olist_final_dataset.csv.

# Keep only the columns that the joins and features below actually use.
orders = orders[[
    "order_id", "customer_id", "order_purchase_timestamp",
    "order_delivered_customer_date", "order_estimated_delivery_date"
]]

order_items = order_items[[
    "order_id", "product_id", "seller_id", "freight_value"
]]

customers = customers[[
    "customer_id", "customer_zip_code_prefix", "customer_state"
]]

sellers = sellers[[
    "seller_id", "seller_zip_code_prefix", "seller_state"
]]

products = products[[
    "product_id", "product_category_name", "product_weight_g",
    "product_length_cm", "product_height_cm", "product_width_cm"
]]

# Collapse the geolocation table to one row per zip prefix (mean lat/lng) so
# the coordinate joins below cannot multiply rows.
geo = geolocation.groupby("geolocation_zip_code_prefix")[["geolocation_lat", "geolocation_lng"]].mean().reset_index()


def _attach_coordinates(frame, coords, zip_column, prefix):
    """Left-join per-zip coordinates onto ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table containing ``zip_column``.
    coords : pd.DataFrame
        Lookup with columns ``geolocation_zip_code_prefix``,
        ``geolocation_lat`` and ``geolocation_lng``.
    zip_column : str
        Column of ``frame`` holding the zip-code prefix to look up.
    prefix : str
        Prefix for the added columns (``<prefix>_lat`` / ``<prefix>_lng``).

    Returns
    -------
    pd.DataFrame
        ``frame`` with the two coordinate columns appended; rows whose zip
        prefix is absent from ``coords`` get NaN coordinates.
    """
    located = pd.merge(
        frame, coords,
        left_on=zip_column, right_on="geolocation_zip_code_prefix",
        how="left",
    )
    located = located.rename(columns={
        "geolocation_lat": f"{prefix}_lat",
        "geolocation_lng": f"{prefix}_lng",
    })
    # The lookup key duplicates zip_column, so it is dropped after the join.
    return located.drop(columns=["geolocation_zip_code_prefix"])


# Inner joins: rows without a matching order, customer or seller are dropped.
df = pd.merge(orders, order_items, on="order_id", how="inner")
df = pd.merge(df, customers, on="customer_id", how="inner")
df = pd.merge(df, sellers, on="seller_id", how="inner")

# Left join: rows are kept even when the product has no attribute record.
df = pd.merge(df, products, on="product_id", how="left")

df = _attach_coordinates(df, geo, "customer_zip_code_prefix", "customer")
df = _attach_coordinates(df, geo, "seller_zip_code_prefix", "seller")

# Parse each date column once and reuse it for both lateness features.
delivered_at = pd.to_datetime(df["order_delivered_customer_date"])
estimated_at = pd.to_datetime(df["order_estimated_delivery_date"])
has_both_dates = delivered_at.notna() & estimated_at.notna()

# A comparison against a missing date (NaT) evaluates to False, which would
# label orders with no delivery date as 0 ("on time"). Those rows are masked
# to <NA> instead, so the flag is missing exactly where delay_days is missing.
df["delivery_delay"] = (delivered_at > estimated_at).astype("Int64").where(has_both_dates)

# .dt.days is the timedelta floored to whole days: a delivery a few hours
# after the estimate gives 0, and early deliveries give negative values.
df["delay_days"] = (delivered_at - estimated_at).dt.days

df.to_csv("olist_final_dataset.csv", index=False)

print("Final dataset shape:", df.shape)
print("Sample rows:\n", df.head())

Final dataset shape: (112650, 23)
Sample rows:
                            order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_purchase_timestamp order_delivered_customer_date  \
0      2017-10-02 10:56:33           2017-10-10 21:25:13   
1      2018-07-24 20:41:37           2018-08-07 15:27:45   
2      2018-08-08 08:38:49           2018-08-17 18:06:29   
3      2017-11-18 19:28:06           2017-12-02 00:28:42   
4      2018-02-13 21:18:39           2018-02-16 18:17:02   

  order_estimated_delivery_date                        product_id  \
0           2017-10-18 00:00:00  87285b34884572647811a353c7ac498a   
1           20

In [7]:
# Display the column index of the merged frame `df`.
df.columns


Index(['order_id', 'customer_id', 'order_purchase_timestamp',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'product_id', 'seller_id', 'freight_value', 'customer_zip_code_prefix',
       'customer_state', 'seller_zip_code_prefix', 'seller_state',
       'product_category_name', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'customer_lat', 'customer_lng',
       'seller_lat', 'seller_lng', 'delivery_delay', 'delay_days'],
      dtype='object')

In [8]:

# Remove the identifier columns that were used as join keys.
ID_COLUMNS = ["order_id", "customer_id", "product_id", "seller_id"]

# errors="ignore" keeps this cell re-runnable: `df` is reassigned in place, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=ID_COLUMNS, errors="ignore")

print("Final dataset shape:", df.shape)
print("Remaining columns:\n", df.columns.tolist())


Final dataset shape: (112650, 19)
Remaining columns:
 ['order_purchase_timestamp', 'order_delivered_customer_date', 'order_estimated_delivery_date', 'freight_value', 'customer_zip_code_prefix', 'customer_state', 'seller_zip_code_prefix', 'seller_state', 'product_category_name', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'customer_lat', 'customer_lng', 'seller_lat', 'seller_lng', 'delivery_delay', 'delay_days']


In [9]:
# Write a smaller, reproducible extract of `df` to olist_final_sample.csv.
SAMPLE_SIZE = 10000  # maximum number of rows in the extract
SAMPLE_SEED = 42     # fixed seed so the same rows are drawn on every run

# Cap n at len(df): sampling without replacement raises ValueError when more
# rows are requested than the frame holds.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# sample for GitHub
df_sample.to_csv("olist_final_sample.csv", index=False)

print("Sample dataset shape:", df_sample.shape)


Sample dataset shape: (10000, 19)
