In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium

# Render matplotlib figures directly in the notebook's cell output
%matplotlib inline

# Apply seaborn's "whitegrid" style so later plots share one look
sns.set_style("whitegrid")


In [8]:
# Read the customers, orders and geolocation CSV extracts in one pass;
# the tuple order below matches the order of the target names.
customers_df, orders_df, geolocation_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/olist_customers_dataset.csv',
        '../data/olist_orders_dataset.csv',
        '../data/olist_geolocation_dataset.csv',
    )
)

In [9]:
# Report (rows, columns) for each loaded frame as a quick load check
for label, frame in (
    ("Customers shape:", customers_df),
    ("Orders shape:", orders_df),
    ("Geolocation shape:", geolocation_df),
):
    print(label, frame.shape)

Customers shape: (99441, 5)
Orders shape: (99441, 8)
Geolocation shape: (1000163, 5)


In [10]:
# Preview the first five customer rows
customers_df.head(n=5)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [11]:
# Preview the first five geolocation rows
geolocation_df.head(n=5)


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [12]:
# Preview the first five order rows
orders_df.head(n=5)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [13]:
# Shorten the geolocation column names and drop exact duplicate rows.
# Written as a single non-mutating chain (no inplace=True) and rebound to the
# same name, so later cells keep working. Re-running the cell is harmless:
# rename() ignores keys that are no longer present, and drop_duplicates()
# is a no-op on already de-duplicated data.
geolocation_df = (
    geolocation_df
    .rename(columns={
        'geolocation_zip_code_prefix': 'zip_prefix',
        'geolocation_lat': 'latitude',
        'geolocation_lng': 'longitude',
        'geolocation_city': 'city',
        'geolocation_state': 'state',
    })
    .drop_duplicates()  # keeps the first occurrence; original index labels are preserved
)

# Check for missing or null values (count per column)
print(geolocation_df.isna().sum())

zip_prefix    0
latitude      0
longitude     0
city          0
state         0
dtype: int64


In [17]:
import pandas as pd

def _most_common(values):
    """Return the most frequent value in ``values``, or ``None`` if there is none.

    ``Series.mode`` returns the modes in sorted order and drops NaN by
    default, so ties resolve to the first sorted entry and an all-NaN group
    produces ``None``. The mode is computed once per group.
    """
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# Collapse the point-level table to one row per zip_prefix:
#   - latitude / longitude: arithmetic mean of all points sharing the prefix
#   - city / state: most frequent label within the prefix
aggregated_geolocation = geolocation_df.groupby('zip_prefix', as_index=False).agg({
    'latitude': 'mean',
    'longitude': 'mean',
    'city': _most_common,
    'state': _most_common,
})

# This gives one row per zip_prefix, with an aggregated lat/long
aggregated_geolocation.head()


Unnamed: 0,zip_prefix,latitude,longitude,city,state
0,1001,-23.550227,-46.634039,sao paulo,SP
1,1002,-23.547657,-46.634991,sao paulo,SP
2,1003,-23.549,-46.635582,sao paulo,SP
3,1004,-23.549829,-46.634792,sao paulo,SP
4,1005,-23.549547,-46.636406,sao paulo,SP


In [18]:
# Dimensions of the aggregated table as (rows, columns)
aggregated_geolocation.shape

(19015, 5)

In [16]:
from shapely.geometry import Point

# Create the point geometries from longitude and latitude in one vectorized
# call instead of constructing a shapely Point per row in a Python loop.
# points_from_xy takes x (longitude) first, then y (latitude). It returns an
# array, so the values are matched to rows by position, exactly as the
# original list was.
geometry = gpd.points_from_xy(geolocation_df['longitude'], geolocation_df['latitude'])

# EPSG:4326 is the WGS 84 longitude/latitude coordinate reference system
geolocation_gdf = gpd.GeoDataFrame(geolocation_df, crs="EPSG:4326", geometry=geometry)

# Check the new GeoDataFrame
geolocation_gdf.head()


Unnamed: 0,zip_prefix,latitude,longitude,city,state,geometry
0,1037,-23.545621,-46.639292,sao paulo,SP,POINT (-46.63929 -23.54562)
1,1046,-23.546081,-46.64482,sao paulo,SP,POINT (-46.64482 -23.54608)
2,1046,-23.546129,-46.642951,sao paulo,SP,POINT (-46.64295 -23.54613)
3,1041,-23.544392,-46.639499,sao paulo,SP,POINT (-46.6395 -23.54439)
4,1035,-23.541578,-46.641607,sao paulo,SP,POINT (-46.64161 -23.54158)
