In [1]:
# Ad-hoc EDA: Unique State Analysis for Sellers and Customers
# Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Configure pandas display for this notebook: no cap on the number of columns
# shown, and up to 100 rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Banner announcing what the notebook does, one item per output line.
print(
    "=== AD-HOC EDA: STATE ANALYSIS ===",
    "Loading data and analyzing unique states in seller and customer data",
    "-" * 60,
    sep="\n",
)


=== AD-HOC EDA: STATE ANALYSIS ===
Loading data and analyzing unique states in seller and customer data
------------------------------------------------------------


In [3]:
# Load CSV files from data folder
print("STEP 1: LOADING DATA")
print("-" * 30)

# Define file paths
data_path = Path('../data/brazilian-ecommerce/')


def load_dataset(file_name, label):
    """Read one CSV from ``data_path`` and print a short load summary.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_path``.
    label : str
        Human-readable dataset name used in the summary line.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset.

    Raises
    ------
    FileNotFoundError
        Re-raised after printing the error message when the file is missing.
    """
    try:
        frame = pd.read_csv(data_path / file_name)
    except FileNotFoundError:
        print(f"❌ Error: {file_name} not found!")
        # Re-raise instead of continuing: otherwise the target variable stays
        # undefined (NameError in a later cell) or, worse, keeps a stale frame
        # from an earlier run of this cell in the same kernel.
        raise
    print(f"✓ {label} data loaded: {frame.shape[0]:,} rows x {frame.shape[1]} cols")
    print(f"  Columns: {list(frame.columns)}")
    return frame


# Load sellers and customers datasets
sellers_df = load_dataset('olist_sellers_dataset.csv', 'Sellers')
customers_df = load_dataset('olist_customers_dataset.csv', 'Customers')

print("\n" + "="*60)


STEP 1: LOADING DATA
------------------------------
✓ Sellers data loaded: 3,095 rows x 4 cols
  Columns: ['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']
✓ Customers data loaded: 99,441 rows x 5 cols
  Columns: ['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']



In [4]:
# ANALYZE UNIQUE VALUES IN SELLER_STATE
print("SELLER STATE ANALYSIS")
print("-" * 40)

print("=== UNIQUE VALUES IN SELLER_STATE ===")
# Drop missing values before collecting the distinct codes: unique() keeps NaN,
# which would be counted as a "state" and would make sorted() raise TypeError
# when it compares the float NaN against the string codes.
seller_states = sellers_df['seller_state'].dropna().unique()
seller_state_missing = int(sellers_df['seller_state'].isna().sum())
print(f"Number of unique seller states: {len(seller_states)}")
if seller_state_missing:
    # Only reported when present, so the output is unchanged for complete data.
    print(f"Rows with missing seller_state (excluded): {seller_state_missing:,}")
print("\nUnique seller states (alphabetically sorted):")
for state in sorted(seller_states):
    print(f"  - {state}")

# value_counts() orders by frequency (descending) and skips NaN by default,
# so it is consistent with the NaN-free list above.
seller_state_counts = sellers_df['seller_state'].value_counts()
print("\nSeller state distribution (top 10):")
print(seller_state_counts.head(10))

print("\nSeller state distribution (all states):")
print(seller_state_counts)


SELLER STATE ANALYSIS
----------------------------------------
=== UNIQUE VALUES IN SELLER_STATE ===
Number of unique seller states: 23

Unique seller states (alphabetically sorted):
  - AC
  - AM
  - BA
  - CE
  - DF
  - ES
  - GO
  - MA
  - MG
  - MS
  - MT
  - PA
  - PB
  - PE
  - PI
  - PR
  - RJ
  - RN
  - RO
  - RS
  - SC
  - SE
  - SP

Seller state distribution (top 10):
seller_state
SP    1849
PR     349
MG     244
SC     190
RJ     171
RS     129
GO      40
DF      30
ES      23
BA      19
Name: count, dtype: int64

Seller state distribution (all states):
seller_state
SP    1849
PR     349
MG     244
SC     190
RJ     171
RS     129
GO      40
DF      30
ES      23
BA      19
CE      13
PE       9
PB       6
RN       5
MS       5
MT       4
RO       2
SE       2
PI       1
AC       1
MA       1
AM       1
PA       1
Name: count, dtype: int64


In [5]:
# ANALYZE UNIQUE VALUES IN CUSTOMER_STATE
print("\n" + "="*60)
print("CUSTOMER STATE ANALYSIS")
print("-" * 40)

print("=== UNIQUE VALUES IN CUSTOMER_STATE ===")
# Drop missing values before collecting the distinct codes: unique() keeps NaN,
# which would be counted as a "state" and would make sorted() raise TypeError
# when it compares the float NaN against the string codes.
customer_states = customers_df['customer_state'].dropna().unique()
customer_state_missing = int(customers_df['customer_state'].isna().sum())
print(f"Number of unique customer states: {len(customer_states)}")
if customer_state_missing:
    # Only reported when present, so the output is unchanged for complete data.
    print(f"Rows with missing customer_state (excluded): {customer_state_missing:,}")
print("\nUnique customer states (alphabetically sorted):")
for state in sorted(customer_states):
    print(f"  - {state}")

# value_counts() orders by frequency (descending) and skips NaN by default,
# so it is consistent with the NaN-free list above.
customer_state_counts = customers_df['customer_state'].value_counts()
print("\nCustomer state distribution (top 10):")
print(customer_state_counts.head(10))

print("\nCustomer state distribution (all states):")
print(customer_state_counts)



CUSTOMER STATE ANALYSIS
----------------------------------------
=== UNIQUE VALUES IN CUSTOMER_STATE ===
Number of unique customer states: 27

Unique customer states (alphabetically sorted):
  - AC
  - AL
  - AM
  - AP
  - BA
  - CE
  - DF
  - ES
  - GO
  - MA
  - MG
  - MS
  - MT
  - PA
  - PB
  - PE
  - PI
  - PR
  - RJ
  - RN
  - RO
  - RR
  - RS
  - SC
  - SE
  - SP
  - TO

Customer state distribution (top 10):
customer_state
SP    41746
RJ    12852
MG    11635
RS     5466
PR     5045
SC     3637
BA     3380
DF     2140
ES     2033
GO     2020
Name: count, dtype: int64

Customer state distribution (all states):
customer_state
SP    41746
RJ    12852
MG    11635
RS     5466
PR     5045
SC     3637
BA     3380
DF     2140
ES     2033
GO     2020
PE     1652
CE     1336
PA      975
MT      907
MA      747
MS      715
PB      536
PI      495
RN      485
AL      413
SE      350
TO      280
RO      253
AM      148
AC       81
AP       68
RR       46
Name: count, dtype: int64


In [6]:
# Read the orders CSV file.
# Built from `data_path` (defined in the data-loading cell) rather than a
# second hard-coded copy of the directory, so all datasets are read from one
# configurable location.
orders_df = pd.read_csv(data_path / 'olist_orders_dataset.csv')

# Distinct statuses in order of first appearance; computed once and reused
# for both the listing and the count below.
order_statuses = orders_df['order_status'].unique()

# Check unique order statuses in source data
print("=== SOURCE DATA (CSV) ===")
print("Unique order statuses:")
print(order_statuses)
print(f"\nTotal unique statuses: {len(order_statuses)}")
print("\nStatus counts:")
print(orders_df['order_status'].value_counts())

=== SOURCE DATA (CSV) ===
Unique order statuses:
['delivered' 'invoiced' 'shipped' 'processing' 'unavailable' 'canceled'
 'created' 'approved']

Total unique statuses: 8

Status counts:
order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64
