# 📊 Brazilian E-Commerce Dataset - Exploratory Data Analysis
**Author:** Jeisson Steve Rojas Velásquez  
**Date:** June 15, 2025  

This notebook presents a statistical overview of key variables from the Brazilian E-Commerce Public Dataset by Olist.
It includes summary statistics, distribution analysis, and visualizations for variables such as sales, customer spending, delivery time, and reviews.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew, kurtosis

# Settings
# `sns.set` is only an alias of `sns.set_theme`; call the preferred name directly.
sns.set_theme(style='whitegrid')

## 📥 Load Summary Data

In [None]:
# Load the pre-computed summary statistics (presumably one row per variable).
eda = pd.read_csv('eda_summary.csv')

# The simulation cells further down select rows by `Variable` and read `Mean`
# and `StdDev`; verify those columns here so a malformed file fails at load
# time with a clear message instead of a KeyError several cells later.
required_columns = {'Variable', 'Mean', 'StdDev'}
missing_columns = required_columns - set(eda.columns)
assert not missing_columns, (
    f"eda_summary.csv is missing required columns: {sorted(missing_columns)}"
)

eda

## 📋 EDA Summary Table

In [None]:
# Build the per-column display formats, grouped by how each statistic is shown.
# Whole-number statistics: no decimals, with thousands separators.
whole_number_columns = ['Mean', 'Max', 'Min', 'Mode', 'Median', 'Range', 'Q1', 'Q3', 'IQR']
column_formats = {column: '{:,.0f}' for column in whole_number_columns}

# Dispersion and shape statistics keep decimals; variance uses scientific notation.
column_formats.update({
    'StdDev': '{:,.2f}',
    'Variance': '{:,.2e}',
    'Skewness': '{:.2f}',
    'Kurtosis': '{:.2f}',
})

# Last expression of the cell: the styled table is rendered as the output.
eda.style.format(column_formats)

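## 📈 Visualizing Distributions

> **Note:** The histograms below are *simulated*. Each one is a normal sample generated from the `Mean` and `StdDev` values in the summary table, not from the raw records, so it illustrates location and spread only and does not reflect any skewness or kurtosis in the underlying data.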

### Sales

In [None]:
# Simulated view of the 'Sales' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Sales'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Customers

In [None]:
# Simulated view of the 'Customers' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Customers'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Payment

In [None]:
# Simulated view of the 'Payment' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Payment'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Product

In [None]:
# Simulated view of the 'Product' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Product'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Sellers

In [None]:
# Simulated view of the 'Sellers' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Sellers'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Geographic

In [None]:
# Simulated view of the 'Geographic' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Geographic'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Review

In [None]:
# Simulated view of the 'Review' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Review'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()

### Temporal

In [None]:
# Simulated view of the 'Temporal' summary variable (for visualization only).
# The sample is drawn from a normal distribution parameterised by the Mean and
# StdDev in the summary table, so the histogram is roughly symmetric by
# construction and does not show the shape of the underlying data.
variable = 'Temporal'

# Look the summary row up once and fail with a clear message if it is absent,
# instead of the bare IndexError that `.values[0]` raises on an empty selection.
summary_row = eda.loc[eda['Variable'] == variable, ['Mean', 'StdDev']]
if summary_row.empty:
    raise KeyError(f"No row with Variable == {variable!r} in the EDA summary")
mean_value, std_value = summary_row.iloc[0]

# A locally seeded Generator keeps the figure reproducible without resetting
# NumPy's global random state for the rest of the notebook.
rng = np.random.default_rng(0)
simulated_data = rng.normal(loc=mean_value, scale=std_value, size=1000)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(simulated_data, kde=True, color='skyblue', ax=ax)
ax.set(
    title=f'{variable} Distribution (Simulated)',
    xlabel=variable,
    ylabel='Frequency',
)
plt.show()