### 1. Data Loading

In [56]:
# Import dependencies
import pandas as pd

In [58]:
# Read the raw retail dataset (path is relative to the notebook's working directory)
raw_csv_path = "Resources/retail_store_data_large.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first five rows as a rich table
display(df.head())

Unnamed: 0,Date,Store,Region,Product Category,Product Subcategory,Units Sold,Unit Price,Marketing Spend,Revenue,COGS,Gross Profit,Net Profit
0,2022-01-01,Store B,West,Sports,Outdoor,36,434.16,183.14,15629.76,6911.575877,8718.184123,8535.044123
1,2022-01-01,Store F,West,Beauty,Haircare,19,276.49,397.66,5253.31,2320.134775,2933.175225,2535.515225
2,2022-01-01,Store C,South,Grocery,Snacks,61,861.96,314.91,52579.56,21946.025852,30633.534148,30318.624148
3,2022-01-01,Store J,South,Grocery,Fresh Produce,94,372.56,9.22,35020.64,23996.010323,11024.629677,11015.409677
4,2022-01-01,Store B,North,Grocery,Snacks,15,547.82,230.34,8217.3,5510.70735,2706.59265,2476.25265


### 2. Data Cleaning

In [61]:
# 1. Initial inspection
# Report the frame's dimensions, then the dtype and the null count of every column.
print("Initial shape:", df.shape)
for heading, per_column in (
    ("\nData types:\n", df.dtypes),
    ("\nMissing values:\n", df.isnull().sum()),
):
    print(heading, per_column)

Initial shape: (100000, 12)

Data types:
 Date                    object
Store                   object
Region                  object
Product Category        object
Product Subcategory     object
Units Sold               int64
Unit Price             float64
Marketing Spend        float64
Revenue                float64
COGS                   float64
Gross Profit           float64
Net Profit             float64
dtype: object

Missing values:
 Date                   0
Store                  0
Region                 0
Product Category       0
Product Subcategory    0
Units Sold             0
Unit Price             0
Marketing Spend        0
Revenue                0
COGS                   0
Gross Profit           0
Net Profit             0
dtype: int64


In [63]:
# 2. Handle duplicates
# Drop rows that are identical across every column, keeping the first
# occurrence of each (this is pandas' default policy, spelled out here).
df = df.drop_duplicates(keep="first")
print("\nShape after removing duplicates:", df.shape)


Shape after removing duplicates: (100000, 12)


In [65]:
# 3. Fix date format with error handling
# errors='coerce' turns values that cannot be parsed into NaT instead of raising,
# so bad rows can be counted afterwards.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Rows whose Date is missing after parsing (unparseable or already null)
date_is_missing = df['Date'].isna()
invalid_dates = df[date_is_missing]
if len(invalid_dates) > 0:
    print(f"\nWarning: {len(invalid_dates)} rows with invalid dates")

In [69]:
# 4. Standardize text fields
# Title-case each label and trim surrounding whitespace.
# NOTE(review): 'Region' is overwritten in place, whereas the cleaned category is
# written to a new 'Product_Category' column and the raw 'Product Category' column
# is kept as well - confirm that both columns are wanted in the export.
for cleaned_col, raw_col in (('Region', 'Region'), ('Product_Category', 'Product Category')):
    df[cleaned_col] = df[raw_col].str.title().str.strip()

In [75]:
# 5. Handle missing values more robustly
# Numerical column: fill gaps in marketing spend with the column median rather
# than 0. The filled values go into a new 'Marketing_Spend' column; the original
# 'Marketing Spend' column is left untouched.
marketing_raw = df['Marketing Spend']
median_marketing = marketing_raw.median()
df['Marketing_Spend'] = marketing_raw.fillna(median_marketing)

# Categorical column: label missing regions explicitly
df['Region'] = df['Region'].fillna('Unknown')

In [81]:
# 6. Validate numerical ranges
# Print the minimum and maximum of each monetary column for a quick sanity check.
print("\nValue ranges:")
for label, column in (("Revenue:", 'Revenue'), ("Gross Profit:", 'Gross Profit')):
    print(label, df[column].min(), "-", df[column].max())


Value ranges:
Revenue: 6.04 - 98974.26
Gross Profit: 2.419622333484721 - 58681.29466251924


In [83]:
# 7. Calculate derived metrics with safety checks
# Gross Margin = Gross Profit / Revenue, rounded to 4 decimal places.
# Zero revenue is masked to NaN with Series.where so the divisor stays float64.
# Substituting pd.NA instead would turn the divisor into an object-dtype column
# as soon as a zero is present, and the division and .round(4) would then no
# longer behave as ordinary float arithmetic. Masking also prevents inf margins
# from a division by zero.
revenue_nonzero = df['Revenue'].where(df['Revenue'] != 0)
df['Gross Margin'] = (df['Gross Profit'] / revenue_nonzero).round(4)

# Rows where no margin could be computed (NaN result)
invalid_margins = df[df['Gross Margin'].isna()]
if not invalid_margins.empty:
    print(f"\nWarning: {len(invalid_margins)} rows with zero revenue")

In [85]:
# 8. Additional quality checks
# Revenue should never be negative; collect any rows that violate this and
# report how many there are.
negative_revenue = df.loc[df['Revenue'].lt(0)]
if len(negative_revenue) > 0:
    print(f"\nWarning: {len(negative_revenue)} rows with negative revenue")

In [87]:
# 9. Final inspection
# Show the final dimensions and a small random sample of the cleaned frame.
# - A fixed seed makes the sampled rows repeatable across Restart & Run All
#   (for the same input data).
# - The sample size is capped at the row count so a short frame does not raise.
# - display() renders the same rich table as the preview in the loading cell,
#   instead of a line-wrapped plain-text dump.
SAMPLE_SEED = 42
sample_size = min(5, len(df))

print("\nFinal shape:", df.shape)
print("\nSample of cleaned data:")
display(df.sample(sample_size, random_state=SAMPLE_SEED))


Final shape: (100000, 15)

Sample of cleaned data:
            Date    Store Region Product Category Product Subcategory  \
80042 2024-05-26  Store D   East      Electronics              Phones   
21089 2022-08-20  Store D   East           Beauty            Skincare   
78887 2024-05-13  Store H  South           Sports             Fitness   
63499 2023-11-26  Store B  South             Toys      Action Figures   
46359 2023-05-25  Store I   East           Beauty            Skincare   

       Units Sold  Unit Price  Marketing Spend   Revenue          COGS  \
80042          87      343.55           206.71  29888.85  12201.246049   
21089          23      113.37           384.87   2607.51   1706.454671   
78887          28      244.99           391.10   6859.72   2940.398260   
63499          67      344.67            58.96  23092.89  12807.626806   
46359          66      995.70            83.16  65716.20  37062.668185   

       Gross Profit    Net Profit Product_Category  Marketing_Sp

In [91]:
# 10. Export cleaned data with timestamp
from datetime import datetime

# Minute-resolution stamp (YYYYMMDD_HHMM) taken from the local clock; runs that
# start in different minutes therefore write to different files.
export_time = datetime.now()
timestamp = export_time.strftime("%Y%m%d_%H%M")

# Both files are written to the current working directory.
csv_path = f"retail_data_cleaned_{timestamp}.csv"
parquet_path = f"retail_data_cleaned_{timestamp}.parquet"

df.to_csv(csv_path, index=False)  # index=False: do not write the row index as a column
df.to_parquet(parquet_path)

print("\nCleaning complete! Files saved with timestamp.")


Cleaning complete! Files saved with timestamp.
