In [1]:
# Imports
import numpy as np
import pandas as pd
import re

# Load the raw export. low_memory=False has pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
walmart = pd.read_csv("walmart_dataset.csv", low_memory=False)
walmart.head()

# Number of distinct values in each column
unique = walmart.nunique()
unique

# RunDate and PROMOTION are considered uninformative here and are removed
walmart = walmart.drop(columns=["RunDate", "PROMOTION"])

# Number of missing values in each column
walmart.isna().sum()

# Convert PRODUCT_SIZE to numbers (decimals are kept); anything that cannot be
# parsed as a number becomes NaN
walmart["PRODUCT_SIZE"] = pd.to_numeric(walmart["PRODUCT_SIZE"], errors="coerce")

# Column / dtype / non-null overview
walmart.info()

# Lowercase every free-text column so later comparisons are case-insensitive
text_cols = ["DEPARTMENT", "CATEGORY", "SUBCATEGORY", "BRAND", "BREADCRUMBS", "PRODUCT_NAME"]
walmart[text_cols] = walmart[text_cols].apply(lambda column: column.str.lower())

# Add new product_clean column to help with matching
# Remove common packaging / size / fluff words
remove_tokens = [
    "family", "size", "family size", "value", "value pack", "pack", "pk", "ct", "count",
    "bottle", "bottles", "can", "cans", "bag", "bags", "box", "boxes", "carton", "tub",
    "jar", "jars", "cup", "cups",
    "oz", "fl", "fl oz", "ounce", "ounces",
    "lb", "lbs", "pound", "pounds",
    "gal", "gallon", "gallons",
    "stick", "sticks"
]

# Kept as a plain pattern string so any other code that uses it keeps working.
remove_pattern = r"\b(" + "|".join(re.escape(t) for t in remove_tokens) + r")\b"

# Compiled once at import time instead of being looked up on every call.
_REMOVE_TOKENS_RE = re.compile(remove_pattern)
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_product_name(name, brand=None):
    """
    Create a simplified product name for fuzzy matching.

    Steps, in order:
    - lowercase
    - remove the brand (as a whole word/phrase) if it occurs in the name
    - replace every character other than a-z, 0-9 and whitespace with a space
    - strip the packaging / size words listed in ``remove_tokens``
    - collapse runs of whitespace and trim

    Parameters
    ----------
    name : str or missing
        Raw product name. A missing value (None / NaN) yields "".
    brand : str or missing, optional
        Brand to strip from the name. Missing or blank brands are ignored.

    Returns
    -------
    str
        The cleaned name; may be empty.
    """
    # A missing name must not turn into the literal text "nan" / "none".
    if pd.isna(name):
        return ""
    s = str(name).lower()

    # Remove brand name if in product name. NaN is truthy, so test for missing
    # explicitly; otherwise the word "nan" would be stripped from the name.
    if brand is not None and not pd.isna(brand):
        b = str(brand).lower().strip()
        if b:
            # Lookarounds instead of \b: \b never matches next to punctuation,
            # so brands such as "yum!" would be left in the name.
            s = re.sub(r"(?<!\w)" + re.escape(b) + r"(?!\w)", " ", s)

    # Remove non-alphanumeric characters but keep spaces
    s = _NON_ALNUM_RE.sub(" ", s)

    # Remove common packaging / size text
    s = _REMOVE_TOKENS_RE.sub(" ", s)

    # Collapse multiple spaces and trim
    return _WHITESPACE_RE.sub(" ", s).strip()

# Build the simplified matching key for every row. Zipping the two columns
# gives the same values as a row-wise DataFrame.apply(axis=1) without the
# per-row overhead.
walmart["product_clean"] = [
    clean_product_name(name, brand)
    for name, brand in zip(walmart["PRODUCT_NAME"], walmart["BRAND"])
]

# Check distinct values after cleaning
walmart[["PRODUCT_NAME", "product_clean"]].head()

# Drop rows that are missing critical fields
# (SUBCATEGORY and PRODUCT_SIZE are deliberately not listed, so rows missing
# only those are kept.)
critical_cols = [
    "SHIPPING_LOCATION",
    "DEPARTMENT",
    "CATEGORY",
    "BREADCRUMBS",
    "SKU",
    "PRODUCT_URL",
    "PRODUCT_NAME",
    "BRAND",
    "PRICE_RETAIL",
    "PRICE_CURRENT"
]

walmart.dropna(subset=critical_cols, inplace=True)

# Check remaining missing values
walmart.isna().sum()

# Drop duplicates: rows that agree on SKU, cleaned name, size and both prices
# are treated as the same listing; the first occurrence is kept.
walmart.drop_duplicates(
    subset=["SKU", "product_clean", "PRODUCT_SIZE", "PRICE_CURRENT", "PRICE_RETAIL"],
    inplace=True
)
# See number of unique values now
walmart.nunique()

# Sort for readability (by category / subcategory / brand / product_clean)
walmart.sort_values(
    by=["CATEGORY", "SUBCATEGORY", "BRAND", "product_clean"],
    inplace=True
)

# Category-level summary
# dropna=False keeps rows whose SUBCATEGORY is missing as their own group.
# With the default (dropna=True) every such row is silently left out of the
# summary, and SUBCATEGORY is not a critical column, so many rows lack it.
category_groups = walmart.groupby(["CATEGORY", "SUBCATEGORY"], dropna=False)
category_groups[["PRICE_RETAIL"]].describe()

# Save cleaned dataset
walmart.to_csv("walmart_cleaned.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568534 entries, 0 to 568533
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              568534 non-null  int64  
 1   SHIPPING_LOCATION  568534 non-null  int64  
 2   DEPARTMENT         568534 non-null  object 
 3   CATEGORY           568534 non-null  object 
 4   SUBCATEGORY        361324 non-null  object 
 5   BREADCRUMBS        568534 non-null  object 
 6   SKU                568534 non-null  int64  
 7   PRODUCT_URL        568534 non-null  object 
 8   PRODUCT_NAME       568534 non-null  object 
 9   BRAND              568507 non-null  object 
 10  PRICE_RETAIL       568534 non-null  float64
 11  PRICE_CURRENT      568534 non-null  float64
 12  PRODUCT_SIZE       504974 non-null  float64
 13  tid                568534 non-null  int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 60.7+ MB
