In [2]:
import pandas as pd

df1 = pd.read_csv('alo_yoga_products.csv')

In [4]:
df1

Unnamed: 0,name,current_color,price,sale_price,badge,image_url,product_url,colors_available,sizes_available,rating,review_count,scraped_at
0,Airlift Intrigue Bra - Espresso,,$68,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9679r-airlif...,"Espresso, 12, 12, Espresso, Black, Navy, Anthr...",,,,2025-09-06 14:02:15
1,7/8 High-Waist Airlift Legging - Espresso,,$128,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w51314r-7-8-h...,"Espresso, 12, 12, Espresso, Black, Navy, Anthr...",,,,2025-09-06 14:02:16
2,Work It Mini Skirt - Green Olive,,$98,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w6503r-work-i...,"Green Olive, Green Olive, Black",,,,2025-09-06 14:02:17
3,Work It Bomber - Green Olive,,$228,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w4689r-work-i...,"Green Olive, Green Olive, Black",,,,2025-09-06 14:02:17
4,Airlift Intrigue Bra - Black,,$68,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9679r-airlif...,"Black, 12, 12, Black, Espresso, Navy, Anthraci...",,,,2025-09-06 14:02:18
...,...,...,...,...,...,...,...,...,...,...,...,...
715,Airlift Strength Bra - Navy,,$78,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9724r-airlif...,"Navy, Navy, Black, Espresso, Bold Red",,,,2025-09-06 14:09:40
716,Airlift Advantage Racerback Bra - Spearmint,,$68,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9555r-airlif...,"Spearmint, Spearmint, Black, White, Macadamia,...",,,,2025-09-06 14:09:40
717,Airlift Headband - Lunar Grey,,$34,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/a0054u-airlif...,"Lunar Grey, Lunar Grey, Green Olive, Neon Bubb...",,,,2025-09-06 14:09:41
718,Polar Fleece Wintry Mix Skirt - Black,,$98,,,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w6476r-polar-...,"Black, Black, Ivory, Charcoal Green",,,,2025-09-06 14:09:42


In [6]:
df1.describe()

Unnamed: 0,current_color,sale_price,badge,sizes_available,rating,review_count
count,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,
std,,,,,,
min,,,,,,
25%,,,,,,
50%,,,,,,
75%,,,,,,
max,,,,,,


In [8]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# ------------------ config ------------------
FILES = [
    "alo_yoga_products.csv",
    "altardstate_products.csv",
    "cupshe_products.csv",
    "edikted_products.csv",
    "gymshark_products.csv",
    "nakd_products.csv",
    "princess_polly.csv",
    "vuori_products.csv",
]
DATA_DIR = Path("./")  # folder with your CSVs

# ------------------ helpers ------------------
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = [re.sub(r"\s+", "_", c.strip().lower()) for c in out.columns]
    return out

RANGE_SEPS = r"[-–—to]+"  # -, en dash, em dash, or the word 'to'

def parse_price_series(s: pd.Series) -> pd.Series:
    """
    Return a single numeric 'price' series.
    Handles $, commas, and ranges like '40-60' or '$40–$60' (uses midpoint).
    """
    txt = s.astype(str).str.strip()
    txt = txt.str.replace(r"[^\d\.,\-–—to]", "", regex=True).str.replace(",", "", regex=False)
    parts = txt.str.split(rf"\s*{RANGE_SEPS}\s*", n=1, regex=True)

    pmin = parts.apply(lambda x: x[0].strip() if isinstance(x, list) and len(x) >= 1 else np.nan)
    pmax = parts.apply(lambda x: x[1].strip() if isinstance(x, list) and len(x) >= 2 else np.nan)

    pmin = pd.to_numeric(pmin, errors="coerce")
    pmax = pd.to_numeric(pmax, errors="coerce")

    price = pmin.copy()
    both = pmin.notna() & pmax.notna()
    price[both] = (pmin[both] + pmax[both]) / 2.0
    return price

def infer_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

# ------------------ extraction ------------------
rows = []
for fname in FILES:
    df = pd.read_csv(DATA_DIR / fname)
    df = normalize_cols(df)

    # infer likely columns
    cat_col   = infer_col(df, ["category","product_category","product_type","type","categories","taxonomy"])
    price_col = infer_col(df, ["price","sale_price","current_price","list_price","final_price","price_usd"])
    brand_col = infer_col(df, ["brand","brand_name","vendor","label"])

    # categories
    if cat_col:
        cats = sorted(pd.Series(df[cat_col]).dropna().astype(str).unique())
        # keep it compact for the table; tweak join as you like
        categories_str = ", ".join(cats[:15]) + ("…" if len(cats) > 15 else "")
    else:
        categories_str = "—"  # missing in this file (you may replace with your curated list)

    # price
    if price_col:
        price_num = parse_price_series(df[price_col])
        avg_price = round(price_num.mean(skipna=True), 2) if price_num.notna().any() else None
    else:
        avg_price = None

    # brands
    if brand_col:
        brands = sorted(pd.Series(df[brand_col]).dropna().astype(str).unique())
        brands_str = ", ".join(brands[:15]) + ("…" if len(brands) > 15 else "")
    else:
        brands_str = "—"

    rows.append({
        "File": fname,
        "Product Categories": categories_str,
        "Avg Price": f"${avg_price:.2f}" if isinstance(avg_price, (int,float)) else "—",
        "Brands": brands_str
    })

summary = pd.DataFrame(rows)
summary


Unnamed: 0,File,Product Categories,Avg Price,Brands
0,alo_yoga_products.csv,—,$125.45,—
1,altardstate_products.csv,"Clothing, Clothing/Dresses",$68.57,"ALTAR'D STATE, AS REVIVAL"
2,cupshe_products.csv,—,$36.64,—
3,edikted_products.csv,—,$25.32,—
4,gymshark_products.csv,—,$42.48,—
5,nakd_products.csv,,$69.89,"Camelia Farhoodi x NA-KD, Cecilie Haugaard x N..."
6,princess_polly.csv,—,$56.94,"Lioness, Motel, Princess Polly, Princess Polly..."
7,vuori_products.csv,—,$92.70,—


In [10]:
import pandas as pd
from pathlib import Path

# If you ran the cleaning step already, use the master it created:
MASTER = Path("cleaned/all_products_master.csv")
df = pd.read_csv(MASTER)
df.columns = [c.strip().lower() for c in df.columns]

# --- 1) Your curated Product Categories per file (from your table) ---
CURATED_CATS = {
    "alo_yoga_products.csv": "Bras, Leggings, Skirts, Bombers, Coverups, Shorts, Pullovers, Sweatpants, Tees",
    "altardstate_products.csv": "Dresses (Maxi, Midi, Mini)",
    "cupshe_products.csv": "Bikini Sets, One-Piece Swimsuits",
    "edikted_products.csv": "Sweaters, Sweatpants, Jeans, Tops, Mini Skirts, Shorts, Skorts",
    "gymshark_products.csv": "Sports Bras, Crop Tops, Leggings, Shorts, One-Pieces",
    "nakd_products.csv": "Coats, Jackets, Blazers, Jeans, Tops, Sweater, Boots, Ballerinas, Trenchcoats",
    "princess_polly.csv": "Mini Dresses",
    "vuori_products.csv": "Pants, Leggings, Tees, Joggers, Tops, Jackets, Tank, Cardigan, Hoodie, Trouser, Bra, Short",
}

# --- 2) Avg price per file (computed from numeric 'price' in the cleaned master) ---
avg_price = (
    df.groupby("source_file", dropna=False)["price"]
      .mean()
      .round(2)
      .rename("Avg Price")
      .reset_index()
)

# --- 3) Brand list per file (if 'brand' exists) ---
def uniq_join(series, limit=None):
    vals = sorted(set(str(x) for x in series.dropna() if str(x).strip()))
    if limit and len(vals) > limit:
        return ", ".join(vals[:limit]) + "..."
    return ", ".join(vals) if vals else "—"

brands = (
    df.groupby("source_file", dropna=False)["brand"]
      .apply(lambda s: uniq_join(s, limit=None))  # set limit=15 if you want to truncate
      .rename("Brands")
      .reset_index()
)

# --- 4) Build summary table in your desired order ---
FILES = [
    "alo_yoga_products.csv",
    "altardstate_products.csv",
    "cupshe_products.csv",
    "edikted_products.csv",
    "gymshark_products.csv",
    "nakd_products.csv",
    "princess_polly.csv",
    "vuori_products.csv",
]

summary = pd.DataFrame({"File": FILES})
summary["Product Categories"] = summary["File"].map(CURATED_CATS)

summary = (summary
           .merge(avg_price.rename(columns={"source_file": "File"}), on="File", how="left")
           .merge(brands.rename(columns={"source_file": "File"}), on="File", how="left"))

# Format Avg Price like $00.00 and fill missing with —
summary["Avg Price"] = summary["Avg Price"].apply(lambda x: f"${x:0.2f}" if pd.notna(x) else "—")
summary["Brands"] = summary["Brands"].fillna("—")

# Show full text (no truncation)
pd.set_option("display.max_colwidth", None)
summary


FileNotFoundError: [Errno 2] No such file or directory: 'cleaned/all_products_master.csv'

In [12]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

DATA_DIR = Path("./")  # your notebook + csvs in same folder

FILES = [
    "alo_yoga_products.csv",
    "altardstate_products.csv",
    "cupshe_products.csv",
    "edikted_products.csv",
    "gymshark_products.csv",
    "nakd_products.csv",
    "princess_polly.csv",
    "vuori_products.csv",
]

CURATED_CATS = {
    "alo_yoga_products.csv": "Bras, Leggings, Skirts, Bombers, Coverups, Shorts, Pullovers, Sweatpants, Tees",
    "altardstate_products.csv": "Dresses (Maxi, Midi, Mini)",
    "cupshe_products.csv": "Bikini Sets, One-Piece Swimsuits",
    "edikted_products.csv": "Sweaters, Sweatpants, Jeans, Tops, Mini Skirts, Shorts, Skorts",
    "gymshark_products.csv": "Sports Bras, Crop Tops, Leggings, Shorts, One-Pieces",
    "nakd_products.csv": "Coats, Jackets, Blazers, Jeans, Tops, Sweater, Boots, Ballerinas, Trenchcoats",
    "princess_polly.csv": "Mini Dresses",
    "vuori_products.csv": "Pants, Leggings, Tees, Joggers, Tops, Jackets, Tank, Cardigan, Hoodie, Trouser, Bra, Short",
}

RANGE_SEPS = r"[-–—to]+"

def parse_price_series(s: pd.Series) -> pd.Series:
    txt = s.astype(str).str.strip()
    txt = txt.str.replace(r"[^\d\.,\-–—to]", "", regex=True).str.replace(",", "", regex=False)
    parts = txt.str.split(rf"\s*{RANGE_SEPS}\s*", n=1, regex=True)
    pmin = parts.apply(lambda x: x[0] if isinstance(x, list) and len(x) >= 1 else np.nan)
    pmax = parts.apply(lambda x: x[1] if isinstance(x, list) and len(x) >= 2 else np.nan)
    pmin = pd.to_numeric(pmin, errors="coerce")
    pmax = pd.to_numeric(pmax, errors="coerce")
    price = pmin.copy()
    both = pmin.notna() & pmax.notna()
    price[both] = (pmin[both] + pmax[both]) / 2.0
    return price

def infer_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

rows = []
for fname in FILES:
    df = pd.read_csv(DATA_DIR / fname)
    df.columns = [c.strip().lower() for c in df.columns]

    price_col = infer_col(df, ["price","sale_price","current_price","list_price","final_price","price_usd"])
    brand_col = infer_col(df, ["brand","brand_name","vendor","label"])

    if price_col:
        price_num = parse_price_series(df[price_col])
        avg_price = round(price_num.mean(skipna=True), 2) if price_num.notna().any() else None
    else:
        avg_price = None

    if brand_col:
        brands = sorted(set(str(x) for x in df[brand_col].dropna() if str(x).strip()))
        brands_str = ", ".join(brands)
    else:
        brands_str = "—"

    rows.append({
        "File": fname,
        "Product Categories": CURATED_CATS.get(fname,"—"),
        "Avg Price": f"${avg_price:.2f}" if isinstance(avg_price,(int,float)) else "—",
        "Brands": brands_str
    })

summary = pd.DataFrame(rows)
summary


Unnamed: 0,File,Product Categories,Avg Price,Brands
0,alo_yoga_products.csv,"Bras, Leggings, Skirts, Bombers, Coverups, Sho...",$125.45,—
1,altardstate_products.csv,"Dresses (Maxi, Midi, Mini)",$68.57,"ALTAR'D STATE, AS REVIVAL"
2,cupshe_products.csv,"Bikini Sets, One-Piece Swimsuits",$36.64,—
3,edikted_products.csv,"Sweaters, Sweatpants, Jeans, Tops, Mini Skirts...",$25.32,—
4,gymshark_products.csv,"Sports Bras, Crop Tops, Leggings, Shorts, One-...",$42.48,—
5,nakd_products.csv,"Coats, Jackets, Blazers, Jeans, Tops, Sweater,...",$69.89,"Camelia Farhoodi x NA-KD, Cecilie Haugaard x N..."
6,princess_polly.csv,Mini Dresses,$56.94,"Lioness, Motel, Princess Polly, Princess Polly..."
7,vuori_products.csv,"Pants, Leggings, Tees, Joggers, Tops, Jackets,...",$92.70,—


In [14]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("./")  # folder with your *_cleaned.csv files

FILES = [
    "alo_yoga_products_cleaned.csv",
    "altardstate_products_cleaned.csv",
    "cupshe_products_cleaned.csv",
    "edikted_products_cleaned.csv",
    "gymshark_products_cleaned.csv",
    "nakd_products_cleaned.csv",
    "princess_polly_cleaned.csv",
    "vuori_products_cleaned.csv",
]

# stitch all into one dataframe with source_file column
dfs = []
for f in FILES:
    df = pd.read_csv(DATA_DIR / f)
    df["source_file"] = f
    dfs.append(df)
all_cleaned = pd.concat(dfs, ignore_index=True)
all_cleaned.head()


Unnamed: 0,price(USD),image_url,product_url,product_name,current_color_from_name,available_colors_cleaned,source_file,product_id,wishlist_pid,name,...,discount,colors,fit,color,variant,available_colors,title,price (USD),stock_status,sales
0,68.0,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9679r-airlif...,Airlift Intrigue Bra,Espresso,"Espresso, Espresso, Black, Navy, Anthracite, T...",alo_yoga_products_cleaned.csv,,,,...,,,,,,,,,,
1,128.0,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w51314r-7-8-h...,7/8 High-Waist Airlift Legging,Espresso,"Espresso, Espresso, Black, Navy, Anthracite, P...",alo_yoga_products_cleaned.csv,,,,...,,,,,,,,,,
2,98.0,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w6503r-work-i...,Work It Mini Skirt,Green Olive,"Green Olive, Green Olive, Black",alo_yoga_products_cleaned.csv,,,,...,,,,,,,,,,
3,228.0,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w4689r-work-i...,Work It Bomber,Green Olive,"Green Olive, Green Olive, Black",alo_yoga_products_cleaned.csv,,,,...,,,,,,,,,,
4,68.0,https://cdn.shopify.com/s/files/1/2185/2813/fi...,https://www.aloyoga.com/products/w9679r-airlif...,Airlift Intrigue Bra,Black,"Black, Black, Espresso, Navy, Anthracite, Toas...",alo_yoga_products_cleaned.csv,,,,...,,,,,,,,,,


In [16]:
import re

CURATED_ALLOWED_CATEGORIES = {
    "alo_yoga_products_cleaned.csv": ["Bras","Leggings","Skirts","Bombers","Coverups","Shorts","Pullovers","Sweatpants","Tees"],
    "altardstate_products_cleaned.csv": ["Dresses","Maxi","Midi","Mini"],
    "cupshe_products_cleaned.csv": ["Bikini Sets","One-Piece Swimsuits"],
    "edikted_products_cleaned.csv": ["Sweaters","Sweatpants","Jeans","Tops","Mini Skirts","Shorts","Skorts"],
    "gymshark_products_cleaned.csv": ["Sports Bras","Crop Tops","Leggings","Shorts","One-Pieces"],
    "nakd_products_cleaned.csv": ["Coats","Jackets","Blazers","Jeans","Tops","Sweater","Boots","Ballerinas","Trenchcoats"],
    "princess_polly_cleaned.csv": ["Mini Dresses"],
    "vuori_products_cleaned.csv": ["Pants","Leggings","Tees","Joggers","Tops","Jackets","Tank","Cardigan","Hoodie","Trouser","Bra","Short"],
}

CATEGORY_KEYWORDS = {
    "Bikini Sets": [r"\bbikini\b"],
    "One-Piece Swimsuits": [r"\bone[-\s]?piece\b", r"\bswimsuit\b"],
    "Leggings": [r"\blegging(s)?\b"],
    "Shorts": [r"\bshort(s)?\b"],
    "Bras": [r"\bbra(s)?\b"],
    "Sports Bras": [r"\bsports?\s*bra(s)?\b"],
    "Sweatpants": [r"\bsweatpant(s)?\b"],
    "Sweaters": [r"\bsweater(s)?\b"],
    "Dresses": [r"\bdress(es)?\b"],
    "Mini Dresses": [r"\bmini\b.*\bdress\b"],
    "Coats": [r"\bcoat(s)?\b"],
    "Jackets": [r"\bjacket(s)?\b"],
    "Tops": [r"\btop(s)?\b", r"\btee(s)?\b", r"\btank\b"],
    "Jeans": [r"\bjean(s)?\b"],
    # add more as needed
}

compiled_rx = {cat: re.compile("|".join(pats), flags=re.I) for cat, pats in CATEGORY_KEYWORDS.items()}

def classify(row):
    src = row["source_file"]
    text = " ".join([str(row.get("product_name","")), str(row.get("category",""))]).lower()
    allowed = CURATED_ALLOWED_CATEGORIES.get(src, CATEGORY_KEYWORDS.keys())
    for cat in allowed:
        if cat in compiled_rx and compiled_rx[cat].search(text):
            return cat
    return "Uncategorized"

all_cleaned["final_category"] = all_cleaned.apply(classify, axis=1)


In [18]:
GROUPS = {
    "Swimwear": ["Bikini Sets","One-Piece Swimsuits"],
    "Activewear": ["Leggings","Sports Bras","Shorts","Sweatpants","Bras"],
    "Outerwear": ["Coats","Jackets"],
    "Dresses": ["Dresses","Mini Dresses"],
    "Other": []
}

def map_group(cat):
    for g,members in GROUPS.items():
        if cat in members:
            return g
    return "Other"

all_cleaned["category_group"] = all_cleaned["final_category"].apply(map_group)


In [20]:
out_dir = Path("by_category_groups")
out_dir.mkdir(exist_ok=True)

for grp, g in all_cleaned.groupby("category_group"):
    g.to_csv(out_dir / f"{grp}.csv", index=False)

print("Exported:", list(out_dir.glob("*.csv")))


Exported: [PosixPath('by_category_groups/Dresses.csv'), PosixPath('by_category_groups/Other.csv'), PosixPath('by_category_groups/Activewear.csv')]


In [22]:
import pandas as pd
from pathlib import Path
import re

DATA_DIR = Path("./")  # where your *_cleaned.csv files are
FILES = [
    "alo_yoga_products_cleaned.csv",
    "altardstate_products_cleaned.csv",
    "cupshe_products_cleaned.csv",
    "edikted_products_cleaned.csv",
    "gymshark_products_cleaned.csv",
    "nakd_products_cleaned.csv",
    "princess_polly_cleaned.csv",
    "vuori_products_cleaned.csv",
]

# map filename → brand name (fallback if brand col missing)
FILE_BRANDS = {
    "alo_yoga_products_cleaned.csv": "Alo Yoga",
    "altardstate_products_cleaned.csv": "Altar'd State",
    "cupshe_products_cleaned.csv": "Cupshe",
    "edikted_products_cleaned.csv": "Edikted",
    "gymshark_products_cleaned.csv": "Gymshark",
    "nakd_products_cleaned.csv": "NA-KD",
    "princess_polly_cleaned.csv": "Princess Polly",
    "vuori_products_cleaned.csv": "Vuori",
}

dfs = []
for f in FILES:
    df = pd.read_csv(DATA_DIR / f)
    df.columns = [c.lower().strip() for c in df.columns]

    # add missing brand
    if "brand" not in df.columns:
        df["brand"] = FILE_BRANDS[f]

    df["source_file"] = f
    dfs.append(df)

all_data = pd.concat(dfs, ignore_index=True)

# normalize key columns (fill missing)
if "product_name" not in all_data.columns:
    raise ValueError("No product_name column found — check cleaned files")

# ---------------------
# Category classification
# ---------------------
CATEGORY_KEYWORDS = {
    "Swimwear": ["bikini", "swimsuit", "one-piece"],
    "Activewear": ["legging","bra","sports bra","short","tee","tank","jogger","hoodie",
                   "pullover","sweatpant","crop top","skort","sweater","cardigan"],
    "Dresses": ["dress","maxi","midi","mini"],
    "Outerwear": ["jacket","coat","bomber","blazer","trench"],
    "Footwear": ["boot","ballerina"],
}

def assign_group(row):
    text = " ".join([
        str(row.get("product_name","")),
        str(row.get("category",""))
    ]).lower()
    for group, kws in CATEGORY_KEYWORDS.items():
        if any(re.search(rf"\b{k}\b", text) for k in kws):
            return group
    return "Other"

all_data["category_group"] = all_data.apply(assign_group, axis=1)

# ---------------------
# Save master
# ---------------------
all_data.to_csv("all_products_cleaned_master.csv", index=False)

print(all_data["category_group"].value_counts())


category_group
Other         6883
Activewear     416
Outerwear       52
Dresses         24
Name: count, dtype: int64


In [24]:
# === 1) Merge all cleaned CSVs ===
import pandas as pd
import numpy as np
import re
from pathlib import Path

DATA_DIR = Path("./")
FILES = [
    "alo_yoga_products_cleaned.csv",
    "altardstate_products_cleaned.csv",
    "cupshe_products_cleaned.csv",
    "edikted_products_cleaned.csv",
    "gymshark_products_cleaned.csv",
    "nakd_products_cleaned.csv",
    "princess_polly_cleaned.csv",
    "vuori_products_cleaned.csv",
]

FILE_BRANDS = {
    "alo_yoga_products_cleaned.csv": "Alo Yoga",
    "altardstate_products_cleaned.csv": "ALTAR'D STATE",
    "cupshe_products_cleaned.csv": "Cupshe",
    "edikted_products_cleaned.csv": "Edikted",
    "gymshark_products_cleaned.csv": "Gymshark",
    "nakd_products_cleaned.csv": "NA-KD",
    "princess_polly_cleaned.csv": "Princess Polly",
    "vuori_products_cleaned.csv": "Vuori",
}

dfs = []
for f in FILES:
    df = pd.read_csv(DATA_DIR / f)
    df.columns = [c.strip().lower() for c in df.columns]
    if "brand" not in df.columns:
        df["brand"] = FILE_BRANDS.get(f, "")
    df["source_file"] = f
    dfs.append(df)

all_data = pd.concat(dfs, ignore_index=True)

# Keep a tidy schema (add columns if present, else NaN)
keep_cols = ["product_name","brand","price","category","product_type","url","image_url","sku","id","source_file"]
for c in keep_cols:
    if c not in all_data.columns:
        all_data[c] = np.nan

# === 2) Build a rich searchable text field ===
def join_text(row):
    bits = [
        str(row.get("product_name","")),
        str(row.get("category","")),
        str(row.get("product_type",""))
    ]
    return " ".join([b for b in bits if b]).lower()

all_data["_text"] = all_data.apply(join_text, axis=1)

# === 3) Robust keyword sets (regex with plural/variants) ===
RX = {
    "Swimwear": re.compile(
        r"\b(bikini|bikini set[s]?|one[-\s]?piece[s]?|swimsuit[s]?)\b", re.I
    ),
    "Dresses": re.compile(
        r"\b(dress|dresses)\b|"
        r"\b(mini|midi|maxi)\b.*\bdress(es)?\b|"
        r"\bdress(es)?\b.*\b(mini|midi|maxi)\b", re.I
    ),
    "Outerwear": re.compile(
        r"\b(jacket|jackets|coat|coats|bomber|bombers|blazer|blazers|trench|trenchcoat|trench coat|puffer|parka|windbreaker|shacket|gilet)\b",
        re.I
    ),
    "Footwear": re.compile(
        r"\b(boot|boots|ballerina|ballerinas|flat|flats)\b", re.I
    ),
    "Activewear": re.compile(
        r"\b(legging|leggings|jogger|joggers|short|shorts|skirt|skirts|skort|skorts|"
        r"tee|tees|t-?shirt|t-?shirts|tank|tanks|tank\s*top|tank\s*tops|"
        r"bra|bras|sports?\s*bra|sports?\s*bras|"
        r"hoodie|hoodies|sweater|sweaters|sweatshirt|sweatshirts|pullover|pullovers|"
        r"cardigan|cardigans)\b",
        re.I
    ),
}

# Priority so specific categories win before generic ones:
PRIORITY = ["Dresses", "Swimwear", "Outerwear", "Footwear", "Activewear"]

def classify_group(text: str) -> str:
    for group in PRIORITY:
        if RX[group].search(text or ""):
            return group
    return "Other"

all_data["category_group"] = all_data["_text"].apply(classify_group)

# === 4) Results & sanity checks ===
counts = all_data["category_group"].value_counts(dropna=False)
print(counts)

# peek at some "Other" rows to see what's being missed:
others_sample = all_data.loc[all_data["category_group"]=="Other", ["product_name","category","product_type","brand","source_file"]].head(25)
display(others_sample)

# save master with group
all_data.drop(columns=["_text"], inplace=True)
all_data.to_csv("all_products_cleaned_master_with_groups.csv", index=False)
print("Saved -> all_products_cleaned_master_with_groups.csv")


category_group
Other         6648
Activewear     378
Dresses        270
Outerwear       79
Name: count, dtype: int64


Unnamed: 0,product_name,category,product_type,brand,source_file
6,Cropped Serenity Coverup,,,Alo Yoga,alo_yoga_products_cleaned.csv
10,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
13,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
17,Suit Up Trouser (Regular),,,Alo Yoga,alo_yoga_products_cleaned.csv
18,ALO Softsculpt Precision 1/4 Zip Long Sleeve,,,Alo Yoga,alo_yoga_products_cleaned.csv
21,ALO Sunset Sneaker,,,Alo Yoga,alo_yoga_products_cleaned.csv
23,Unisex Half-Crew Throwback Sock,,,Alo Yoga,alo_yoga_products_cleaned.csv
27,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
28,Serenity Wide Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
29,ALO Sunset Sneaker,,,Alo Yoga,alo_yoga_products_cleaned.csv


Saved -> all_products_cleaned_master_with_groups.csv


In [26]:
import re
import pandas as pd

# 1) Build a search text (if not already present)
if "_text" not in all_data.columns:
    def _join(r):
        return " ".join([
            str(r.get("product_name","")),
            str(r.get("category","")),
            str(r.get("product_type",""))
        ]).lower()
    all_data["_text"] = all_data.apply(_join, axis=1)

# 2) Extend regex patterns to catch the "Other" cases you observed
# NOTE: we only re-check rows currently labeled "Other"
EXTRA_RX = {
    "Swimwear": re.compile(r"\b(cover[-\s]?up|coverup)s?\b", re.I),

    "Footwear": re.compile(
        r"\b(sneaker|sneakers|runner|runners|sock|socks)\b", re.I
    ),

    "Activewear": re.compile(
        r"\b("
        r"sweatpant|sweatpants|"
        r"trouser|trousers|"
        r"zip\s*long\s*sleeve|1/4\s*zip|quarter\s*zip|"
        r"crewneck|"
        r"boxer\s*pant|boxer\s*pants|boxer\s*short|boxer\s*shorts|"
        r"cap|caps|"
        r"headband|headbands|"
        r"claw\s*clip|claw\s*clips"
        r")\b", re.I
    ),

    # If you also want to catch generic "pant/pants" as activewear, uncomment below:
    # "Activewear_generic": re.compile(r"\b(pant|pants)\b", re.I),
}

# 3) Re-categorize only the rows in "Other" using the new patterns
def recategorize_other(text: str) -> str:
    if pd.isna(text):
        return "Other"
    if EXTRA_RX["Swimwear"].search(text):
        return "Swimwear"
    if EXTRA_RX["Footwear"].search(text):
        return "Footwear"
    if EXTRA_RX["Activewear"].search(text):
        return "Activewear"
    return "Other"

mask_other = all_data["category_group"].eq("Other")
all_data.loc[mask_other, "category_group"] = all_data.loc[mask_other, "_text"].apply(recategorize_other)

# 4) Check the new distribution and preview any remaining "Other"
print(all_data["category_group"].value_counts(dropna=False))

others_left = all_data.loc[all_data["category_group"].eq("Other"),
                           ["product_name","category","product_type","brand","source_file"]].head(30)
display(others_left)

# 5) (optional) Save updated master
all_data.drop(columns="_text", inplace=True, errors="ignore")
all_data.to_csv("all_products_cleaned_master_with_groups.csv", index=False)
print("Saved: all_products_cleaned_master_with_groups.csv")


category_group
Other         6518
Activewear     479
Dresses        270
Outerwear       79
Footwear        25
Swimwear         4
Name: count, dtype: int64


Unnamed: 0,product_name,category,product_type,brand,source_file
90,Scholar Knit High-Waist Cargo Pant,,,Alo Yoga,alo_yoga_products_cleaned.csv
111,Notable Beanie,,,Alo Yoga,alo_yoga_products_cleaned.csv
114,Fresh Mini Scrunchie (3-Pack),,,Alo Yoga,alo_yoga_products_cleaned.csv
115,Performance Fleece Gloves,,,Alo Yoga,alo_yoga_products_cleaned.csv
116,Performance Fleece Beanie,,,Alo Yoga,alo_yoga_products_cleaned.csv
130,Warrior Mat,,,Alo Yoga,alo_yoga_products_cleaned.csv
155,District Trucker Hat,,,Alo Yoga,alo_yoga_products_cleaned.csv
157,Airlift Solar Visor,,,Alo Yoga,alo_yoga_products_cleaned.csv
165,Ribbed Sea Coast Long Sleeve,,,Alo Yoga,alo_yoga_products_cleaned.csv
177,Cotton Real Deal Button Up Long Sleeve,,,Alo Yoga,alo_yoga_products_cleaned.csv


Saved: all_products_cleaned_master_with_groups.csv


In [28]:
# --- Make sure we have a searchable text field ---
import re
import pandas as pd
import numpy as np

if "_text" not in all_data.columns:
    def _join(r):
        return " ".join([
            str(r.get("product_name","")),
            str(r.get("category","")),
            str(r.get("product_type",""))
        ]).lower()
    all_data["_text"] = all_data.apply(_join, axis=1)

# --- 6-bucket regex (rich, plural-aware, hyphen-aware) ---
RX = {
    "Dresses": re.compile(
        r"\b(dress|dresses)\b|"
        r"\b(mini|midi|maxi)\b.*\bdress(es)?\b|"
        r"\bdress(es)?\b.*\b(mini|midi|maxi)\b",
        re.I
    ),
    "Swimwear": re.compile(
        r"\b(bikini|bikini set[s]?|one[-\s]?piece[s]?|swimsuit[s]?|cover[-\s]?up|coverup|cover[-\s]?ups|coverups)\b",
        re.I
    ),
    "Outerwear": re.compile(
        r"\b(jacket|jackets|coat|coats|bomber|bombers|blazer|blazers|"
        r"trench|trench\s*coat|trenchcoat|parka|puffer|windbreaker|"
        r"shacket|gilet|vest|vests)\b",
        re.I
    ),
    "Footwear": re.compile(
        r"\b(boot|boots|sneaker|sneakers|runner|runners|ballerina|ballerinas|flat|flats|loafer|loafers|sandal|sandals)\b",
        re.I
    ),
    "Activewear": re.compile(
        r"\b(legging|leggings|jogger|joggers|pant|pants|capri|capris|"
        r"short|shorts|skirt|skirts|skort|skorts|"
        r"tee|tees|t-?shirt|t-?shirts|"
        r"tank|tanks|long\s*sleeve|crewneck|button\s*down|henley|"
        r"bra|bras|sports?\s*bra|sports?\s*bras|"
        r"hoodie|hoodies|sweater|sweaters|sweatshirt|sweatshirts|pullover|pullovers|"
        r"cardigan|cardigans)\b",
        re.I
    ),
    "Accessories": re.compile(
        r"\b(beanie|beanies|hat|hats|cap|caps|visor|visors|"
        r"headband|headbands|scrunchie|scrunchies|"
        r"glove|gloves|mittens?|scarf|scarves|belt|belts|"
        r"clip|clips|claw\s*clip|claw\s*clips|"
        r"mat|mats|yoga\s*mat|"
        r"bag|bags|tote|totes|backpack|backpacks|duffel|duffle|"
        r"water\s*bottle|bottle|towel|socks?)\b",
        re.I
    ),
}

# Priority: put the *most specific apparel types* ahead of catch-all (Accessories)
PRIORITY = ["Dresses", "Swimwear", "Outerwear", "Footwear", "Activewear", "Accessories"]

def classify6(text: str) -> str:
    text = text or ""
    for grp in PRIORITY:
        if RX[grp].search(text):
            return grp
    return "Other"

all_data["category_group"] = all_data["_text"].apply(classify6)

# --- Results & quick debug peek ---
print(all_data["category_group"].value_counts(dropna=False))

others_sample = all_data.loc[all_data["category_group"].eq("Other"),
                             ["product_name","category","product_type","brand","source_file"]].head(25)
display(others_sample)

# --- (optional) export per bucket ---
out_dir = Path("by_category_groups_6")
out_dir.mkdir(exist_ok=True)
for grp, g in all_data.groupby(all_data["category_group"].fillna("Other")):
    g.drop(columns=["_text"], errors="ignore").to_csv(out_dir / f"{grp}.csv", index=False)

# Save full master with the 6-group label
all_data.drop(columns=["_text"], errors="ignore").to_csv("all_products_cleaned_master_6groups.csv", index=False)
print("Saved:", "all_products_cleaned_master_6groups.csv", "and", str(out_dir))


category_group
Other          6486
Activewear      440
Dresses         270
Outerwear        82
Accessories      72
Footwear         21
Swimwear          4
Name: count, dtype: int64


Unnamed: 0,product_name,category,product_type,brand,source_file
10,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
13,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
17,Suit Up Trouser (Regular),,,Alo Yoga,alo_yoga_products_cleaned.csv
27,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
28,Serenity Wide Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
43,Accolade Straight Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
47,Serenity Wide Leg Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
57,Accolade Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
84,Accolade Sweatpant,,,Alo Yoga,alo_yoga_products_cleaned.csv
131,Suit Up Trouser (Regular),,,Alo Yoga,alo_yoga_products_cleaned.csv


Saved: all_products_cleaned_master_6groups.csv and by_category_groups_6


In [30]:
import pandas as pd, numpy as np, re, unicodedata

# --------- 1) Build a clean text field (fresh) ----------
def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    # unicode normalize (fix curly quotes etc.)
    s = unicodedata.normalize("NFKC", s)
    s = s.lower()
    # replace any non-letter/number with a space
    s = re.sub(r"[^a-z0-9]+", " ", s)
    # collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

def row_text(r):
    return normalize_text(" ".join([
        str(r.get("product_name","")),
        str(r.get("category","")),
        str(r.get("product_type",""))
    ]))

all_data["_t"] = all_data.apply(row_text, axis=1)

# --------- 2) Keyword sets (bigger net) ----------
# Keep Accessories separate (your Option B)
PAT = {
    "Dresses": [
        "dress", "mini dress", "midi dress", "maxi dress"
    ],
    "Swimwear": [
        "bikini", "one piece", "swimsuit", "coverup", "cover up"
    ],
    "Outerwear": [
        "jacket", "coat", "bomber", "blazer", "trench", "parka", "puffer",
        "windbreaker", "shacket", "gilet", "vest"
    ],
    "Footwear": [
        "boot", "sneaker", "runner", "ballerina", "flat", "loafer", "sandal", "slipper"
    ],
    "Activewear": [
        # bottoms
        "legging", "jogger", "sweatpant", "pant", "trouser", "capri", "short", "skirt", "skort",
        # tops
        "tee", "t shirt", "tank", "long sleeve", "henley", "polo", "button down", "crewneck",
        # bras/sweaters
        "bra", "sports bra", "hoodie", "sweater", "sweatshirt", "pullover", "cardigan",
        # 1-piece garments often used athleisure
        "onesie", "jumpsuit", "romper", "unitard", "bodysuit",
        # under/innerwear commonly bundled as active
        "thong", "brief", "boxer"
    ],
    "Accessories": [
        "beanie", "hat", "cap", "visor", "headband", "scrunchie",
        "glove", "mitten", "scarf", "belt", "clip", "claw clip",
        "mat", "yoga mat", "bag", "tote", "backpack", "duffel", "duffle",
        "water bottle", "bottle", "towel", "sock"
    ],
}

# Generate plural + simple variants automatically (e.g., "pant" -> "pants")
def expand_terms(terms):
    out = set()
    for t in terms:
        out.add(t)
        if " " not in t:  # single word: add plurals
            if not t.endswith("s"): out.add(t + "s")
        # also add hyphenless variants (t-shirt -> t shirt)
        out.add(t.replace("-", " "))
    return sorted(out)

PAT = {k: expand_terms(v) for k, v in PAT.items()}

# Compile regex per group as OR of tokens (simple substring match over normalized text)
RX = {k: re.compile(r"(?:^| )(" + "|".join(map(re.escape, v)) + r")(?: |$)") for k, v in PAT.items()}

# priority so specific apparel wins before accessories
PRIORITY = ["Dresses", "Swimwear", "Outerwear", "Footwear", "Activewear", "Accessories"]

def classify(text: str) -> str:
    for grp in PRIORITY:
        if RX[grp].search(text):
            return grp
    return "Other"

all_data["category_group"] = all_data["_t"].apply(classify)

# --------- 3) Results + a peek at remaining Others ----------
print(all_data["category_group"].value_counts(dropna=False))
display(all_data.loc[all_data["category_group"].eq("Other"),
                     ["product_name","category","product_type","brand","source_file"]].head(25))

# (optional) save
all_data.drop(columns=["_t"], inplace=True)
all_data.to_csv("all_products_cleaned_master_6groups.csv", index=False)
print("Saved: all_products_cleaned_master_6groups.csv")


category_group
Other          6663
Activewear      508
Outerwear        82
Accessories      72
Footwear         23
Dresses          23
Swimwear          4
Name: count, dtype: int64


Unnamed: 0,product_name,category,product_type,brand,source_file
355,Untangled Hair Tie 6-Pack,,,Alo Yoga,alo_yoga_products_cleaned.csv
395,Unisex It Slide,,,Alo Yoga,alo_yoga_products_cleaned.csv
562,Unisex It Slide,,,Alo Yoga,alo_yoga_products_cleaned.csv
566,Sheer Glow Boyshort,,,Alo Yoga,alo_yoga_products_cleaned.csv
631,Unisex It Slide,,,Alo Yoga,alo_yoga_products_cleaned.csv
661,Cashmere Jet Set Crew,,,Alo Yoga,alo_yoga_products_cleaned.csv
670,Sheer Glow Boyshort,,,Alo Yoga,alo_yoga_products_cleaned.csv
698,Glow Wristband (2-Pack),,,Alo Yoga,alo_yoga_products_cleaned.csv
720,,Clothing/Dresses,,ALTAR'D STATE,altardstate_products_cleaned.csv
721,,Clothing/Dresses,,ALTAR'D STATE,altardstate_products_cleaned.csv


Saved: all_products_cleaned_master_6groups.csv


In [32]:
# --- Stage 0: helpers & normalized fields ---
import re, unicodedata
import pandas as pd
import numpy as np

def norm(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    # turn non-alphanum into space (so "Clothing/Dresses" -> "clothing dresses")
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def row_text(r):
    return norm(" ".join([
        str(r.get("product_name","")),
        str(r.get("category","")),
        str(r.get("product_type",""))
    ]))

all_data["_cat_norm"] = all_data["category"].apply(norm) if "category" in all_data.columns else ""
all_data["_text"]     = all_data.apply(row_text, axis=1)

# --- Stage 1: category-first mapping (deterministic) ---
CAT_MAP = [
    ("Dresses",   ["dress", "dresses"]),
    ("Swimwear",  ["bikini", "one piece", "swimsuit", "coverup", "cover up"]),
    ("Outerwear", ["jacket", "coat", "blazer", "trench", "parka", "puffer", "windbreaker", "shacket", "gilet", "vest"]),
    ("Footwear",  ["boot", "sneaker", "runner", "sandal", "slide", "loafer", "flat", "ballerina"]),
    # Activewear last here (it’s broader)
    ("Activewear",["legging", "jogger", "sweatpant", "pant", "trouser", "capri", "short", "skirt", "skort",
                   "tee", "t shirt", "tank", "long sleeve", "henley", "polo", "button down", "crewneck",
                   "bra", "sports bra", "hoodie", "sweater", "sweatshirt", "pullover", "cardigan",
                   "onesie", "jumpsuit", "romper", "unitard", "bodysuit", "thong", "brief", "boxer"]),
    ("Accessories",["beanie", "hat", "cap", "visor", "headband", "scrunchie", "glove", "mitten", "scarf", "belt",
                    "clip", "claw clip", "wristband", "hair tie", "mat", "yoga mat", "bag", "tote", "backpack",
                    "duffel", "duffle", "water bottle", "bottle", "towel", "sock"]),
]

def contains_any(text, keys):
    return any(f" {k} " in f" {text} " or f" {k}s " in f" {text} " for k in keys)

# seed with NaN, fill from category first
all_data["category_group"] = np.nan
for grp, keys in CAT_MAP:
    mask = all_data["category_group"].isna() & all_data["_cat_norm"].apply(lambda t: contains_any(t, keys))
    all_data.loc[mask, "category_group"] = grp

# --- Stage 2: regex fallback on product_name + category + product_type ---
PAT = {
    "Dresses":     ["dress", "mini dress", "midi dress", "maxi dress"],
    "Swimwear":    ["bikini", "one piece", "swimsuit", "coverup", "cover up"],
    "Outerwear":   ["jacket", "coat", "bomber", "blazer", "trench", "parka", "puffer", "windbreaker", "shacket", "gilet", "vest"],
    "Footwear":    ["boot", "sneaker", "runner", "ballerina", "flat", "loafer", "sandal", "slide", "slipper"],
    "Activewear":  ["legging", "jogger", "sweatpant", "pant", "trouser", "capri", "short", "skirt", "skort",
                    "tee", "t shirt", "tank", "long sleeve", "henley", "polo", "button down", "crewneck",
                    "bra", "sports bra", "hoodie", "sweater", "sweatshirt", "pullover", "cardigan",
                    "onesie", "jumpsuit", "romper", "unitard", "bodysuit", "thong", "brief", "boxer", "boyshort"],
    "Accessories": ["beanie", "hat", "cap", "visor", "headband", "scrunchie", "glove", "mitten", "scarf", "belt",
                    "clip", "claw clip", "wristband", "hair tie", "mat", "yoga mat", "bag", "tote", "backpack",
                    "duffel", "duffle", "water bottle", "bottle", "towel", "sock", "wrist band"],
}

# auto-expand simple plurals and hyphenless
def expand(terms):
    out = set()
    for t in terms:
        out.add(t)
        if " " not in t and not t.endswith("s"): out.add(t + "s")
        out.add(t.replace("-", " "))
    return sorted(out)

RX = {g: re.compile(r"(?:^| )(?:%s)(?: |$)" % "|".join(map(re.escape, expand(v)))) for g, v in PAT.items()}
PRIORITY = ["Dresses", "Swimwear", "Outerwear", "Footwear", "Activewear", "Accessories"]

def classify_fallback(text):
    for g in PRIORITY:
        if RX[g].search(text):
            return g
    return np.nan

mask_unlabeled = all_data["category_group"].isna()
all_data.loc[mask_unlabeled, "category_group"] = all_data.loc[mask_unlabeled, "_text"].apply(classify_fallback)

# Final fill: anything still NA becomes Other
all_data["category_group"] = all_data["category_group"].fillna("Other")

# Results
print(all_data["category_group"].value_counts(dropna=False))

# Look at 30 remaining 'Other' examples to extend terms if needed
display(all_data.loc[all_data["category_group"].eq("Other"),
                     ["product_name","category","product_type","brand","source_file"]].head(30))


category_group
Other          6409
Activewear      510
Dresses         270
Outerwear        82
Accessories      74
Footwear         26
Swimwear          4
Name: count, dtype: int64


  all_data.loc[mask, "category_group"] = grp


Unnamed: 0,product_name,category,product_type,brand,source_file
661,Cashmere Jet Set Crew,,,Alo Yoga,alo_yoga_products_cleaned.csv
967,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
968,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
969,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
970,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
971,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
972,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
973,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
974,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv
975,,Clothing,,ALTAR'D STATE,altardstate_products_cleaned.csv


In [34]:
# === FIX & RECLASSIFY (6 buckets) ===
import pandas as pd, numpy as np, re, unicodedata

# 0) Harmonize product_name from alternatives and coalesce empties
name_alts = [c for c in ["product_name","name","title"] if c in all_data.columns]
if name_alts:
    all_data["product_name"] = None
    for c in name_alts:
        all_data["product_name"] = all_data["product_name"].fillna(all_data[c])
# strip whitespace
all_data["product_name"] = all_data["product_name"].astype(str).str.strip().replace({"None": np.nan, "nan": np.nan})

# 1) Normalization helpers
def norm(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)       # non-alnum -> space
    s = re.sub(r"\s+", " ", s).strip()
    return s

def row_text(r):
    return norm(" ".join([
        str(r.get("product_name","")),
        str(r.get("category","")),
        str(r.get("product_type","")),
    ]))

all_data["_cat_norm"] = all_data.get("category", pd.Series("", index=all_data.index)).apply(norm)
all_data["_text"]     = all_data.apply(row_text, axis=1)

# 2) Build plural-aware regex quickly
def plural_rx(word: str) -> str:
    """
    Turn 'dress' -> 'dress(?:es)?'; 'short' -> 'shorts?'; 'capri' -> 'capris?'
    Simple rule that works for our vocab.
    """
    if word.endswith("ss"):   # dress -> dresses
        return rf"{re.escape(word)}(?:es)?"
    elif word.endswith("ch") or word.endswith("sh"):
        return rf"{re.escape(word)}(?:es)?"
    elif word.endswith("y"):  # not handling consonant->ies here; we don't need it
        return rf"{re.escape(word)}s?"
    else:
        return rf"{re.escape(word)}s?"

def build_or_rx(words):
    tokens = []
    for w in words:
        w = w.replace("-", " ")
        if " " in w:
            # phrases: just allow single space between tokens
            tokens.append(re.escape(w))
        else:
            tokens.append(plural_rx(w))
    return re.compile(rf"(?:^|\s)(?:{'|'.join(tokens)})(?:\s|$)", re.I)

# 3) Stage-1: category-first deterministic mapping
CAT_RULES = {
    "Dresses":   ["dress"],  # matches dress/dresses
    "Swimwear":  ["bikini", "one piece", "swimsuit", "coverup", "cover up"],
    "Outerwear": ["jacket","coat","bomber","blazer","trench","parka","puffer","windbreaker","shacket","gilet","vest"],
    "Footwear":  ["boot","sneaker","runner","sandal","slide","loafer","flat","ballerina"],
    "Activewear":["legging","jogger","sweatpant","pant","trouser","capri","short","skirt","skort",
                  "tee","t shirt","tank","long sleeve","henley","polo","button down","crewneck",
                  "bra","sports bra","hoodie","sweater","sweatshirt","pullover","cardigan",
                  "onesie","jumpsuit","romper","unitard","bodysuit","thong","brief","boxer","boyshort"],
    "Accessories":["beanie","hat","cap","visor","headband","scrunchie","glove","mitten","scarf","belt",
                   "clip","claw clip","wristband","hair tie","mat","yoga mat","bag","tote","backpack",
                   "duffel","duffle","water bottle","bottle","towel","sock"],
}
CAT_RX = {k: build_or_rx(v) for k,v in CAT_RULES.items()}

# make sure target column accepts strings
all_data["category_group"] = pd.Series(pd.NA, index=all_data.index, dtype="object")

for grp in ["Dresses","Swimwear","Outerwear","Footwear","Activewear","Accessories"]:
    mask = all_data["category_group"].isna() & all_data["_cat_norm"].str.len().gt(0) & all_data["_cat_norm"].apply(lambda t: bool(CAT_RX[grp].search(t)))
    all_data.loc[mask, "category_group"] = grp

# 4) Stage-2: fallback on product_name + category + product_type combined text
FALLBACK_RULES = CAT_RULES  # reuse vocab
FB_RX = {k: build_or_rx(v) for k,v in FALLBACK_RULES.items()}
PRIORITY = ["Dresses","Swimwear","Outerwear","Footwear","Activewear","Accessories"]

def fallback_label(t: str):
    for grp in PRIORITY:
        if FB_RX[grp].search(t):
            return grp
    return pd.NA

mask_unlabeled = all_data["category_group"].isna()
all_data.loc[mask_unlabeled, "category_group"] = all_data.loc[mask_unlabeled, "_text"].apply(fallback_label)

# final fill
all_data["category_group"] = all_data["category_group"].fillna("Other")

print(all_data["category_group"].value_counts(dropna=False).sort_values(ascending=False).to_string())
display(all_data.loc[all_data["category_group"].eq("Other"), ["product_name","category","product_type","brand","source_file"]].head(20))


category_group
Activewear     2523
Dresses        2392
Other          1968
Outerwear       249
Accessories     134
Swimwear         76
Footwear         33


Unnamed: 0,product_name,category,product_type,brand,source_file
0,,,,Alo Yoga,alo_yoga_products_cleaned.csv
1,,,,Alo Yoga,alo_yoga_products_cleaned.csv
2,,,,Alo Yoga,alo_yoga_products_cleaned.csv
3,,,,Alo Yoga,alo_yoga_products_cleaned.csv
4,,,,Alo Yoga,alo_yoga_products_cleaned.csv
5,,,,Alo Yoga,alo_yoga_products_cleaned.csv
6,,,,Alo Yoga,alo_yoga_products_cleaned.csv
7,,,,Alo Yoga,alo_yoga_products_cleaned.csv
8,,,,Alo Yoga,alo_yoga_products_cleaned.csv
9,,,,Alo Yoga,alo_yoga_products_cleaned.csv


In [36]:
import pandas as pd

df_alo = pd.read_csv("alo_yoga_products_cleaned.csv")
print(df_alo.columns.tolist())
print(df_alo.head(10))


[' price(USD) ', 'image_url', 'product_url', 'product_name', 'current_color_from_name', 'available_colors_cleaned']
    price(USD)                                           image_url  \
0          68.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
1         128.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
2          98.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
3         228.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
4          68.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
5         128.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
6         118.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
7          68.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
8          58.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   
9         128.0  https://cdn.shopify.com/s/files/1/2185/2813/fi...   

                                         product_url  \
0  https://www.aloyoga.com/products/w9679r-airlif...   
1

In [38]:
import pandas as pd

df_alo = pd.read_csv("alo_yoga_products_cleaned.csv")

print("Alo columns:", list(df_alo.columns))
print("product_name nulls:", df_alo["product_name"].isna().sum())
print("product_name examples:", df_alo["product_name"].dropna().head(10).tolist())


Alo columns: [' price(USD) ', 'image_url', 'product_url', 'product_name', 'current_color_from_name', 'available_colors_cleaned']
product_name nulls: 0
product_name examples: ['Airlift Intrigue Bra', '7/8 High-Waist Airlift Legging', 'Work It Mini Skirt', 'Work It Bomber', 'Airlift Intrigue Bra', '7/8 High-Waist Airlift Legging', 'Cropped Serenity Coverup', 'Accolade Short', 'Ribbed Sea Coast Cropped Short Sleeve Tee', 'Accolade Crew Neck Pullover']


In [42]:
 # === Merge + Safe product_name + 6-Group Categorization (one cell) ===
import pandas as pd, numpy as np, re, unicodedata
from pathlib import Path

# --------------------------
# CONFIG: paths & filenames
# --------------------------
DATA_DIR = Path("./")  # folder containing your *_cleaned.csv files
FILES = [
    "alo_yoga_products_cleaned.csv",
    "altardstate_products_cleaned.csv",
    "cupshe_products_cleaned.csv",
    "edikted_products_cleaned.csv",
    "gymshark_products_cleaned.csv",
    "nakd_products_cleaned.csv",
    "princess_polly_cleaned.csv",
    "vuori_products_cleaned.csv",
]
FILE_BRANDS = {
    "alo_yoga_products_cleaned.csv": "Alo Yoga",
    "altardstate_products_cleaned.csv": "ALTAR'D STATE",
    "cupshe_products_cleaned.csv": "Cupshe",
    "edikted_products_cleaned.csv": "Edikted",
    "gymshark_products_cleaned.csv": "Gymshark",
    "nakd_products_cleaned.csv": "NA-KD",
    "princess_polly_cleaned.csv": "Princess Polly",
    "vuori_products_cleaned.csv": "Vuori",
}
OUT_MASTER = "all_products_cleaned_master_6groups.csv"
OUT_DIR_BUCKETS = Path("by_category_groups_6")
OUT_DIR_BUCKETS.mkdir(exist_ok=True)

# ---------------------------------
# Helpers: column canon & text norm
# ---------------------------------
def canon_colname(c: str) -> str:
    raw = unicodedata.normalize("NFKC", str(c)).strip().lower()
    # common cleanups
    raw = raw.replace("/", " ").replace("\\", " ")
    raw = re.sub(r"\s+", " ", raw)
    # parenthesis to underscores
    tmp = raw.replace("(", "_").replace(")", "_")
    tmp = re.sub(r"[^a-z0-9]+", "_", tmp).strip("_")
    # map common variants
    aliases = {
        "productname": "product_name",
        "product_name": "product_name",
        "name": "product_name",
        "title": "product_name",
        "price": "price",
        "price_usd": "price",
        "priceusd": "price",
        "price_us": "price",
        "price_usd_": "price",
        "price_usd__": "price",
        "image_url": "image_url",
        "product_url": "url",
        "url": "url",
        "link": "url",
        "category": "category",
        "product_category": "category",
        "product_type": "product_type",
        "type": "product_type",
        "categories": "category",
        "taxonomy": "category",
        "sku": "sku",
        "id": "sku",
        "product_id": "sku",
    }
    # special case like "price_usd" embedded
    if "price_usd" in tmp or raw.strip() == "price(usd)":
        return "price"
    return aliases.get(tmp, tmp)

def norm_text(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# --------------------------
# 1) Read & merge all files
# --------------------------
frames = []
for fname in FILES:
    df = pd.read_csv(DATA_DIR / fname)
    df.columns = [canon_colname(c) for c in df.columns]
    # after: df = pd.read_csv(DATA_DIR / fname)
# after: df.columns = [canon_colname(c) for c in df.columns]

# --- ensure unique columns by coalescing duplicates ---
if df.columns.duplicated().any():
    # build a new frame with one column per name
    cols = df.columns.tolist()
    from collections import defaultdict
    groups = defaultdict(list)
    for i, c in enumerate(cols):
        groups[c].append(i)

    new_df = pd.DataFrame(index=df.index)
    for c, idxs in groups.items():
        if len(idxs) == 1:
            new_df[c] = df.iloc[:, idxs[0]]
        else:
            # coalesce left->right: take first non-empty
            s = df.iloc[:, idxs[0]]
            for j in idxs[1:]:
                cand = df.iloc[:, j]
                # treat "", "nan", "None" as empty
                empty = s.isna() | (s.astype(str).str.strip().isin(["", "nan", "None"]))
                s = s.where(~empty, cand)
            new_df[c] = s
    df = new_df
# -----------------------------------------------

    df["source_file"] = fname
    # brand fallback from filename if needed
    if "brand" not in df.columns or df["brand"].isna().all():
        df["brand"] = FILE_BRANDS.get(fname, "")
    frames.append(df)

all_data = pd.concat(frames, ignore_index=True, sort=False)

# -------------------------------------------
# 2) SAFE coalesce for product_name (no wipe)
# -------------------------------------------
if "product_name" not in all_data.columns:
    all_data["product_name"] = pd.NA

for alt in ["name", "title"]:
    if alt in all_data.columns:
        mask = all_data["product_name"].isna() | (all_data["product_name"].astype(str).str.strip() == "")
        all_data.loc[mask, "product_name"] = all_data.loc[mask, alt]

# last-resort: derive from URL slug if still missing
if "url" in all_data.columns:
    mask = all_data["product_name"].isna()
    all_data.loc[mask, "product_name"] = (
        all_data.loc[mask, "url"]
        .astype(str)
        .str.extract(r"/([^/?#]+)(?:\?.*)?$")[0]
        .str.replace("-", " ", regex=False)
        .str.replace("_", " ", regex=False)
    )

# clean empties to NaN
all_data["product_name"] = (
    all_data["product_name"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA, "nan": pd.NA, "None": pd.NA})
)

# -------------------------------
# 3) Build normalized text fields
# -------------------------------
all_data["_cat_norm"] = all_data.get("category", pd.Series("", index=all_data.index)).apply(norm_text)
all_data["_text"] = (
    all_data.get("product_name", pd.Series("", index=all_data.index)).apply(norm_text)
    + " " + all_data.get("category", pd.Series("", index=all_data.index)).apply(norm_text)
    + " " + all_data.get("product_type", pd.Series("", index=all_data.index)).apply(norm_text)
).str.strip()

# -------------------------------------------------
# 4) Stage-1: category-first labeling (deterministic)
# -------------------------------------------------
def plural_rx(word: str) -> str:
    # very simple pluralization that covers our vocab well
    if word.endswith(("ss", "ch", "sh")):
        return rf"{re.escape(word)}(?:es)?"
    elif word.endswith("y"):
        return rf"{re.escape(word)}s?"   # good enough for our tokens
    else:
        return rf"{re.escape(word)}s?"

def build_or_rx(words):
    tokens = []
    for w in words:
        w = w.replace("-", " ")
        if " " in w:  # phrases
            tokens.append(re.escape(w))
        else:
            tokens.append(plural_rx(w))
    return re.compile(rf"(?:^|\s)(?:{'|'.join(tokens)})(?:\s|$)", re.I)

CAT_RULES = {
    "Dresses":   ["dress"],
    "Swimwear":  ["bikini", "one piece", "swimsuit", "coverup", "cover up"],
    "Outerwear": ["jacket","coat","bomber","blazer","trench","parka","puffer","windbreaker","shacket","gilet","vest"],
    "Footwear":  ["boot","sneaker","runner","sandal","slide","loafer","flat","ballerina","slipper"],
    "Activewear":["legging","jogger","sweatpant","pant","trouser","capri","short","skirt","skort",
                  "tee","t shirt","tank","long sleeve","henley","polo","button down","crewneck",
                  "bra","sports bra","hoodie","sweater","sweatshirt","pullover","cardigan",
                  "onesie","jumpsuit","romper","unitard","bodysuit","thong","brief","boxer","boyshort"],
    "Accessories":["beanie","hat","cap","visor","headband","scrunchie","glove","mitten","scarf","belt",
                   "clip","claw clip","wristband","hair tie","mat","yoga mat","bag","tote","backpack",
                   "duffel","duffle","water bottle","bottle","towel","sock"],
}
CAT_RX = {k: build_or_rx(v) for k, v in CAT_RULES.items()}

all_data["category_group"] = pd.Series(pd.NA, index=all_data.index, dtype="object")
priority = ["Dresses","Swimwear","Outerwear","Footwear","Activewear","Accessories"]  # specific -> broad

for grp in priority:
    m = all_data["category_group"].isna() & all_data["_cat_norm"].str.len().gt(0) & all_data["_cat_norm"].apply(lambda t: bool(CAT_RX[grp].search(t)))
    all_data.loc[m, "category_group"] = grp

# -----------------------------------------------------------
# 5) Stage-2: fallback on combined text (name/category/type)
# -----------------------------------------------------------
FB_RX = CAT_RX  # reuse same vocab
def fallback_label(t: str):
    for grp in priority:
        if FB_RX[grp].search(t):
            return grp
    return pd.NA

mask_unlabeled = all_data["category_group"].isna()
all_data.loc[mask_unlabeled, "category_group"] = all_data.loc[mask_unlabeled, "_text"].apply(fallback_label)

# final fill
all_data["category_group"] = all_data["category_group"].fillna("Other")

# --------------------------------
# 6) Save & basic sanity reporting
# --------------------------------
all_data.drop(columns=["_text","_cat_norm"], inplace=True, errors="ignore")
all_data.to_csv(OUT_MASTER, index=False)

print("Saved master ->", OUT_MASTER)
print("\nCategory counts:\n", all_data["category_group"].value_counts(dropna=False).to_string())

print("\nMissing product_name by source:")
print(all_data.groupby("source_file")["product_name"].apply(lambda s: s.isna().sum()).to_string())

# Export one CSV per bucket (optional but handy)
for grp, g in all_data.groupby(all_data["category_group"].fillna("Other")):
    g.to_csv(OUT_DIR_BUCKETS / f"{grp}.csv", index=False)

# Peek at any remaining 'Other'
others_preview = all_data.loc[all_data["category_group"].eq("Other"), ["product_name","category","product_type","brand","source_file"]].head(20)
print("\nSample of remaining 'Other':")
print(others_preview.to_string(index=False))


ValueError: No objects to concatenate

In [44]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path("./")  # adjust if your CSVs are elsewhere

print("Dir:", DATA_DIR.resolve())
print("Files in dir:")
for p in sorted(DATA_DIR.glob("*.csv")):
    print(" -", p.name)

expected = [
    "alo_yoga_products_cleaned.csv",
    "altardstate_products_cleaned.csv",
    "cupshe_products_cleaned.csv",
    "edikted_products_cleaned.csv",
    "gymshark_products_cleaned.csv",
    "nakd_products_cleaned.csv",
    "princess_polly_cleaned.csv",
    "vuori_products_cleaned.csv",
]

missing = [f for f in expected if not (DATA_DIR / f).exists()]
print("\nMissing from expected list:", missing)

# try reading each and report shape
for f in expected:
    p = DATA_DIR / f
    if p.exists():
        try:
            df = pd.read_csv(p, nrows=3)
            print(f"OK  {f:35s} -> shape sample {df.shape}")
        except Exception as e:
            print(f"ERR {f:35s} -> {type(e).__name__}: {e}")


Dir: /Users/VATSAL/Clozyt recommender system
Files in dir:
 - all_products_cleaned_master.csv
 - all_products_cleaned_master_6groups.csv
 - all_products_cleaned_master_with_groups.csv
 - alo_yoga_products.csv
 - alo_yoga_products_cleaned.csv
 - altardstate_products.csv
 - altardstate_products_cleaned.csv
 - cupshe_products.csv
 - cupshe_products_cleaned.csv
 - edikted_products.csv
 - edikted_products_cleaned.csv
 - gymshark_products.csv
 - gymshark_products_cleaned.csv
 - nakd_products.csv
 - nakd_products_cleaned.csv
 - princess_polly.csv
 - princess_polly_cleaned.csv
 - vuori_products.csv
 - vuori_products_cleaned.csv

Missing from expected list: []
OK  alo_yoga_products_cleaned.csv       -> shape sample (3, 6)
OK  altardstate_products_cleaned.csv    -> shape sample (3, 13)
OK  cupshe_products_cleaned.csv         -> shape sample (3, 6)
OK  edikted_products_cleaned.csv        -> shape sample (3, 10)
OK  gymshark_products_cleaned.csv       -> shape sample (3, 9)
OK  nakd_products_clean

In [48]:
# === MERGE (glob) + SAFE product_name + 6-GROUP CLASSIFICATION ===
import pandas as pd, numpy as np, re, unicodedata
from pathlib import Path
from collections import defaultdict

DATA_DIR = Path("./")
OUT_MASTER = "all_products_cleaned_master_6groups.csv"
OUT_DIR = Path("by_category_groups_6")
OUT_DIR.mkdir(exist_ok=True)

# Map filename stem -> brand (fallback if 'brand' column missing/empty)
BRAND_MAP = {
    "alo_yoga_products_cleaned": "Alo Yoga",
    "altardstate_products_cleaned": "ALTAR'D STATE",
    "cupshe_products_cleaned": "Cupshe",
    "edikted_products_cleaned": "Edikted",
    "gymshark_products_cleaned": "Gymshark",
    "nakd_products_cleaned": "NA-KD",
    "princess_polly_cleaned": "Princess Polly",
    "vuori_products_cleaned": "Vuori",
}

def canon_colname(c: str) -> str:
    raw = unicodedata.normalize("NFKC", str(c)).strip().lower()
    raw = raw.replace("/", " ").replace("\\", " ")
    raw = re.sub(r"\s+", " ", raw)
    tmp = raw.replace("(", "_").replace(")", "_")
    tmp = re.sub(r"[^a-z0-9]+", "_", tmp).strip("_")
    aliases = {
        "productname":"product_name","product_name":"product_name","name":"product_name","title":"product_name",
        "price":"price","price_usd":"price","priceusd":"price","price_us":"price",
        "image_url":"image_url","product_url":"url","url":"url","link":"url",
        "category":"category","product_category":"category","categories":"category","taxonomy":"category",
        "product_type":"product_type","type":"product_type",
        "sku":"sku","id":"sku","product_id":"sku",
    }
    if "price_usd" in tmp or raw.strip()=="price(usd)":
        return "price"
    return aliases.get(tmp, tmp)

frames, loaded = [], []
for p in sorted(DATA_DIR.glob("*_cleaned.csv")):
    df = pd.read_csv(p)
    df.columns = [canon_colname(c) for c in df.columns]

    # coalesce duplicate-named columns (keep first non-empty)
    if df.columns.duplicated().any():
        groups = defaultdict(list)
        for i, c in enumerate(df.columns):
            groups[c].append(i)
        new_df = pd.DataFrame(index=df.index)
        for c, idxs in groups.items():
            if len(idxs)==1:
                new_df[c] = df.iloc[:, idxs[0]]
            else:
                s = df.iloc[:, idxs[0]]
                for j in idxs[1:]:
                    cand = df.iloc[:, j]
                    empty = s.isna() | (s.astype(str).str.strip().isin(["", "nan", "None"]))
                    s = s.where(~empty, cand)
                new_df[c] = s
        df = new_df

    df["source_file"] = p.name
    if "brand" not in df.columns or df["brand"].isna().all():
        df["brand"] = BRAND_MAP.get(p.stem, p.stem)

    frames.append(df)
    loaded.append(p.name)

assert frames, "No *_cleaned.csv files were loaded."
all_data = pd.concat(frames, ignore_index=True, sort=False)

# Ensure required columns exist in all_data
for c in ["product_name", "category", "product_type", "brand", "source_file"]:
    if c not in all_data.columns:
        all_data[c] = pd.NA

# SAFE coalesce for product_name (do not wipe existing values)
if "product_name" not in all_data.columns:
    all_data["product_name"] = pd.NA
for alt in ["name","title"]:
    if alt in all_data.columns:
        mask = all_data["product_name"].isna() | (all_data["product_name"].astype(str).str.strip()=="")
        all_data.loc[mask, "product_name"] = all_data.loc[mask, alt]
if "url" in all_data.columns:
    mask = all_data["product_name"].isna()
    all_data.loc[mask, "product_name"] = (
        all_data.loc[mask, "url"].astype(str)
        .str.extract(r"/([^/?#]+)(?:\?.*)?$")[0]
        .str.replace("-", " ", regex=False).str.replace("_", " ", regex=False)
    )
all_data["product_name"] = (
    all_data["product_name"].astype(str).str.strip()
    .replace({"": pd.NA, "nan": pd.NA, "None": pd.NA})
)

# --- normalized text for classification ---
def norm_text(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

all_data["_cat_norm"] = all_data.get("category", pd.Series("", index=all_data.index)).apply(norm_text)
all_data["_text"] = (
    all_data.get("product_name", pd.Series("", index=all_data.index)).apply(norm_text)
    + " " + all_data.get("category", pd.Series("", index=all_data.index)).apply(norm_text)
    + " " + all_data.get("product_type", pd.Series("", index=all_data.index)).apply(norm_text)
).str.strip()

# --- 6-bucket vocab & regex (category-first, then fallback) ---
def plural_rx(word: str) -> str:
    if word.endswith(("ss","ch","sh")): return rf"{re.escape(word)}(?:es)?"
    elif word.endswith("y"):           return rf"{re.escape(word)}s?"
    else:                              return rf"{re.escape(word)}s?"

def build_or_rx(words):
    toks=[]
    for w in words:
        w = w.replace("-", " ")
        toks.append(plural_rx(w) if " " not in w else re.escape(w))
    return re.compile(rf"(?:^|\s)(?:{'|'.join(toks)})(?:\s|$)", re.I)

CAT_RULES = {
    "Dresses":   ["dress"],
    "Swimwear":  ["bikini","one piece","swimsuit","coverup","cover up"],
    "Outerwear": ["jacket","coat","bomber","blazer","trench","parka","puffer","windbreaker","shacket","gilet","vest"],
    "Footwear":  ["boot","sneaker","runner","sandal","slide","loafer","flat","ballerina","slipper"],
    "Activewear":["legging","jogger","sweatpant","pant","trouser","capri","short","skirt","skort",
                  "tee","t shirt","tank","long sleeve","henley","polo","button down","crewneck",
                  "bra","sports bra","hoodie","sweater","sweatshirt","pullover","cardigan",
                  "onesie","jumpsuit","romper","unitard","bodysuit","thong","brief","boxer","boyshort"],
    "Accessories":["beanie","hat","cap","visor","headband","scrunchie","glove","mitten","scarf","belt",
                   "clip","claw clip","wristband","hair tie","mat","yoga mat","bag","tote","backpack",
                   "duffel","duffle","water bottle","bottle","towel","sock"],
}
RX = {k: build_or_rx(v) for k,v in CAT_RULES.items()}
PRIORITY = ["Dresses","Swimwear","Outerwear","Footwear","Activewear","Accessories"]

all_data["category_group"] = pd.Series(pd.NA, index=all_data.index, dtype="object")
for grp in PRIORITY:
    m = all_data["category_group"].isna() & all_data["_cat_norm"].str.len().gt(0) & all_data["_cat_norm"].apply(lambda t: bool(RX[grp].search(t)))
    all_data.loc[m, "category_group"] = grp

mask = all_data["category_group"].isna()
def fb(t: str):
    for grp in PRIORITY:
        if RX[grp].search(t): return grp
    return pd.NA
all_data.loc[mask, "category_group"] = all_data.loc[mask, "_text"].apply(fb)
all_data["category_group"] = all_data["category_group"].fillna("Other")

# --- save & report ---
all_data.drop(columns=["_text","_cat_norm"], errors="ignore", inplace=True)
all_data.to_csv(OUT_MASTER, index=False)
for grp, g in all_data.groupby(all_data["category_group"].fillna("Other")):
    g.to_csv(OUT_DIR / f"{grp}.csv", index=False)

print("Loaded files:", loaded)
print("Saved master ->", OUT_MASTER)
print("\nCategory counts:\n", all_data["category_group"].value_counts(dropna=False).to_string())
print("\nMissing product_name by source:")
print(all_data.groupby("source_file")["product_name"].apply(lambda s: s.isna().sum()).to_string())

# Safe preview: only select columns that actually exist
safe_cols = [c for c in ["product_name","category","product_type","brand","source_file"] if c in all_data.columns]
print("\nSample of any remaining 'Other':")
print(all_data.loc[all_data["category_group"].eq("Other"), safe_cols].head(20).to_string(index=False))


Loaded files: ['alo_yoga_products_cleaned.csv', 'altardstate_products_cleaned.csv', 'cupshe_products_cleaned.csv', 'edikted_products_cleaned.csv', 'gymshark_products_cleaned.csv', 'nakd_products_cleaned.csv', 'princess_polly_cleaned.csv', 'vuori_products_cleaned.csv']
Saved master -> all_products_cleaned_master_6groups.csv

Category counts:
 category_group
Activewear     3033
Dresses        2415
Other          1249
Outerwear       331
Accessories     208
Swimwear         80
Footwear         59

Missing product_name by source:
source_file
alo_yoga_products_cleaned.csv       0
altardstate_products_cleaned.csv    0
cupshe_products_cleaned.csv         0
edikted_products_cleaned.csv        0
gymshark_products_cleaned.csv       0
nakd_products_cleaned.csv           0
princess_polly_cleaned.csv          0
vuori_products_cleaned.csv          0

Sample of any remaining 'Other':
                    product_name category product_type         brand                      source_file
           Cashm

In [50]:
# --- Re-categorize current 'Other' with extra terms ---
import re, unicodedata
import pandas as pd
import numpy as np

def norm_text(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# build a searchable text for only 'Other' rows
mask_other = all_data["category_group"].eq("Other")
other_text = (
    all_data.loc[mask_other, "product_name"].apply(norm_text).fillna("") + " " +
    all_data.loc[mask_other, "category"].apply(norm_text).fillna("") + " " +
    all_data.loc[mask_other, "product_type"].apply(norm_text).fillna("")
).str.strip()

# NEW tokens:
# - Activewear: top(s), button up, mock neck, thermal, seamless
# - Outerwear: duster, kimono, coatigan
EXTRA = {
    "Activewear": re.compile(
        r"(?:^|\s)(top|tops|button up|mock neck|thermal|seamless)(?:\s|$)", re.I
    ),
    "Outerwear": re.compile(
        r"(?:^|\s)(duster|kimono|coatigan)(?:\s|$)", re.I
    ),
}

# priority: map Outerwear before Activewear (so "duster top" becomes Outerwear)
def recat(txt: str) -> str:
    if EXTRA["Outerwear"].search(txt):   return "Outerwear"
    if EXTRA["Activewear"].search(txt):  return "Activewear"
    return "Other"

# apply just to 'Other'
new_labels = other_text.apply(recat)
all_data.loc[mask_other & new_labels.ne("Other"), "category_group"] = new_labels[new_labels.ne("Other")]

# report
print(all_data["category_group"].value_counts(dropna=False).to_string())

# peek at what remains in Other
safe_cols = [c for c in ["product_name","category","product_type","brand","source_file"] if c in all_data.columns]
print("\nRemaining 'Other' sample:")
print(all_data.loc[all_data["category_group"].eq("Other"), safe_cols].head(20).to_string(index=False))


category_group
Activewear     3773
Dresses        2415
Other           502
Outerwear       338
Accessories     208
Swimwear         80
Footwear         59

Remaining 'Other' sample:
                  product_name category product_type         brand                      source_file
         Cashmere Jet Set Crew      NaN         <NA>      Alo Yoga    alo_yoga_products_cleaned.csv
        Allyson Striped Zip Up Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
 Katherine High Rise Seam Jean Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
     Kaylie Straight Leg Jeans Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
       Karson Distressed Jeans Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
      Tracie Waffle Knit Tunic Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
           Paizley Lace Blouse Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
      Tracie Waffl

In [52]:
# --- Add CASUALWEAR bucket and re-route items context-aware ---
import re, unicodedata
import pandas as pd
import numpy as np
from pathlib import Path

def norm(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

txt_all = (
    all_data.get("product_name", pd.Series("", index=all_data.index)).apply(norm) + " " +
    all_data.get("category", pd.Series("", index=all_data.index)).apply(norm) + " " +
    all_data.get("product_type", pd.Series("", index=all_data.index)).apply(norm)
).str.strip()

# Buckets:
# Innerwear (keep if you already added it earlier; otherwise skip lines with "Innerwear")
RX_INNERWEAR = re.compile(r"(?:^|\s)(bralette|lingerie|underwear|briefs?|thongs?|pant(y|ies)|boyshorts?|bodysuits?|unitard|catsuit)(?:\s|$)", re.I)

# Zip/outerwear nouns => Outerwear
RX_ZIP = re.compile(r"(?:^|\s)(zip\s*up|zipup|zip|quarter\s*zip|half\s*zip|1\/4\s*zip)(?:\s|$)", re.I)
RX_OUTERWEAR_NOUN = re.compile(r"(?:^|\s)(jacket|coat|parka|puffer|shell|windbreaker|anorak|tren(ch|chcoat)|gilet|vest|bomber|blazer)(?:\s|$)", re.I)

# Zip + athletic tops => Activewear
RX_ZIP_TOP = re.compile(r"(?:^|\s)(hoodie|fleece|sweater|sweatshirt|pullover|top)(?:\s|$)", re.I)

# CASUALWEAR: denim bottoms + everyday tops/blouses/tunics/camis/shirts
RX_CASUAL_DENIM = re.compile(r"(?:^|\s)(jeans?|denim|flare|bootcut|straight\s*leg|baggy)(?:\s|$)", re.I)
RX_CASUAL_TOPS  = re.compile(r"(?:^|\s)(blouse|tunic|cami|camisole|shirt|button\s*up|mock\s*neck|thermal|seamless|top|tops)(?:\s|$)", re.I)

# --- precedence rules ---
# 1) Innerwear wins
mask_inner = txt_all.str.contains(RX_INNERWEAR)
all_data.loc[mask_inner, "category_group"] = "Innerwear"

# 2) Zip + outerwear noun -> Outerwear
mask_zip_outer = (~mask_inner) & txt_all.str.contains(RX_ZIP) & txt_all.str.contains(RX_OUTERWEAR_NOUN)
all_data.loc[mask_zip_outer, "category_group"] = "Outerwear"

# 3) Zip + athletic top -> Activewear
mask_zip_top = (~mask_inner) & (~mask_zip_outer) & txt_all.str.contains(RX_ZIP) & txt_all.str.contains(RX_ZIP_TOP)
all_data.loc[mask_zip_top, "category_group"] = "Activewear"

# 4) Denim bottoms or casual tops -> Casualwear (only if not already labeled by above)
mask_casual = (~mask_inner) & (~mask_zip_outer) & (~mask_zip_top) & (txt_all.str.contains(RX_CASUAL_DENIM) | txt_all.str.contains(RX_CASUAL_TOPS))
all_data.loc[mask_casual, "category_group"] = "Casualwear"

# 5) Re-sweep any remaining 'Other' with same logic
mask_other = all_data["category_group"].eq("Other")
txt_other = txt_all[mask_other]
all_data.loc[mask_other & txt_other.str.contains(RX_INNERWEAR), "category_group"] = "Innerwear"
all_data.loc[mask_other & txt_other.str.contains(RX_ZIP) & txt_other.str.contains(RX_OUTERWEAR_NOUN), "category_group"] = "Outerwear"
all_data.loc[mask_other & (~txt_other.str.contains(RX_ZIP) | ~txt_other.str.contains(RX_OUTERWEAR_NOUN)) & txt_other.str.contains(RX_ZIP) & txt_other.str.contains(RX_ZIP_TOP), "category_group"] = "Activewear"
all_data.loc[mask_other & (txt_other.str.contains(RX_CASUAL_DENIM) | txt_other.str.contains(RX_CASUAL_TOPS)), "category_group"] = "Casualwear"

# Report + save
print(all_data["category_group"].value_counts(dropna=False).to_string())

safe_cols = [c for c in ["product_name","category","product_type","brand","source_file"] if c in all_data.columns]
print("\nRemaining 'Other' sample:")
print(all_data.loc[all_data["category_group"].eq("Other"), safe_cols].head(20).to_string(index=False))

# Persist (new master + per-bucket files)
out_master = "all_products_cleaned_master_8groups.csv"  # 6 original + Accessories + Casualwear (+ Innerwear if you added earlier)
all_data.to_csv(out_master, index=False)
outdir = Path("by_category_groups_8")
outdir.mkdir(exist_ok=True)
for grp, g in all_data.groupby(all_data["category_group"].fillna("Other")):
    g.to_csv(outdir / f"{grp}.csv", index=False)
print("\nSaved master ->", out_master)
print("Per-bucket CSVs ->", outdir)


category_group
Dresses        2376
Activewear     2217
Casualwear     1853
Outerwear       313
Other           251
Accessories     202
Innerwear        56
Footwear         55
Swimwear         52

Remaining 'Other' sample:
                   product_name category product_type         brand                      source_file
          Cashmere Jet Set Crew      NaN         <NA>      Alo Yoga    alo_yoga_products_cleaned.csv
         Allyson Striped Zip Up Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
         Rena Aztec Quarter Zip Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
         Mollie Pointelle Shrug Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
       Waverly Stripe Crew Neck Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
        Layla Ribbon Trim Shrug Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
   New York Varsity Quarter Zip Clothing         <NA> ALTAR'D STATE alt

  mask_inner = txt_all.str.contains(RX_INNERWEAR)
  mask_zip_outer = (~mask_inner) & txt_all.str.contains(RX_ZIP) & txt_all.str.contains(RX_OUTERWEAR_NOUN)
  mask_zip_top = (~mask_inner) & (~mask_zip_outer) & txt_all.str.contains(RX_ZIP) & txt_all.str.contains(RX_ZIP_TOP)
  mask_casual = (~mask_inner) & (~mask_zip_outer) & (~mask_zip_top) & (txt_all.str.contains(RX_CASUAL_DENIM) | txt_all.str.contains(RX_CASUAL_TOPS))
  all_data.loc[mask_other & txt_other.str.contains(RX_INNERWEAR), "category_group"] = "Innerwear"
  all_data.loc[mask_other & txt_other.str.contains(RX_ZIP) & txt_other.str.contains(RX_OUTERWEAR_NOUN), "category_group"] = "Outerwear"
  all_data.loc[mask_other & txt_other.str.contains(RX_ZIP) & txt_other.str.contains(RX_OUTERWEAR_NOUN), "category_group"] = "Outerwear"
  all_data.loc[mask_other & (~txt_other.str.contains(RX_ZIP) | ~txt_other.str.contains(RX_OUTERWEAR_NOUN)) & txt_other.str.contains(RX_ZIP) & txt_other.str.contains(RX_ZIP_TOP), "category_group"] = "Activewea

In [54]:
# --- Final sweep of 'Other': shrug/sherpa/crew/layer ---
import re, unicodedata
import pandas as pd
import numpy as np

def norm(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

mask_other = all_data["category_group"].eq("Other")
txt = (
    all_data.loc[mask_other, "product_name"].apply(norm).fillna("") + " " +
    all_data.loc[mask_other, "category"].apply(norm).fillna("") + " " +
    all_data.loc[mask_other, "product_type"].apply(norm).fillna("")
).str.strip()

# New rules:
RX_OUTERWEAR_SHRUG  = re.compile(r"(?:^|\s)(shrug|sherpa)(?:\s|$)", re.I)        # treat as Outerwear
RX_ACTIVE_CREW      = re.compile(r"(?:^|\s)(crew\s*neck|crew)(?:\s|$)", re.I)    # sweaters/crews -> Activewear
RX_CASUAL_LAYER     = re.compile(r"(?:^|\s)layer(ing)?(?:\s|$)", re.I)           # "Layer Clothing" -> Casualwear

to_outer = txt.apply(lambda t: bool(RX_OUTERWEAR_SHRUG.search(t)))
to_active = (~to_outer) & txt.apply(lambda t: bool(RX_ACTIVE_CREW.search(t)))
to_casual = (~to_outer) & (~to_active) & txt.apply(lambda t: bool(RX_CASUAL_LAYER.search(t)))

all_data.loc[mask_other & to_outer,  "category_group"] = "Outerwear"
all_data.loc[mask_other & to_active, "category_group"] = "Activewear"
all_data.loc[mask_other & to_casual, "category_group"] = "Casualwear"

# report and save
print(all_data["category_group"].value_counts(dropna=False).to_string())

safe_cols = [c for c in ["product_name","category","product_type","brand","source_file"] if c in all_data.columns]
print("\nRemaining 'Other' sample:")
print(all_data.loc[all_data["category_group"].eq("Other"), safe_cols].head(20).to_string(index=False))

all_data.to_csv("all_products_cleaned_master_8groups.csv", index=False)
print("\nUpdated master saved -> all_products_cleaned_master_8groups.csv")


category_group
Dresses        2376
Activewear     2228
Casualwear     1861
Outerwear       322
Other           223
Accessories     202
Innerwear        56
Footwear         55
Swimwear         52

Remaining 'Other' sample:
                                             product_name category product_type         brand                      source_file
                                   Allyson Striped Zip Up Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
                                   Rena Aztec Quarter Zip Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
                             New York Varsity Quarter Zip Clothing         <NA> ALTAR'D STATE altardstate_products_cleaned.csv
                                    Petal Motion Full Zip Clothing         <NA>    AS REVIVAL altardstate_products_cleaned.csv
Classic Leopard Print Cross Back Midkini & High Waist Set      NaN         <NA>        Cupshe      cupshe_products_cleaned.csv
                

In [56]:
mask_other = all_data["category_group"].eq("Other")
txt = all_data.loc[mask_other, "product_name"].fillna("").str.lower()

all_data.loc[mask_other & txt.str.contains("corset"), "category_group"] = "Innerwear"
all_data.loc[mask_other & txt.str.contains("poncho|shrug|cape"), "category_group"] = "Outerwear"
all_data.loc[mask_other & txt.str.contains("tankini|midkini|bikini"), "category_group"] = "Swimwear"


In [58]:
# Final clean-up sweep for 'Other'
mask_other = all_data["category_group"].eq("Other")
txt = all_data.loc[mask_other, "product_name"].fillna("").str.lower()

all_data.loc[mask_other & txt.str.contains("corset"), "category_group"] = "Innerwear"
all_data.loc[mask_other & txt.str.contains("poncho|shrug|cape"), "category_group"] = "Outerwear"
all_data.loc[mask_other & txt.str.contains("tankini|midkini|bikini"), "category_group"] = "Swimwear"

# Re-check distribution
print("Updated category counts:\n", all_data["category_group"].value_counts())


Updated category counts:
 category_group
Dresses        2376
Activewear     2228
Casualwear     1861
Outerwear       331
Accessories     202
Other           161
Innerwear       107
Footwear         55
Swimwear         54
Name: count, dtype: int64


In [98]:
# HYBRID RECOMMENDER (content + pseudo-collab) for your master CSV
import pandas as pd
import numpy as np
import re, unicodedata
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from functools import lru_cache

@lru_cache(maxsize=4096)
def _pseudo(i):  # cache collab scores
    return pseudo_collab_scores(i)
    
MASTER_CSV = "all_products_cleaned_master_8groups.csv"  # adjust if needed
df = pd.read_csv(MASTER_CSV)

import re, unicodedata

def slug(s):
    s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+", " ", s).strip()
    return s

# new column used for deduplication
df["dedupe_key"] = df["product_name"].map(slug) + "|" + df["brand"].map(slug)


# --- Minimal schema safety
for c in ["product_name","brand","category_group","available_colors_cleaned","price"]:
    if c not in df.columns:
        df[c] = np.nan

# clean text helpers
def norm(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^a-z0-9]+"," ", s)
    return re.sub(r"\s+"," ", s).strip()

# fill sensible defaults
df["product_name"] = df["product_name"].fillna("").astype(str)
df["brand"] = df["brand"].fillna("UnknownBrand").astype(str)
df["category_group"] = df["category_group"].fillna("UnknownCategory").astype(str)
df["available_colors_cleaned"] = df["available_colors_cleaned"].fillna("").astype(str)

# parse price (strip currency/etc.)
def parse_price(x):
    if pd.isna(x): return np.nan
    s = str(x)
    s = re.sub(r"[^\d\.]", "", s)  # keep digits and dot
    try:
        return float(s) if s else np.nan
    except:
        return np.nan

df["price_num"] = df["price"].apply(parse_price)

# ---------- CONTENT VECTOR ----------
# text: product_name + brand + category (brand/category get extra weight via duplication)
content_text = (
    df["product_name"].map(norm) + " " +
    df["brand"].map(lambda s: (norm(s)+" ")*2) +    # duplicate -> more weight
    df["category_group"].map(lambda s: (norm(s)+" ")*2)
)

tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=40000)
X_tfidf = tfidf.fit_transform(content_text)

# (optional) one-hot brand/category and append to content
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
X_ohe = ohe.fit_transform(df[["brand","category_group"]])
X_content = hstack([X_tfidf, X_ohe]).tocsr()

# --- Brand centroids in content space (for "similar brands" lookups) ---
# --- Brand centroids in content space (robust: uses 1-D ndarrays, no np.matrix) ---
import numpy as np

brands = df["brand"].astype(str).values
unique_brands = np.unique(brands)

brand_centroid = {}
for b in unique_brands:
    idx = np.where(brands == b)[0]
    if len(idx) == 0:
        continue
    # 1 x n sparse mean -> convert to 1-D ndarray
    v = X_content[idx].mean(axis=0)          # shape (1, n), type np.matrix / sparse
    v = np.asarray(v).ravel()                # -> 1-D ndarray
    nrm = np.linalg.norm(v)
    if nrm > 0:
        v = v / nrm
    brand_centroid[b] = v

def similar_brands(base_brand, topk=6):
    """Return top-k brands most similar to base brand in content space (including base)."""
    if base_brand not in brand_centroid:
        return [base_brand]
    base = brand_centroid[base_brand]
    sims = []
    for b, vec in brand_centroid.items():
        # cosine for L2-normalized vectors is just dot product
        sim = float(np.dot(base, vec))
        sims.append((b, sim))
    sims.sort(key=lambda x: x[1], reverse=True)
    return [b for b, _ in sims[:topk]]

# ---------- PSEUDO-COLLAB SIM ----------
# brand/category match, color overlap (Jaccard), price proximity

# preprocess colors as sets
def colors_set(s):
    toks = re.split(r"[,\|/]+|\s{2,}", str(s))
    toks = [norm(t) for t in toks if norm(t)]
    return set(toks)

color_sets = df["available_colors_cleaned"].apply(colors_set)

# scale price into [0,1]
scaler = MinMaxScaler()
price_scaled = pd.Series(
    scaler.fit_transform(df[["price_num"]]).reshape(-1),
    index=df.index
).fillna(0.5)  # unknown mid

# on-the-fly pseudo-collab similarity between a query item and all items
def pseudo_collab_scores(i):
    # weights (tune if you like)
    w_brand = 0.30
    w_cat   = 0.35
    w_color = 0.20
    w_price = 0.15

    same_brand = (df["brand"].values == df.iloc[i]["brand"])
    same_cat   = (df["category_group"].values == df.iloc[i]["category_group"])

    # color Jaccard
    A = color_sets.iloc[i]
    def jacc(B):
        if not A and not B: return 0.0
        inter = len(A & B)
        union = len(A | B) if (A or B) else 1
        return inter / union
    color_sim = np.array([jacc(s) for s in color_sets])

    # price similarity (closer is better)
    pi = price_scaled.iloc[i]
    price_sim = 1.0 - np.abs(price_scaled.values - pi)

    # combine
    s = (w_brand*same_brand.astype(float) +
         w_cat*same_cat.astype(float) +
         w_color*color_sim +
         w_price*price_sim)

    # normalize 0..1
    s = (s - s.min()) / (s.max() - s.min() + 1e-9)
    return s

# ---------- HYBRID SCORE ----------
def hybrid_scores(i, k=50, alpha=0.7):
    """
    i: index of the query item in df
    alpha: weight for content (0..1). (1-alpha) used for pseudo-collab.
    returns: indices and scores for top-k (excluding i)
    """
    # content neighbors
    dists, idxs = nn_content.kneighbors(X_content[i], n_neighbors=min(k+1, X_content.shape[0]))
    idxs = idxs[0]
    dists = dists[0]
    content_scores = 1.0 - dists  # cosine -> similarity

    # map content scores to full-length array
    cont_full = np.zeros(len(df))
    cont_full[idxs] = content_scores

    # pseudo-collab over all items
    collab_full = _pseudo(int(i))


    # hybrid
    hybrid = alpha*cont_full + (1.0-alpha)*collab_full
    # remove self
    hybrid[i] = -1
    # top-k
    top_idx = np.argpartition(-hybrid, range(min(k, len(df)-1)))[:k]
    top_idx = top_idx[np.argsort(-hybrid[top_idx])]
    return top_idx, hybrid[top_idx], cont_full[top_idx], collab_full[top_idx]

def find_index_by_name(query):
    # fuzzy-ish: normalize and look for first contains
    q = norm(query)
    hits = df.index[df["product_name"].map(lambda s: q in norm(s))]
    return int(hits[0]) if len(hits) else None

def recommend(query, topn=10, alpha=0.7,
              show_cols=("product_name","brand","category_group","price"),
              same_brand=True, same_category_or_price=True,
              random_brand=False,            # NEW: enable brand-diverse randomization
              brand_topk=6,                  # how many "similar" brands to consider
              temperature=0.7,               # softmax temperature (lower=pick higher-score more often)
              max_per_brand=2,               # cap items per brand in the final list
              seed=None):
    """
    query: product index (int) OR product name (str)
    random_brand=True -> sample from similar brands using softmax over hybrid scores
    """
    i = query if isinstance(query, int) else find_index_by_name(query)
    if i is None:
        print("No product matched your query.")
        return

    # get many candidates first
    idxs, h, c, u = hybrid_scores(i, k=max(topn*10, 100), alpha=alpha)

    base_cat   = df.iloc[i]["category_group"]
    base_price = df.iloc[i]["price_num"]
    base_brand = df.iloc[i]["brand"]
    query_key  = df.iloc[i]["dedupe_key"]

    rng = np.random.default_rng(seed)

    # --- helper filters
    def ok_basic(j):
        # optional brand constraint
        cond_brand = (df.iloc[j]["brand"] == base_brand) if same_brand else True
        # keep if same category OR within a price band
        cond_cat = (df.iloc[j]["category_group"] == base_cat)
        cond_price = (
            pd.isna(base_price) or
            pd.isna(df.iloc[j]["price_num"]) or
            abs(df.iloc[j]["price_num"] - base_price) <= max(20, 0.5 * base_price)
        )
        return cond_brand and (cond_cat or (same_category_or_price and cond_price))

    # --- candidate pool (apply basic filters first)
    pool = [j for j in idxs if ok_basic(j)]
    if not pool:
        pool = list(idxs)  # fall back to raw neighbors if too strict

    # --- optionally restrict to "similar brands" (for cross-brand variety)
 # optionally restrict to "similar brands", but fall back if variety is too low
    if random_brand:
        sim_brands = set(similar_brands(base_brand, topk=brand_topk))
        pool_sim = [j for j in pool if (df.iloc[j]["brand"] in sim_brands)]
    
        # need at least enough brands to fill topn under the cap
        need_brands = max(1, int(np.ceil(topn / max_per_brand)))
        uniq_brands = len({df.iloc[j]["brand"] for j in pool_sim})
    
        pool = pool_sim if (len(pool_sim) >= topn or uniq_brands >= need_brands) else pool


    # --- dedupe by (name+brand) and brand quota, with softmax sampling
    keep, seen_keys = [], {query_key}
    brand_counts = {}

    # scores aligned to pool
    score_map = {j: h[list(idxs).index(j)] if j in idxs[:len(h)] else 0.0 for j in pool}
    # build arrays for softmax sampling
    cand = np.array(pool, dtype=int)
    scrs = np.array([score_map[j] for j in cand], dtype=float)

    def softmax(x, T=1.0):
        x = (x - x.max()) / max(T, 1e-6)
        e = np.exp(x)
        p = e / (e.sum() + 1e-12)
        return p

    # iterate sampling without replacement
    while len(keep) < topn and len(cand) > 0:
        if random_brand:
            p = softmax(scrs, T=temperature)  # random-but-biased by score
            pick_idx = rng.choice(len(cand), p=p)
        else:
            pick_idx = int(np.argmax(scrs))   # deterministic top
        j = int(cand[pick_idx])

        k = df.iloc[j]["dedupe_key"]
        b = df.iloc[j]["brand"]

        if k not in seen_keys and brand_counts.get(b, 0) < max_per_brand:
            keep.append(j)
            seen_keys.add(k)
            brand_counts[b] = brand_counts.get(b, 0) + 1

        # remove that candidate and continue
        cand = np.delete(cand, pick_idx)
        scrs = np.delete(scrs, pick_idx)

    # fall back if we didn’t fill topn
    if len(keep) < topn:
        for j in pool:
            if len(keep) >= topn: break
            k = df.iloc[j]["dedupe_key"]; b = df.iloc[j]["brand"]
            if k in seen_keys or brand_counts.get(b,0) >= max_per_brand: continue
            keep.append(j); seen_keys.add(k); brand_counts[b]=brand_counts.get(b,0)+1

    out = df.loc[keep, list(show_cols)].copy()
    # map scores again for display
    hybrid_full = dict(zip(idxs, h))
    cont_full   = dict(zip(idxs, c))
    coll_full   = dict(zip(idxs, u))
    out.insert(0, "score",      np.round([hybrid_full.get(j, 0.0) for j in keep], 4))
    out.insert(1, "content_sim",np.round([cont_full.get(j, 0.0)   for j in keep], 4))
    out.insert(2, "collab_sim", np.round([coll_full.get(j, 0.0)   for j in keep], 4))

    print("Query →", df.iloc[i][["product_name","brand","category_group","price"]].to_dict())
    display(out.reset_index(drop=True))
    return None



print("Hybrid recommender ready. Try:")
print(" - recommend('Airlift Intrigue Bra', topn=10, alpha=0.7)")
print(" - recommend(0, topn=10, alpha=0.6)  # by row index")

Hybrid recommender ready. Try:
 - recommend('Airlift Intrigue Bra', topn=10, alpha=0.7)
 - recommend(0, topn=10, alpha=0.6)  # by row index


In [100]:
# allow more from the same brand
recommend("Airlift Intrigue Bra", 10, 0.7,
          same_brand=False, random_brand=True,
          brand_topk=12, max_per_brand=5, seed=42)

# or: don't restrict to "similar brands" at all (keeps diversity cap only)
recommend("Airlift Intrigue Bra", 10, 0.7,
          same_brand=False, random_brand=False,  # deterministic
          max_per_brand=3)


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.83,0.8254,0.8408,"5"" Airlift Energy Short",Alo Yoga,Activewear,88.0
1,0.847,0.8296,0.8876,High-Waist Airlift Legging,Alo Yoga,Activewear,128.0
2,0.8244,0.8251,0.8228,Accolade Hoodie,Alo Yoga,Activewear,138.0
3,0.831,0.8281,0.8376,Airbrush Better Together Bra,Alo Yoga,Activewear,88.0
4,0.8173,0.8268,0.795,Conquer Revitalize Pant,Alo Yoga,Activewear,118.0


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.8869,0.9266,0.7942,Airlift Empower Bra,Alo Yoga,Activewear,98.0
1,0.8756,0.8943,0.8321,Alosoft Relay Bra,Alo Yoga,Activewear,78.0
2,0.8689,0.8955,0.8069,Airlift Divine Bra,Alo Yoga,Activewear,68.0


In [108]:
recommend("Airlift Intrigue Bra", 10, 0.7,
          same_brand=False,
          random_brand=False,      # no similar-brand restriction
          same_category_or_price=False,
          # allow more per brand:
          )


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.8869,0.9266,0.7942,Airlift Empower Bra,Alo Yoga,Activewear,98.0
1,0.8756,0.8943,0.8321,Alosoft Relay Bra,Alo Yoga,Activewear,78.0


In [104]:
recommend("Airlift Intrigue Bra", topn=10, alpha=0.7, same_category_or_price=False)


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.8869,0.9266,0.7942,Airlift Empower Bra,Alo Yoga,Activewear,98.0
1,0.8756,0.8943,0.8321,Alosoft Relay Bra,Alo Yoga,Activewear,78.0


In [114]:
# --- BRAND-DIVERSE, FALLBACK-AWARE RECOMMENDER (drop-in) ---
def recommend_v2(query, topn=10, alpha=0.7,
                 show_cols=("product_name","brand","category_group","price"),
                 same_brand=True,
                 same_category_or_price=True,
                 random_brand=False,       # stochastic brand mix
                 brand_topk=12,            # similar brands to consider
                 temperature=0.7,          # softmax temp (lower = pick higher-score more often)
                 max_per_brand=None,       # cap per brand; None = no cap
                 seed=None):
    """
    Self-contained re-ranker that:
      • pulls a *large* candidate pool,
      • optionally restricts to similar brands (with fallback),
      • relaxes filters if pool is too small,
      • dedupes by name+brand,
      • optionally samples to diversify brands.
    Requires: df, find_index_by_name(), hybrid_scores(), similar_brands() and df['dedupe_key'] defined earlier.
    """

    i = query if isinstance(query, int) else find_index_by_name(query)
    if i is None:
        print("No product matched your query.")
        return

    # 1) get a big pool of neighbors (so later filters don’t starve us)
    idxs, h, c, u = hybrid_scores(i, k=max(topn*50, 1000), alpha=alpha)

    base_cat   = df.iloc[i]["category_group"]
    base_price = df.iloc[i]["price_num"]
    base_brand = df.iloc[i]["brand"]
    query_key  = df.iloc[i]["dedupe_key"]
    rng = np.random.default_rng(seed)

    # helper: matching rules
    def ok_basic(j, require_cat=True, allow_price=True):
        cond_brand = (df.iloc[j]["brand"] == base_brand) if same_brand else True
        cond_cat   = (df.iloc[j]["category_group"] == base_cat) if require_cat else True
        cond_price = (
            True if not allow_price else
            (pd.isna(base_price) or
             pd.isna(df.iloc[j]["price_num"]) or
             abs(df.iloc[j]["price_num"] - base_price) <= max(20, 0.5 * base_price))
        )
        if same_category_or_price:
            cond_match = (cond_cat or cond_price)
        else:
            cond_match = True
        return cond_brand and cond_match

    # 2) primary candidate pool
    pool = [j for j in idxs if ok_basic(j, require_cat=True, allow_price=True)]
    if not pool:
        pool = list(idxs)  # absolute fallback

    # 3) optional similar-brands restriction, with variety fallback
    if random_brand:
        sim_brands = set(similar_brands(base_brand, topk=brand_topk))
        pool_sim = [j for j in pool if (df.iloc[j]["brand"] in sim_brands)]
        need_brands = max(1, int(np.ceil(topn / (max_per_brand or topn))))
        uniq_brands = len({df.iloc[j]["brand"] for j in pool_sim})
        if (len(pool_sim) >= topn) or (uniq_brands >= need_brands):
            pool = pool_sim  # keep only similar brands if we still have enough variety

    # 4) auto-relax if pool still too small → drop price, then category
    if len(pool) < topn:
        pool = [j for j in idxs if ok_basic(j, require_cat=True, allow_price=False)]
    if len(pool) < topn:
        pool = [j for j in idxs if ok_basic(j, require_cat=False, allow_price=False)]

    # 5) dedupe + softmax sampling (for random_brand) + optional per-brand cap
    keep, seen_keys = [], {query_key}
    brand_counts = {}
    cap = max_per_brand or topn  # unlimited if None

    # align scores to pool
    score_pos = {j: k for k, j in enumerate(idxs[:len(h)])}
    def get_score(j):
        p = score_pos.get(j, None)
        return float(h[p]) if p is not None else 0.0

    cand = np.array(pool, dtype=int)
    scrs = np.array([get_score(j) for j in cand], dtype=float)

    def softmax(x, T=1.0):
        x = (x - x.max()) / max(T, 1e-6)
        e = np.exp(x)
        return e / (e.sum() + 1e-12)

    while len(keep) < topn and len(cand) > 0:
        if random_brand:
            p = softmax(scrs, T=temperature)
            pick_idx = rng.choice(len(cand), p=p)
        else:
            pick_idx = int(np.argmax(scrs))
        j = int(cand[pick_idx])

        k = df.iloc[j]["dedupe_key"]
        b = df.iloc[j]["brand"]
        if (k not in seen_keys) and (brand_counts.get(b, 0) < cap):
            keep.append(j)
            seen_keys.add(k)
            brand_counts[b] = brand_counts.get(b, 0) + 1

        cand = np.delete(cand, pick_idx)
        scrs = np.delete(scrs, pick_idx)

    # final fill if needed
    if len(keep) < topn:
        for j in pool:
            if len(keep) >= topn: break
            k = df.iloc[j]["dedupe_key"]; b = df.iloc[j]["brand"]
            if (k in seen_keys) or (brand_counts.get(b,0) >= cap): continue
            keep.append(j); seen_keys.add(k); brand_counts[b]=brand_counts.get(b,0)+1

    # 6) assemble output
    out = df.loc[keep, list(show_cols)].copy()
    hybrid_full = dict(zip(idxs, h))
    cont_full   = dict(zip(idxs, c))
    coll_full   = dict(zip(idxs, u))
    out.insert(0, "score",       np.round([hybrid_full.get(j, 0.0) for j in keep], 4))
    out.insert(1, "content_sim", np.round([cont_full.get(j, 0.0)   for j in keep], 4))
    out.insert(2, "collab_sim",  np.round([coll_full.get(j, 0.0)   for j in keep], 4))

    print("Query →", df.iloc[i][["product_name","brand","category_group","price"]].to_dict())
    display(out.reset_index(drop=True))
    return None

print("✅ recommend_v2 is ready. Try:")
print(" - recommend_v2('Airlift Intrigue Bra', 10, 0.7, same_brand=False, random_brand=True, brand_topk=20, max_per_brand=3, same_category_or_price=False, seed=42)")
print(" - recommend_v2(0, 12, 0.6)")


✅ recommend_v2 is ready. Try:
 - recommend_v2('Airlift Intrigue Bra', 10, 0.7, same_brand=False, random_brand=True, brand_topk=20, max_per_brand=3, same_category_or_price=False, seed=42)
 - recommend_v2(0, 12, 0.6)


In [116]:
recommend_v2("Airlift Intrigue Bra", 10, 0.7,
             same_brand=False, random_brand=True,
             brand_topk=20, max_per_brand=3,
             same_category_or_price=False, seed=42)


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.4124,0.4154,0.4053,Faux Leather Cropped Premier Bomber,Alo Yoga,Outerwear,248.0
1,0.7922,0.785,0.8089,Tennis Club Sweater Knit Skirt,Alo Yoga,Activewear,98.0
2,0.3848,0.3616,0.4388,Lauryl V Neck Sweater,Edikted,Activewear,28.0
3,0.4254,0.417,0.4448,Love Knots Tie Scrunchie,Alo Yoga,Accessories,32.0
4,0.3809,0.3555,0.4402,Wavy Baby Sequin Micro Shorts,Edikted,Activewear,34.0
5,0.3825,0.3563,0.4436,Steph Camo Pant,ALTAR'D STATE,Activewear,88.0
6,0.3911,0.3697,0.4412,Vital V Neck Sports Bra,Gymshark,Activewear,38.0
7,0.3899,0.3687,0.4393,One Shoulder Sports Bra,Gymshark,Activewear,30.0
8,0.3811,0.3565,0.4384,Kasey Cable Knit Pants,Edikted,Activewear,26.4
9,0.3947,0.3747,0.4412,Vital Sports Bra,Gymshark,Activewear,38.0


In [118]:
recommend_v2("Airlift Intrigue Bra", 10, 0.7,
             same_brand=False, random_brand=False,
             same_category_or_price=False)


Query → {'product_name': 'Airlift Intrigue Bra', 'brand': 'Alo Yoga', 'category_group': 'Activewear', 'price': 68.0}


Unnamed: 0,score,content_sim,collab_sim,product_name,brand,category_group,price
0,0.8869,0.9266,0.7942,Airlift Empower Bra,Alo Yoga,Activewear,98.0
1,0.8756,0.8943,0.8321,Alosoft Relay Bra,Alo Yoga,Activewear,78.0
2,0.8689,0.8955,0.8069,Airlift Divine Bra,Alo Yoga,Activewear,68.0
3,0.8688,0.8943,0.8094,Alosoft Molded Fantasy Bra,Alo Yoga,Activewear,74.0
4,0.8631,0.8855,0.8109,Slit Bra,Alo Yoga,Activewear,68.0
5,0.8631,0.8645,0.8597,Airlift Strength Bra,Alo Yoga,Activewear,78.0
6,0.8595,0.8529,0.875,Airlift Suit Up Bra,Alo Yoga,Activewear,78.0
7,0.852,0.8605,0.8321,Splendor Bra,Alo Yoga,Activewear,58.0
8,0.8483,0.8368,0.875,Wild Thing Bra,Alo Yoga,Activewear,78.0
9,0.847,0.8296,0.8876,7/8 High-Waist Airlift Legging,Alo Yoga,Activewear,128.0


In [122]:
%pip install -q open_clip_torch pillow torchvision torch faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/anaconda3/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [126]:
import os, io, json, time, math, hashlib, requests
import numpy as np
import torch
from PIL import Image
import open_clip
from torchvision import transforms

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "ViT-B-32"
PRETRAIN   = "laion2b_s34b_b79k"

clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAIN, device=DEVICE
)
clip_model.eval()

CACHE_PATH = "image_embeddings_clip_vitb32.npy"

if os.path.exists(CACHE_PATH):
    image_embs = np.load(CACHE_PATH)
    print("Loaded cached image embeddings:", image_embs.shape)
else:
    imgs = [load_image_from_url(u) if isinstance(u, str) and u.strip() else None
            for u in df["image_url"]]
    image_embs = embed_images(imgs, batch_size=32)
    np.save(CACHE_PATH, image_embs)
    print("Saved image embeddings:", image_embs.shape)

has_img = ~np.isnan(image_embs).any(axis=1)


Loaded cached image embeddings: (7375, 512)


In [128]:
def image_sim_for_index(i, idxs):
    """Compute cosine similarity of image embeddings between item i and idxs."""
    if not has_img[i]:
        return np.zeros(len(idxs))
    base = image_embs[i] / np.linalg.norm(image_embs[i])
    neigh = image_embs[idxs] / np.linalg.norm(image_embs[idxs], axis=1, keepdims=True)
    return np.dot(neigh, base)


In [130]:
def hybrid_scores_v2(i, k=100, alpha=0.6, beta=0.2, gamma=0.2):
    """
    Returns:
        idxs, hybrid, content_sim, collab_sim, image_sim
    """
    # reuse your existing hybrid_scores logic
    idxs, content_sim, collab_sim = hybrid_scores(i, k=k, alpha=alpha)  
    
    # image similarity
    image_sim = image_sim_for_index(i, idxs)

    # combine weighted
    hybrid = alpha * content_sim + beta * collab_sim + gamma * image_sim
    return idxs, hybrid, content_sim, collab_sim, image_sim


In [132]:
def recommend_mm(query, topn=10, alpha=0.6, beta=0.2, gamma=0.2,
                 show_cols=("product_name","brand","category_group","price"), **kwargs):
    i = query if isinstance(query, int) else find_index_by_name(query)
    if i is None:
        print("No product matched your query.")
        return

    idxs, h, c, u, img = hybrid_scores_v2(i, k=max(topn*50, 1000), alpha=alpha, beta=beta, gamma=gamma)

    order = np.argsort(-h)[:topn]
    idxs = [idxs[j] for j in order]

    result = df.iloc[idxs][list(show_cols)].copy()
    result["score"] = h[order]
    result["content_sim"] = c[order]
    result["collab_sim"] = u[order]
    result["image_sim"] = img[order]

    display(result)
    return result


In [134]:
recommend_mm("Airlift Intrigue Bra", topn=10, alpha=0.6, beta=0.2, gamma=0.2)

ValueError: too many values to unpack (expected 3)