# In [None]:
# ============================================================================
# PINK TAX ANALYSIS: ML-BASED GENDER PREDICTION PIPELINE v3
# ============================================================================
# 
# KEY CHANGES FROM v2:
# 1. Filter categories FIRST, then extract colors (correct order)
# 2. Train primary model on ALL explicitly gendered products (no color requirement)
# 3. Color features as optional enhancement, not constraint
# 4. Much larger training set (~1,900 vs 87)
# ============================================================================

# ============================================================================
# CONFIGURATION
# ============================================================================

# Input files (absolute paths; adjust before running on another machine).
PATH_MAIN_DATA = '/Users/leoss/Downloads/items_fin.csv'
PATH_HUMAN_CODED = '/Users/leoss/Downloads/items_prices_description_gender_humancode_sample.csv'
PATH_YOUR_LABELED = '/Users/leoss/Downloads/available_validation.xlsx'
# Directory that receives the CSV outputs written by later steps.
OUTPUT_DIR = '/Users/leoss/Desktop/Portfolio/Website-/UK pink tax/Outputs/charts/ml_pipeline_v3'

# NEW color cache (post-filtering)
COLOR_CACHE_PATH = '/Users/leoss/Desktop/Portfolio/Website-/UK pink tax/Outputs/color_features_cache_v3_filtered.csv'

# ML settings
RANDOM_STATE = 42  # seed passed to sampling, splitting and model constructors
TEST_SIZE = 0.25  # fraction of the labelled data held out by train_test_split
CV_FOLDS = 5  # cross-validation folds used by LogisticRegressionCV

# Color extraction
N_COLORS = 3  # number of dominant colors kept per image
COLOR_EXTRACTION_SAMPLE = 2000  # Will extract from FILTERED data

# Categories to exclude
# Each entry is matched as a substring of the lower-cased breadcrumb text.
EXCLUDE_CATEGORIES = [
    'food', 'grocery', 'groceries', 'snacks', 'drinks', 'beverages',
    'pet food', 'pet supplies', 'cleaning', 'household', 'kitchen',
    'office', 'stationery', 'electronics', 'tech', 'garden', 'automotive'
]

# ============================================================================
# IMPORTS
# ============================================================================

import pandas as pd
import numpy as np
import re
import os
import json
import time
import warnings
from collections import Counter
from io import BytesIO
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report)
from scipy.sparse import hstack, csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
import requests
from sklearn.cluster import KMeans

# Silence every Python warning for the rest of the run (this includes library
# deprecation notices and numeric warnings, not just cosmetic ones).
warnings.filterwarnings('ignore')
# Create the output directory up front so later to_csv calls can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("✓ Imports complete")
print(f"✓ Output directory: {OUTPUT_DIR}")

# ============================================================================
# LOAD DATA
# ============================================================================

print("\n" + "="*70)
print("LOADING DATA")
print("="*70)

# Main product table. Column names are normalised to lower snake_case so the
# COL_* constants below match regardless of the header style in the file.
df = pd.read_csv(PATH_MAIN_DATA, encoding='latin-1')
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
# Drop the unnamed index column that a previous to_csv(index=True) leaves behind.
if 'unnamed:_0' in df.columns:
    df = df.drop(columns=['unnamed:_0'])
print(f"✓ Main dataset: {len(df):,} products")

# Human-coded sample, normalised the same way as the main table.
human_coded = pd.read_csv(PATH_HUMAN_CODED, encoding='latin-1')
human_coded.columns = human_coded.columns.str.lower().str.strip().str.replace(' ', '_')
if 'unnamed:_0' in human_coded.columns:
    human_coded = human_coded.drop(columns=['unnamed:_0'])
print(f"✓ Human-coded: {len(human_coded)} products")

# The personally labelled workbook is optional: any failure to load it falls
# back to an empty frame. The real cause is reported so that a missing Excel
# engine or a corrupt file is not mistaken for a missing file, and
# KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    your_labeled = pd.read_excel(PATH_YOUR_LABELED)
    your_labeled.columns = your_labeled.columns.str.lower().str.strip().str.replace(' ', '_')
    print(f"✓ Your labeled: {len(your_labeled)} products")
except FileNotFoundError:
    your_labeled = pd.DataFrame()
    print("⚠ Your labeled file not found")
except Exception as exc:
    your_labeled = pd.DataFrame()
    print(f"⚠ Your labeled file could not be loaded ({type(exc).__name__}: {exc})")

# Column mapping
COL_NAME = 'product_title_x'
COL_DESC = 'description'
COL_BREADCRUMB = 'standardized_breadcrumbs'
COL_PRICE = 'price'
COL_UNIT_PRICE = 'unit_price'
COL_IMAGE = 'image_url'
COL_STORE = 'store_id'
COL_PRODUCT_ID = 'product_id'
COL_URL = 'product_url_x'

# ============================================================================
# STEP 1: FILTER CATEGORIES FIRST
# ============================================================================

print("\n" + "="*70)
print("STEP 1: FILTER NON-GENDERED CATEGORIES")
print("="*70)

# Row count before filtering, kept for the summary printed after the filter runs.
original_count = len(df)

def contains_excluded_category(text):
    """Return True when *text* contains any entry of EXCLUDE_CATEGORIES.

    The text is lower-cased and each category is tested as a plain substring.
    Missing values (NaN/None) never match.
    """
    if pd.isna(text):
        return False
    lowered = str(text).lower()
    for category in EXCLUDE_CATEGORIES:
        if category in lowered:
            return True
    return False

# Flag rows whose breadcrumb text matches an excluded category.
df['is_excluded'] = df[COL_BREADCRUMB].apply(contains_excluded_category)
excluded_count = df['is_excluded'].sum()

print(f"Original: {original_count:,}")
print(f"Excluded: {excluded_count:,}")

# Keep only the non-excluded rows and renumber the index from 0.
df = df[~df['is_excluded']].copy().reset_index(drop=True)
print(f"Remaining: {len(df):,}")

# ============================================================================
# STEP 2: EXTRACT GENDER LABELS
# ============================================================================

print("\n" + "="*70)
print("STEP 2: EXTRACT GENDER LABELS")
print("="*70)

# Keyword lists used for explicit gender detection. extract_gender_explicit
# matches each entry as a whole word against lower-cased text.
FEMALE_KEYWORDS = ['women', 'woman', 'female', 'ladies', 'lady', 'girls', 
                   'womens', "women's", 'femme', 'her', 'feminine', 'fem']
MALE_KEYWORDS = ['men', 'man', 'male', 'gentleman', 'gentlemen', 'boys', 
                 'mens', "men's", 'homme', 'his', 'masculine']
# Union of both lists; used as the default word set removed from text features.
ALL_GENDER_KEYWORDS = set(FEMALE_KEYWORDS + MALE_KEYWORDS)

def extract_gender_explicit(text):
    """Classify *text* by the explicit gender keywords it contains.

    Returns 'female' or 'male' when only that keyword list matches, 'both'
    when both lists match, and 'none' when neither matches or the text is
    missing/blank. Keywords are matched as whole words on lower-cased text.
    """
    if pd.isna(text) or str(text).strip() == '':
        return 'none'
    lowered = str(text).lower()

    def mentions(keywords):
        # True as soon as one keyword is found as a whole word.
        for kw in keywords:
            if re.search(r'\b' + kw + r'\b', lowered):
                return True
        return False

    found_female = mentions(FEMALE_KEYWORDS)
    found_male = mentions(MALE_KEYWORDS)

    if found_female and found_male:
        return 'both'
    if found_female:
        return 'female'
    if found_male:
        return 'male'
    return 'none'

# One explicit-gender label per text source: breadcrumb, product name, description.
df['label_bc'] = df[COL_BREADCRUMB].apply(extract_gender_explicit)
df['label_name'] = df[COL_NAME].apply(extract_gender_explicit)
df['label_desc'] = df[COL_DESC].apply(extract_gender_explicit)

def combine_labels(row):
    """Return the first 'female'/'male' label found in priority order.

    Sources are checked as breadcrumb, then name, then description. Any other
    value ('both', 'none') is skipped; 'none' is returned when no source
    yields a single gender.
    """
    ordered_labels = (row[col] for col in ['label_bc', 'label_name', 'label_desc'])
    return next((lbl for lbl in ordered_labels if lbl in ['female', 'male']), 'none')

# Collapse the three per-source labels into one keyword-derived label per product.
df['label_extracted'] = df.apply(combine_labels, axis=1)

print(f"Label distribution:")
print(df['label_extracted'].value_counts().to_dict())

# Merge human labels
# Left join on the product id keeps every product; products without a human
# label get NaN in 'label_human'.
# NOTE(review): drop_duplicates only removes identical (id, label) pairs, so a
# product id carrying two different human labels would produce two rows after
# the merge — confirm the human-coded file has one label per product.
if 'human_gender_label' in human_coded.columns and COL_PRODUCT_ID in human_coded.columns:
    human_labels = human_coded[[COL_PRODUCT_ID, 'human_gender_label']].drop_duplicates()
    human_labels.columns = [COL_PRODUCT_ID, 'label_human']
    human_labels['label_human'] = human_labels['label_human'].str.lower().str.strip()
    df = df.merge(human_labels, on=COL_PRODUCT_ID, how='left')
else:
    # No usable human labels: keep the column so later steps can reference it.
    df['label_human'] = None

human_count = df['label_human'].notna().sum()
print(f"Human labels merged: {human_count}")

# ============================================================================
# STEP 3: COLOR EXTRACTION (ON FILTERED DATA)
# ============================================================================

print("\n" + "="*70)
print("STEP 3: COLOR EXTRACTION (POST-FILTERING)")
print("="*70)

# Named reference palette as (R, G, B) tuples. Extracted cluster colors are
# snapped to the nearest entry, and one feature column per (rank, name) pair
# is created from these names later on.
STANDARD_COLORS = {
    'dark_red': (139, 0, 0), 'red': (255, 0, 0), 'coral': (255, 127, 80),
    'salmon': (250, 128, 114), 'crimson': (220, 20, 60), 'brown': (139, 69, 19),
    'tan': (210, 180, 140), 'orange': (255, 165, 0), 'gold': (255, 215, 0),
    'yellow': (255, 255, 0), 'khaki': (240, 230, 140), 'dark_green': (0, 100, 0),
    'green': (0, 128, 0), 'lime': (50, 205, 50), 'olive': (128, 128, 0),
    'teal': (0, 128, 128), 'navy': (0, 0, 128), 'blue': (0, 0, 255),
    'royal_blue': (65, 105, 225), 'sky_blue': (135, 206, 235), 'cyan': (0, 255, 255),
    'purple': (128, 0, 128), 'magenta': (255, 0, 255), 'violet': (238, 130, 238),
    'lavender': (230, 230, 250), 'pink': (255, 192, 203), 'hot_pink': (255, 105, 180),
    'gray': (128, 128, 128), 'silver': (192, 192, 192), 'black': (0, 0, 0), 'white': (255, 255, 255)
}

def color_distance(c1, c2):
    """Return the Euclidean distance between two colors.

    Channels are paired positionally with zip, so both arguments are expected
    to be sequences of the same length (e.g. RGB triples).
    """
    squared_total = 0
    for first, second in zip(c1, c2):
        squared_total += (first - second) ** 2
    return np.sqrt(squared_total)

def closest_standard_color(rgb):
    """Return the name of the STANDARD_COLORS entry nearest to *rgb*.

    Nearness is measured with color_distance. On ties the earliest palette
    entry wins; 'gray' is returned if no entry is strictly closer than
    infinity (for example, an empty palette).
    """
    best_name, best_dist = 'gray', float('inf')
    for candidate in STANDARD_COLORS:
        candidate_dist = color_distance(rgb, STANDARD_COLORS[candidate])
        if candidate_dist < best_dist:
            best_name, best_dist = candidate, candidate_dist
    return best_name

def is_background_color(rgb):
    """Return True when *rgb* looks like image background rather than product.

    A color counts as background when it is:
      * near-white (every channel above 240),
      * near-black (every channel below 15), or
      * mid-gray (channels within 20 of each other and mean between 100 and 160).

    Args:
        rgb: Three channel values (R, G, B). Plain numbers and numpy scalars
            (including uint8 pixel values) are both accepted.

    Returns:
        bool: True for background-like colors, otherwise False.
    """
    # Widen the channels before doing arithmetic: values taken straight from a
    # uint8 image array wrap around on `r - g` and `r + g + b`, which made the
    # mid-gray rule miss real grays. float() leaves ordinary numbers unchanged.
    r, g, b = (float(channel) for channel in rgb)
    if r > 240 and g > 240 and b > 240:
        return True
    if r < 15 and g < 15 and b < 15:
        return True
    max_diff = max(abs(r - g), abs(g - b), abs(r - b))
    avg = (r + g + b) / 3
    if max_diff < 20 and 100 < avg < 160:
        return True
    return False

def extract_colors(image_url, n_colors=3, timeout=10):
    """Download an image and return its dominant non-background colors.

    The image is converted to RGB, resized to 100x100, stripped of
    background-like pixels (unless fewer than 50 would remain) and clustered
    with KMeans. Clusters are visited from largest to smallest and each
    non-background centre is snapped to the nearest STANDARD_COLORS name.

    Args:
        image_url: URL of the image to fetch.
        n_colors: Maximum number of colors to return.
        timeout: Timeout in seconds passed to requests.get.

    Returns:
        A list of up to *n_colors* dicts ``{'name': str, 'weight': float}``
        where weight is the cluster's share of the clustered pixels, or None
        when the download, decoding or clustering fails or every cluster
        centre looks like background.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        # Fail on HTTP errors instead of trying to decode an error page.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB').resize((100, 100))
        # Widen from uint8 so the per-pixel arithmetic inside
        # is_background_color cannot wrap around.
        pixels = np.array(img).reshape(-1, 3).astype(int)

        keep = np.array([not is_background_color(tuple(p)) for p in pixels], dtype=bool)
        non_bg = pixels[keep]
        if len(non_bg) < 50:
            # Too little foreground left: cluster the whole image instead.
            non_bg = pixels

        kmeans = KMeans(n_clusters=min(n_colors + 2, len(non_bg)), random_state=42, n_init=10)
        kmeans.fit(non_bg)

        counts = Counter(kmeans.labels_)
        total = len(kmeans.labels_)

        colors = []
        for cluster_id, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            rgb = tuple(int(c) for c in kmeans.cluster_centers_[cluster_id])
            if not is_background_color(rgb):
                colors.append({
                    'name': closest_standard_color(rgb),
                    'weight': count / total
                })
            if len(colors) >= n_colors:
                break

        return colors if colors else None
    except Exception:
        # Best-effort per image: any failure means "no colors for this
        # product". KeyboardInterrupt/SystemExit still propagate, so a long
        # extraction run can be interrupted.
        return None

# Check for cache
# The cache CSV is indexed by the df row label recorded at extraction time
# ('original_index'), so it only lines up with df when df is rebuilt the same way.
if os.path.exists(COLOR_CACHE_PATH):
    print(f"✓ Loading cached colors from: {COLOR_CACHE_PATH}")
    color_df = pd.read_csv(COLOR_CACHE_PATH, index_col=0)
    print(f"  Loaded {len(color_df)} color records")
else:
    print(f"⚠ No cache found. Extracting colors for filtered products...")
    print(f"  This will take a while...")
    
    # Prioritize explicitly gendered products for color extraction
    gendered = df[df['label_extracted'].isin(['female', 'male'])].copy()
    none_sample = df[df['label_extracted'] == 'none'].sample(
        n=min(500, len(df[df['label_extracted'] == 'none'])), 
        random_state=RANDOM_STATE
    )
    
    # Combine: all gendered + sample of none
    to_extract = pd.concat([gendered, none_sample]).drop_duplicates()
    # Products without an image URL cannot be processed.
    to_extract = to_extract[to_extract[COL_IMAGE].notna()]
    
    # Cap the number of downloads at COLOR_EXTRACTION_SAMPLE.
    if len(to_extract) > COLOR_EXTRACTION_SAMPLE:
        to_extract = to_extract.sample(n=COLOR_EXTRACTION_SAMPLE, random_state=RANDOM_STATE)
    
    print(f"  Extracting colors for {len(to_extract)} products...")
    
    color_results = []
    failed = 0
    
    # idx is the running position (for progress output); row_idx is the df
    # index label that becomes the key of the color table.
    for idx, (row_idx, row) in enumerate(to_extract.iterrows()):
        if idx % 100 == 0:
            print(f"    Progress: {idx}/{len(to_extract)} ({100*idx/len(to_extract):.1f}%)")
        
        colors = extract_colors(row[COL_IMAGE], n_colors=N_COLORS)
        
        if colors:
            entry = {
                'original_index': row_idx,
                COL_PRODUCT_ID: row[COL_PRODUCT_ID],
                'label_extracted': row['label_extracted']
            }
            # Flatten the ranked colors into color1_*, color2_*, ... columns.
            for i, c in enumerate(colors):
                entry[f'color{i+1}_name'] = c['name']
                entry[f'color{i+1}_weight'] = c['weight']
            color_results.append(entry)
        else:
            failed += 1
        
        # Short pause between requests.
        time.sleep(0.05)
    
    color_df = pd.DataFrame(color_results)
    if len(color_df) > 0:
        color_df = color_df.set_index('original_index')
    
    # Save cache
    color_df.to_csv(COLOR_CACHE_PATH)
    print(f"\n✓ Color extraction complete. Saved to: {COLOR_CACHE_PATH}")
    print(f"  Success: {len(color_df)}, Failed: {failed}")

print(f"\nColor data summary:")
if 'label_extracted' in color_df.columns:
    print(color_df['label_extracted'].value_counts().to_dict())

# ============================================================================
# STEP 4: PREPARE TRAINING DATA (NO COLOR REQUIREMENT)
# ============================================================================

print("\n" + "="*70)
print("STEP 4: PREPARE TRAINING DATA")
print("="*70)

# For "none" class: human-coded none + sample from extracted none
human_none = df[(df['label_human'] == 'none')].copy()
extracted_none = df[(df['label_extracted'] == 'none') & (df['label_human'].isna())].copy()

# Get ALL explicitly gendered products (no color requirement!)
# Rows a human coded as "none" are kept out of the gendered pools: the human
# label takes precedence over a keyword hit, and leaving such rows in both
# pools would put the same index label into ml_data twice (duplicated rows
# and train/test leakage once the split selects by index).
is_human_none = df.index.isin(human_none.index)
female_all = df[(df['label_extracted'] == 'female') & ~is_human_none].copy()
male_all = df[(df['label_extracted'] == 'male') & ~is_human_none].copy()

print(f"Explicitly female: {len(female_all)}")
print(f"Explicitly male: {len(male_all)}")

# Target: balance to smallest gendered class
min_gendered = min(len(female_all), len(male_all))
target_none = min_gendered

print(f"\nNone class sources:")
print(f"  Human-coded none: {len(human_none)}")
print(f"  Extracted none (unlabeled): {len(extracted_none)}")

# Build none class: human-coded rows first, topped up with keyword-free,
# unlabeled rows when there are not enough human-coded ones.
if len(human_none) >= target_none:
    none_all = human_none.sample(n=target_none, random_state=RANDOM_STATE)
else:
    remaining = target_none - len(human_none)
    sampled_none = extracted_none.sample(n=min(remaining, len(extracted_none)), random_state=RANDOM_STATE)
    none_all = pd.concat([human_none, sampled_none])

print(f"  Final none class: {len(none_all)}")

# Balance all classes
min_class = min(len(female_all), len(male_all), len(none_all))
print(f"\nBalancing to: {min_class} per class")

female_balanced = female_all.sample(n=min_class, random_state=RANDOM_STATE)
male_balanced = male_all.sample(n=min_class, random_state=RANDOM_STATE)
none_balanced = none_all.sample(n=min(min_class, len(none_all)), random_state=RANDOM_STATE)

# Combine and create target.
# The target comes from the pool a row was drawn from, not from
# 'label_extracted': a human-coded "none" row may still carry a keyword-based
# female/male label and must nevertheless be trained as class 2.
ml_data = pd.concat([
    female_balanced.assign(target=0),
    male_balanced.assign(target=1),
    none_balanced.assign(target=2),
]).copy()
ml_data['target'] = ml_data['target'].astype(int)

print(f"\n✓ Training data: {len(ml_data)} products")
print(f"  Class distribution: {ml_data['target'].value_counts().sort_index().to_dict()}")
print(f"  (0=female, 1=male, 2=none)")

# ============================================================================
# STEP 5: TRAIN-TEST SPLIT
# ============================================================================

print("\n" + "="*70)
print("STEP 5: TRAIN-TEST SPLIT")
print("="*70)

# Split the index labels (not the rows) so both partitions can be pulled from
# ml_data with .loc; stratify keeps the class proportions equal in both parts.
train_idx, test_idx = train_test_split(
    ml_data.index,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=ml_data['target']
)

# Independent copies, so feature columns added later do not touch ml_data.
train_data = ml_data.loc[train_idx].copy()
test_data = ml_data.loc[test_idx].copy()

print(f"Train: {len(train_data)}")
print(f"Test: {len(test_data)}")
print(f"Train distribution: {train_data['target'].value_counts().sort_index().to_dict()}")

# ============================================================================
# STEP 6: FEATURE ENGINEERING
# ============================================================================

# Console banner for the feature-engineering step.
print("\n" + "="*70)
print("STEP 6: FEATURE ENGINEERING")
print("="*70)

_GENDER_PATTERN_CACHE = {}


def _gender_word_pattern(words):
    """Compile (and memoize) one regex matching any of *words* as a whole word."""
    key = frozenset(words)
    pattern = _GENDER_PATTERN_CACHE.get(key)
    if pattern is None:
        # Longest first so "women's" is consumed whole before "women" can match
        # inside it; the alphabetical tie-break keeps the pattern deterministic.
        ordered = sorted(key, key=lambda w: (-len(w), w))
        pattern = re.compile(r'\b(?:' + '|'.join(re.escape(w) for w in ordered) + r')\b')
        _GENDER_PATTERN_CACHE[key] = pattern
    return pattern


def clean_text_remove_gender(text, remove_words=ALL_GENDER_KEYWORDS):
    """Normalise *text* for TF-IDF and strip explicit gender words from it.

    The text is lower-cased, every whole-word occurrence of an entry in
    *remove_words* is deleted, all characters other than a-z and whitespace
    become spaces, and runs of whitespace are collapsed.

    The words are removed in a single pass with a longest-first alternation.
    Removing them one at a time in set-iteration order made the result depend
    on the interpreter's hash seed: "women's" became "" when "women's" was
    processed first, but "s" when "women" was processed first.

    Args:
        text: Raw text, or a missing value (NaN/None).
        remove_words: Iterable of words to delete; defaults to the module's
            gender keyword set.

    Returns:
        str: The cleaned text, or '' for missing input.
    """
    if pd.isna(text):
        return ''
    text = str(text).lower()
    text = _gender_word_pattern(remove_words).sub('', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean text
# The same derived columns are added to the train split, the test split and
# the full dataset so all three can go through build_feature_matrix.
for dataset in [train_data, test_data, df]:
    dataset['breadcrumb_clean'] = dataset[COL_BREADCRUMB].apply(clean_text_remove_gender)
    dataset['description_clean'] = dataset[COL_DESC].apply(clean_text_remove_gender)

# --- Price features ---
# Names of the numeric columns that end up in the feature matrix.
price_features = ['feat_price_log', 'feat_unit_price']
for dataset in [train_data, test_data, df]:
    # Non-numeric prices become NaN (filled with 0 when the matrix is built).
    dataset['feat_price'] = pd.to_numeric(dataset[COL_PRICE], errors='coerce')
    dataset['feat_price_log'] = np.log1p(dataset['feat_price'])
    if COL_UNIT_PRICE in dataset.columns:
        # Take the first run of digits/dots in the unit-price text as the number.
        dataset['feat_unit_price'] = dataset[COL_UNIT_PRICE].astype(str).str.extract(r'([\d.]+)')[0].astype(float)
    else:
        dataset['feat_unit_price'] = 0
print(f"✓ Price features: {len(price_features)}")

# --- Store features ---
# Fit on the training split only; missing store ids are encoded as 'unknown'.
store_encoder = LabelEncoder()
store_encoder.fit(train_data[COL_STORE].fillna('unknown'))

def encode_stores(data, encoder):
    """Map each store id in *data* to its integer code from a fitted encoder.

    Missing store ids are treated as 'unknown'. Stores the encoder has not
    seen are encoded as -1.

    The codes are looked up in a dict built once from ``encoder.classes_``
    (a class's code is its position in that array) instead of calling
    ``encoder.transform`` and scanning ``classes_`` for every single row,
    which dominated the runtime on the full dataset.

    Args:
        data: DataFrame containing the COL_STORE column.
        encoder: Fitted encoder exposing ``classes_``.

    Returns:
        numpy.ndarray: One integer code per row of *data*.
    """
    code_by_store = {store: code for code, store in enumerate(encoder.classes_)}
    stores = data[COL_STORE].fillna('unknown')
    return np.array([code_by_store.get(s, -1) for s in stores])

train_data['store_encoded'] = encode_stores(train_data, store_encoder)
test_data['store_encoded'] = encode_stores(test_data, store_encoder)
df['store_encoded'] = encode_stores(df, store_encoder)

# +1 reserves a trailing one-hot column for stores not seen in training
# (those are encoded as -1).
n_stores = len(store_encoder.classes_) + 1
print(f"✓ Store features: {n_stores}")

# --- Breadcrumb TF-IDF (fit on TRAIN only) ---
breadcrumb_vectorizer = TfidfVectorizer(
    max_features=200,
    min_df=5,
    max_df=0.9,
    ngram_range=(1, 2),
    stop_words='english'
)
breadcrumb_vectorizer.fit(train_data['breadcrumb_clean'])
print(f"✓ Breadcrumb TF-IDF: {len(breadcrumb_vectorizer.get_feature_names_out())} features")

# --- Description TF-IDF (fit on TRAIN only) ---
description_vectorizer = TfidfVectorizer(
    max_features=500,
    min_df=5,
    max_df=0.9,
    ngram_range=(1, 2),
    stop_words='english'
)
description_vectorizer.fit(train_data['description_clean'])
print(f"✓ Description TF-IDF: {len(description_vectorizer.get_feature_names_out())} features")

# --- Color features (OPTIONAL - not all products have them) ---
# One column per (color rank, palette name) pair, initialised to 0.0 so that
# products without extracted colors simply carry all-zero color features.
color_feature_cols = []
for color_name in STANDARD_COLORS.keys():
    for i in range(1, N_COLORS + 1):
        col_name = f'feat_color{i}_{color_name}'
        color_feature_cols.append(col_name)
        for dataset in [train_data, test_data, df]:
            dataset[col_name] = 0.0

# Fill color features where available
# Rows are matched to color_df by index label; for each rank the column of the
# extracted color name receives that color's weight.
for dataset in [train_data, test_data, df]:
    for idx in dataset.index:
        if idx in color_df.index:
            for i in range(1, N_COLORS + 1):
                cname = color_df.loc[idx, f'color{i}_name'] if f'color{i}_name' in color_df.columns else None
                cweight = color_df.loc[idx, f'color{i}_weight'] if f'color{i}_weight' in color_df.columns else 0
                if pd.notna(cname) and cname in STANDARD_COLORS:
                    dataset.loc[idx, f'feat_color{i}_{cname}'] = cweight

# Count how many training samples have color data
train_has_color = train_data.index.isin(color_df.index).sum()
print(f"✓ Color features: {len(color_feature_cols)} (available for {train_has_color}/{len(train_data)} train samples)")

# ============================================================================
# STEP 7: BUILD FEATURE MATRICES
# ============================================================================

# Console banner for the feature-matrix step.
print("\n" + "="*70)
print("STEP 7: BUILD FEATURE MATRICES")
print("="*70)

def build_feature_matrix(data, bc_vec, desc_vec, color_cols, price_cols, n_stores, include_colors=True):
    """Assemble the sparse model input for *data*.

    Column blocks, in order: price columns (NaN filled with 0), one-hot store
    codes (negative codes go to the last store column), breadcrumb TF-IDF,
    description TF-IDF and, when *include_colors* is true, the color columns.

    Returns:
        tuple: (X, feature_names) where X is the horizontally stacked sparse
        matrix and feature_names lists one name per column of X.
    """
    n_rows = len(data)

    # Price block
    price_block = csr_matrix(data[price_cols].fillna(0).values)
    names = list(price_cols)

    # Store block (one-hot); index -1 addresses the trailing "unseen" column.
    codes = data['store_encoded'].values
    store_dense = np.zeros((n_rows, n_stores))
    if n_rows:
        store_dense[np.arange(n_rows), np.where(codes >= 0, codes, -1)] = 1
    names += [f'store_{i}' for i in range(n_stores)]

    # Breadcrumb TF-IDF block
    bc_block = bc_vec.transform(data['breadcrumb_clean'])
    names += [f'bc_{f}' for f in bc_vec.get_feature_names_out()]

    # Description TF-IDF block
    desc_block = desc_vec.transform(data['description_clean'])
    names += [f'desc_{f}' for f in desc_vec.get_feature_names_out()]

    blocks = [price_block, csr_matrix(store_dense), bc_block, desc_block]

    # Color block (optional)
    if include_colors:
        blocks.append(csr_matrix(data[color_cols].values))
        names += list(color_cols)

    return hstack(blocks), names

# Build with colors
# Train and test go through the same fitted vectorizers so their columns line up.
X_train, feature_names = build_feature_matrix(
    train_data, breadcrumb_vectorizer, description_vectorizer,
    color_feature_cols, price_features, n_stores, include_colors=True
)
X_test, _ = build_feature_matrix(
    test_data, breadcrumb_vectorizer, description_vectorizer,
    color_feature_cols, price_features, n_stores, include_colors=True
)

y_train = train_data['target'].values
y_test = test_data['target'].values

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"Features: {len(feature_names)}")

# Validate
# One feature name per matrix column; the names are paired with the L1
# coefficients in the feature-importance step.
assert X_train.shape[1] == len(feature_names), f"Mismatch: {X_train.shape[1]} vs {len(feature_names)}"
print(f"✓ Feature alignment validated")

# ============================================================================
# STEP 8: TRAIN MODELS
# ============================================================================

print("\n" + "="*70)
print("STEP 8: TRAIN MODELS")
print("="*70)

# One dict per model (name, accuracy, weighted F1), compared at the end.
# NOTE(review): every score below is computed on the test split, and the best
# model is then chosen from these scores — the winning score is therefore an
# optimistic estimate.
results = []

# --- L1 Logistic Regression ---
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
print("\n--- Logistic Regression (L1) ---")
model_l1 = LogisticRegressionCV(
    cv=CV_FOLDS,
    penalty='l1',
    solver='saga',
    max_iter=2000,
    multi_class='multinomial',
    class_weight='balanced',
    random_state=RANDOM_STATE
)
model_l1.fit(X_train, y_train)
y_pred_l1 = model_l1.predict(X_test)
acc_l1 = accuracy_score(y_test, y_pred_l1)
f1_l1 = f1_score(y_test, y_pred_l1, average='weighted')
print(f"Accuracy: {acc_l1:.4f}, F1: {f1_l1:.4f}")
results.append({'Model': 'L1 (LASSO)', 'Accuracy': acc_l1, 'F1_weighted': f1_l1})

# --- L2 Logistic Regression ---
print("\n--- Logistic Regression (L2) ---")
model_l2 = LogisticRegressionCV(
    cv=CV_FOLDS,
    penalty='l2',
    solver='lbfgs',
    max_iter=2000,
    multi_class='multinomial',
    class_weight='balanced',
    random_state=RANDOM_STATE
)
model_l2.fit(X_train, y_train)
y_pred_l2 = model_l2.predict(X_test)
acc_l2 = accuracy_score(y_test, y_pred_l2)
f1_l2 = f1_score(y_test, y_pred_l2, average='weighted')
print(f"Accuracy: {acc_l2:.4f}, F1: {f1_l2:.4f}")
results.append({'Model': 'L2 (Ridge)', 'Accuracy': acc_l2, 'F1_weighted': f1_l2})

# --- Random Forest ---
print("\n--- Random Forest ---")
model_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1
)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
print(f"Accuracy: {acc_rf:.4f}, F1: {f1_rf:.4f}")
results.append({'Model': 'Random Forest', 'Accuracy': acc_rf, 'F1_weighted': f1_rf})

# --- Histogram Gradient Boosting ---
# The sparse matrices are converted to dense arrays for this model.
print("\n--- Histogram Gradient Boosting ---")
model_hgb = HistGradientBoostingClassifier(
    max_iter=200,
    max_depth=10,
    learning_rate=0.1,
    random_state=RANDOM_STATE
)
model_hgb.fit(X_train.toarray(), y_train)
y_pred_hgb = model_hgb.predict(X_test.toarray())
acc_hgb = accuracy_score(y_test, y_pred_hgb)
f1_hgb = f1_score(y_test, y_pred_hgb, average='weighted')
print(f"Accuracy: {acc_hgb:.4f}, F1: {f1_hgb:.4f}")
results.append({'Model': 'Hist Gradient Boosting', 'Accuracy': acc_hgb, 'F1_weighted': f1_hgb})

# --- SVM ---
# Features are scaled (without centering) on the training split only; the
# fitted scaler is then applied to the test split.
print("\n--- SVM (RBF) ---")
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=RANDOM_STATE
)
model_svm.fit(X_train_scaled, y_train)
y_pred_svm = model_svm.predict(X_test_scaled)
acc_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
print(f"Accuracy: {acc_svm:.4f}, F1: {f1_svm:.4f}")
results.append({'Model': 'SVM', 'Accuracy': acc_svm, 'F1_weighted': f1_svm})

# Results
# Sorted best-first by weighted F1; the first row is treated as the best model.
results_df = pd.DataFrame(results).sort_values('F1_weighted', ascending=False)
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(results_df.to_string(index=False))
results_df.to_csv(f'{OUTPUT_DIR}/model_comparison.csv', index=False)

# ============================================================================
# STEP 9: BEST MODEL ANALYSIS
# ============================================================================

print("\n" + "="*70)
print("BEST MODEL ANALYSIS")
print("="*70)

# results_df is sorted best-first by weighted F1.
best_name = results_df.iloc[0]['Model']
print(f"Best model: {best_name}")

# Get best predictions
# Keyed by the names stored in the results table; anything unexpected falls
# back to the SVM, as before.
fitted_models = {
    'L1 (LASSO)': (model_l1, y_pred_l1),
    'L2 (Ridge)': (model_l2, y_pred_l2),
    'Random Forest': (model_rf, y_pred_rf),
    'Hist Gradient Boosting': (model_hgb, y_pred_hgb),
}
best_model, y_pred_best = fitted_models.get(best_name, (model_svm, y_pred_svm))

# Pin the label set so the report and the matrix always have the three
# classes in a fixed order, even if one class is absent from the test split
# or from the predictions (matches the human-label confusion matrix in the
# validation step, which also passes labels=[0, 1, 2]).
class_ids = [0, 1, 2]
class_names = ['female', 'male', 'none']

print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, labels=class_ids, target_names=class_names))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_best, labels=class_ids)
print(f"            Predicted")
print(f"            female  male  none")
for i, label in enumerate(class_names):
    row = cm[i]
    print(f"Actual {label:6s}  {row[0]:4d}  {row[1]:4d}  {row[2]:4d}")

# ============================================================================
# STEP 10: FEATURE IMPORTANCE
# ============================================================================

print("\n" + "="*70)
print("FEATURE IMPORTANCE (L1 Model)")
print("="*70)

# One coefficient row per class; rows 0/1/2 are read as female/male/none,
# which assumes all three classes were present when model_l1 was fitted.
importance_df = pd.DataFrame({
    'feature': feature_names,
    'coef_female': model_l1.coef_[0],
    'coef_male': model_l1.coef_[1],
    'coef_none': model_l1.coef_[2]
})
# Rank features by their largest absolute coefficient across the classes.
importance_df['max_abs'] = importance_df[['coef_female', 'coef_male', 'coef_none']].abs().max(axis=1)
importance_df = importance_df.sort_values('max_abs', ascending=False)

# Only positive coefficients are listed per class.
print("\nTop 15 FEMALE features:")
for _, row in importance_df[importance_df['coef_female'] > 0].nlargest(15, 'coef_female').iterrows():
    print(f"  {row['feature']:40s}: {row['coef_female']:+.4f}")

print("\nTop 15 MALE features:")
for _, row in importance_df[importance_df['coef_male'] > 0].nlargest(15, 'coef_male').iterrows():
    print(f"  {row['feature']:40s}: {row['coef_male']:+.4f}")

print("\nTop 15 NONE features:")
for _, row in importance_df[importance_df['coef_none'] > 0].nlargest(15, 'coef_none').iterrows():
    print(f"  {row['feature']:40s}: {row['coef_none']:+.4f}")

importance_df.to_csv(f'{OUTPUT_DIR}/feature_importance.csv', index=False)

# ============================================================================
# STEP 11: PREDICT ON ALL PRODUCTS
# ============================================================================

print("\n" + "="*70)
print("STEP 11: PREDICT ON ALL PRODUCTS")
print("="*70)

# Same fitted vectorizers and column layout as the training matrix.
X_all, _ = build_feature_matrix(
    df, breadcrumb_vectorizer, description_vectorizer,
    color_feature_cols, price_features, n_stores, include_colors=True
)

print(f"Full dataset: {X_all.shape}")

# Use L1 for interpretability
# Compute the class probabilities once for the whole dataset and slice the
# columns, instead of re-running predict_proba for every class.
proba_all = model_l1.predict_proba(X_all)
df['ml_prob_female'] = proba_all[:, 0]
df['ml_prob_male'] = proba_all[:, 1]
df['ml_prob_none'] = proba_all[:, 2]
df['ml_pred'] = model_l1.predict(X_all)
df['ml_pred_label'] = df['ml_pred'].map({0: 'female', 1: 'male', 2: 'none'})
# Confidence is the probability of the most likely class.
df['ml_confidence'] = df[['ml_prob_female', 'ml_prob_male', 'ml_prob_none']].max(axis=1)

print(f"\nPrediction distribution:")
print(df['ml_pred_label'].value_counts())

# ============================================================================
# STEP 12: COMPARE WITH/WITHOUT COLOR FEATURES
# ============================================================================

print("\n" + "="*70)
print("STEP 12: COLOR FEATURE IMPACT ANALYSIS")
print("="*70)

# Train model WITHOUT colors
# Same rows and same vectorizers as the main model; only the color block is dropped.
X_train_no_color, features_no_color = build_feature_matrix(
    train_data, breadcrumb_vectorizer, description_vectorizer,
    color_feature_cols, price_features, n_stores, include_colors=False
)
X_test_no_color, _ = build_feature_matrix(
    test_data, breadcrumb_vectorizer, description_vectorizer,
    color_feature_cols, price_features, n_stores, include_colors=False
)

# Same estimator settings as model_l1 so the two runs differ only in features.
model_no_color = LogisticRegressionCV(
    cv=CV_FOLDS, penalty='l1', solver='saga', max_iter=2000,
    multi_class='multinomial', class_weight='balanced', random_state=RANDOM_STATE
)
model_no_color.fit(X_train_no_color, y_train)
y_pred_no_color = model_no_color.predict(X_test_no_color)

acc_no_color = accuracy_score(y_test, y_pred_no_color)
f1_no_color = f1_score(y_test, y_pred_no_color, average='weighted')

# Differences are reported in percentage points (with-color minus without-color).
print(f"WITH colors:    Accuracy={acc_l1:.4f}, F1={f1_l1:.4f}")
print(f"WITHOUT colors: Accuracy={acc_no_color:.4f}, F1={f1_no_color:.4f}")
print(f"Color impact:   Accuracy {'+' if acc_l1 > acc_no_color else ''}{(acc_l1-acc_no_color)*100:.2f}pp, F1 {'+' if f1_l1 > f1_no_color else ''}{(f1_l1-f1_no_color)*100:.2f}pp")

# On subset WITH color data only
# Rows whose index label appears in the color table.
train_with_color = train_data[train_data.index.isin(color_df.index)]
test_with_color = test_data[test_data.index.isin(color_df.index)]

# Only report the subset comparison when more than 10 test rows have colors.
if len(test_with_color) > 10:
    print(f"\nOn color-extracted subset only ({len(test_with_color)} test samples):")
    
    X_test_color_subset, _ = build_feature_matrix(
        test_with_color, breadcrumb_vectorizer, description_vectorizer,
        color_feature_cols, price_features, n_stores, include_colors=True
    )
    X_test_color_subset_no_color, _ = build_feature_matrix(
        test_with_color, breadcrumb_vectorizer, description_vectorizer,
        color_feature_cols, price_features, n_stores, include_colors=False
    )
    
    y_test_subset = test_with_color['target'].values
    
    # Both models were fitted on the full training split; only the evaluation
    # rows are restricted here.
    pred_with = model_l1.predict(X_test_color_subset)
    pred_without = model_no_color.predict(X_test_color_subset_no_color)
    
    acc_with = accuracy_score(y_test_subset, pred_with)
    acc_without = accuracy_score(y_test_subset, pred_without)
    
    print(f"  WITH colors:    Accuracy={acc_with:.4f}")
    print(f"  WITHOUT colors: Accuracy={acc_without:.4f}")

# ============================================================================
# STEP 13: VALIDATION VS HUMAN LABELS
# ============================================================================

print("\n" + "="*70)
print("STEP 13: VALIDATION VS HUMAN LABELS")
print("="*70)

# Products that carry a human label after the earlier merge.
human_labeled = df[df['label_human'].notna()].copy()
print(f"Products with human labels: {len(human_labeled)}")

if len(human_labeled) > 0:
    # Same integer coding as the training target; any other label text maps to
    # NaN and is dropped on the next line.
    human_labeled['human_encoded'] = human_labeled['label_human'].map({
        'female': 0, 'male': 1, 'none': 2
    })
    valid = human_labeled[human_labeled['human_encoded'].notna()]
    
    # NOTE(review): human-coded rows may also have been part of the training
    # data (the none class is built from them), so this is not a purely
    # out-of-sample check — confirm before quoting the figure.
    if len(valid) > 0:
        acc_human = accuracy_score(valid['human_encoded'], valid['ml_pred'])
        print(f"Accuracy vs human: {acc_human:.4f}")
        
        print("\nConfusion (ML vs Human):")
        # labels= fixes the matrix to 3x3 in female/male/none order.
        cm_h = confusion_matrix(valid['human_encoded'], valid['ml_pred'], labels=[0, 1, 2])
        print(f"            ML Predicted")
        print(f"            female  male  none")
        for i, label in enumerate(['female', 'male', 'none']):
            print(f"Human {label:6s}  {cm_h[i,0]:4d}  {cm_h[i,1]:4d}  {cm_h[i,2]:4d}")

# ============================================================================
# STEP 14: IMPLICIT GENDERING
# ============================================================================
# "Implicit" gendering here means: no explicit gender label was extracted
# (label_extracted == 'none'), yet the model predicts female/male with a
# confidence above IMPLICIT_CONF_THRESHOLD.

print("\n" + "="*70)
print("STEP 14: IMPLICIT GENDERING")
print("="*70)

# Single source of truth for the confidence cut-off used by both filters.
IMPLICIT_CONF_THRESHOLD = 0.5

# Shared part of both filters, computed once.
_unlabeled_confident = (
    (df['label_extracted'] == 'none') &
    (df['ml_confidence'] > IMPLICIT_CONF_THRESHOLD)
)
implicit_female = df[_unlabeled_confident & (df['ml_pred_label'] == 'female')]
implicit_male = df[_unlabeled_confident & (df['ml_pred_label'] == 'male')]
predicted_none = df[df['ml_pred_label'] == 'none']

print(f"Implicit female (high conf): {len(implicit_female):,}")
print(f"Implicit male (high conf): {len(implicit_male):,}")
print(f"Predicted none: {len(predicted_none):,}")

# Show the first five products of each group as a quick sanity check.
for _heading, _subset in (('FEMALE', implicit_female), ('MALE', implicit_male)):
    print(f"\nSample implicit {_heading}:")
    _preview = _subset.head(5)
    for _name, _conf in zip(_preview[COL_NAME], _preview['ml_confidence']):
        # A missing name is NaN (a float) and cannot be sliced; print an
        # empty name instead of raising TypeError.
        _shown = "" if pd.isna(_name) else str(_name)
        print(f"  {_shown[:60]}... (conf: {_conf:.2f})")

# ============================================================================
# STEP 15: EXPORT VALIDATION SAMPLE
# ============================================================================
# Draw up to N_PER not-yet-labelled products per predicted class, shuffle
# them, and write them to validation_sample.csv with empty columns for a
# human coder to fill in.

print("\n" + "="*70)
print("STEP 15: EXPORT VALIDATION SAMPLE")
print("="*70)

# Collect product IDs that already carry a manual label so they are not
# sampled again.
already_labeled = set()
if COL_PRODUCT_ID in human_coded.columns:
    already_labeled.update(human_coded[COL_PRODUCT_ID].values)
if len(your_labeled) > 0 and COL_PRODUCT_ID in your_labeled.columns:
    already_labeled.update(your_labeled[COL_PRODUCT_ID].values)

# Candidates: not labelled yet and with a non-missing image value.
available = df[
    (~df[COL_PRODUCT_ID].isin(already_labeled)) &
    (df[COL_IMAGE].notna())
].copy()

print(f"Available: {len(available):,}")

# Sample balanced
N_PER = 85
samples = []
for pred, label in [(0, 'female'), (1, 'male'), (2, 'none')]:
    pool = available[available['ml_pred'] == pred]
    n = min(N_PER, len(pool))
    if n > 0:
        samples.append(pool.sample(n=n, random_state=RANDOM_STATE))
        print(f"  Sampled {n} {label}")

if samples:
    # Shuffle so the exported rows are not grouped by predicted class.
    validation = pd.concat(samples).sample(frac=1, random_state=RANDOM_STATE)
else:
    # pd.concat([]) raises ValueError; fall back to an empty frame with the
    # same columns so the export below still produces a header-only file.
    print("  ⚠ No products available to sample; exporting an empty validation file")
    validation = available.iloc[0:0]

export_cols = [
    COL_PRODUCT_ID, COL_NAME, COL_DESC, COL_BREADCRUMB, COL_IMAGE, COL_URL, COL_PRICE,
    'label_extracted', 'ml_pred_label', 'ml_prob_female', 'ml_prob_male', 'ml_prob_none', 'ml_confidence'
]
# Keep only the columns that actually exist in this dataset.
export_cols = [c for c in export_cols if c in validation.columns]

validation_export = validation[export_cols].copy()
# Blank columns for the human coder.
validation_export['manual_gender'] = ''
validation_export['manual_confidence'] = ''
validation_export['manual_notes'] = ''

output_file = f'{OUTPUT_DIR}/validation_sample.csv'
# index=True keeps each row's index from df in the first CSV column.
validation_export.to_csv(output_file, index=True)
print(f"\n✓ Saved: {output_file} ({len(validation_export)} products)")

# ============================================================================
# SUMMARY
# ============================================================================
# Print a human-readable recap of the run and persist the same figures to
# summary.json in OUTPUT_DIR.

print("\n" + "="*70)
print("PIPELINE SUMMARY (v3)")
print("="*70)

# Count each predicted label once; reused by the printout and the JSON file.
_pred_counts = {
    _label: int((df['ml_pred_label'] == _label).sum())
    for _label in ('female', 'male', 'none')
}

print(f"""
DATA:
  Original: {original_count:,}
  After filtering: {len(df):,}
  Excluded categories: {excluded_count:,}

TRAINING:
  Total samples: {len(ml_data):,} (balanced 3-class)
  Train: {len(train_data):,}
  Test: {len(test_data):,}
  Products with colors: {len(color_df):,}

BEST MODEL: {best_name} (F1: {results_df.iloc[0]['F1_weighted']:.4f})

COLOR IMPACT:
  With colors:    F1={f1_l1:.4f}
  Without colors: F1={f1_no_color:.4f}

PREDICTIONS:
  Female: {_pred_counts['female']:,}
  Male: {_pred_counts['male']:,}
  None: {_pred_counts['none']:,}

IMPLICIT GENDERING:
  Implicit female: {len(implicit_female):,}
  Implicit male: {len(implicit_male):,}

OUTPUT:
  {OUTPUT_DIR}/
  ├── model_comparison.csv
  ├── feature_importance.csv
  ├── validation_sample.csv
  └── summary.json
""")


def _json_default(obj):
    """Convert NumPy scalars/arrays to built-in types for json.dump.

    json only calls this hook for objects it cannot serialize itself
    (e.g. np.int64, np.float32, np.bool_, np.ndarray). Anything that is not
    a NumPy value is rejected with the usual TypeError rather than being
    silently stringified.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save summary
summary = {
    'version': '3.0',
    'data': {
        'original': original_count,
        'filtered': len(df),
        'excluded': excluded_count,
        'training_samples': len(ml_data),
        'color_samples': len(color_df)
    },
    'models': results,
    'color_impact': {
        'with_colors_f1': f1_l1,
        'without_colors_f1': f1_no_color
    },
    'predictions': dict(_pred_counts)
}

with open(f'{OUTPUT_DIR}/summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=_json_default)

print(f"\n✓ Pipeline complete")

✓ Imports complete
✓ Output directory: /Users/leoss/Desktop/Portfolio/Website-/UK pink tax/Outputs/charts/ml_pipeline_v3

LOADING DATA
✓ Main dataset: 21,436 products
✓ Human-coded: 259 products
✓ Your labeled: 44 products

STEP 1: FILTER NON-GENDERED CATEGORIES
Original: 21,436
Excluded: 8,604
Remaining: 12,832

STEP 2: EXTRACT GENDER LABELS
Label distribution:
{'none': 10913, 'female': 1075, 'male': 844}
Human labels merged: 200

STEP 3: COLOR EXTRACTION (POST-FILTERING)
⚠ No cache found. Extracting colors for filtered products...
  This will take a while...
  Extracting colors for 2000 products...
    Progress: 0/2000 (0.0%)
    Progress: 100/2000 (5.0%)
    Progress: 200/2000 (10.0%)
    Progress: 300/2000 (15.0%)
    Progress: 400/2000 (20.0%)
    Progress: 500/2000 (25.0%)
    Progress: 600/2000 (30.0%)
    Progress: 700/2000 (35.0%)
    Progress: 800/2000 (40.0%)
