# Smart Product Pricing Challenge — Full Pipeline

This notebook contains an end-to-end pipeline: EDA, feature engineering (text + image), model training, CV (SMAPE), and submission generation.

Run cells sequentially. Adjust `IMAGE_SAMPLE_LIMIT`, `TFIDF_MAX_FEATURES`, and other config values before running heavy cells.


In [None]:
import os
from pathlib import Path

# Base directory for every input/output path used by the notebook. Set the
# PRICING_BASE_DIR environment variable to run on another machine; when it is
# unset the original location is used, so existing runs are unaffected.
BASE_DIR = os.environ.get("PRICING_BASE_DIR", "/Users/harshilpatel/CODE")

ZIP_PATH = os.path.join(BASE_DIR, "Student Resource.zip")  # competition archive
EXTRACT_DIR = os.path.join(BASE_DIR, "student_resource_extracted")  # unpack target
OUTPUT_SUBMISSION = os.path.join(BASE_DIR, "test_out_full.csv")  # submission CSV
IMAGE_SAMPLE_LIMIT = 2000  # max number of distinct image URLs to download
TFIDF_MAX_FEATURES = 10000  # vocabulary cap passed to TfidfVectorizer
print("Config variables set.")

Config variables set.


In [None]:
import zipfile
import os

# Unpack the competition archive when it is present. The CSV search below runs
# against EXTRACT_DIR either way, so a previously extracted copy still works.
if os.path.exists(ZIP_PATH):
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        z.extractall(EXTRACT_DIR)
    print("Extracted to", EXTRACT_DIR)
else:
    print("ZIP not found at", ZIP_PATH)


def _locate_csv(base_dir, prefix):
    """Find the CSV under ``base_dir`` whose file name starts with ``prefix``.

    The directory tree is walked in sorted order so the result does not depend
    on filesystem enumeration order. A file named exactly ``<prefix>.csv``
    (case-insensitive) is preferred; otherwise the first match is used.

    Args:
        base_dir: Root directory to search recursively.
        prefix: Lower-case file-name prefix, e.g. ``"train"``.

    Returns:
        The path of the chosen CSV, or None when nothing matches.
    """
    matches = []
    for root, dirs, files in os.walk(base_dir):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for f in sorted(files):
            name = f.lower()
            if name.startswith(prefix) and name.endswith(".csv"):
                matches.append(os.path.join(root, f))
    if not matches:
        return None
    exact = [p for p in matches if os.path.basename(p).lower() == prefix + ".csv"]
    chosen = exact[0] if exact else matches[0]
    if len(matches) > 1:
        # Make an ambiguous choice visible instead of silently picking one.
        print(f"Multiple '{prefix}*.csv' files found; using {chosen}")
    return chosen


# locate CSVs
train_path = _locate_csv(EXTRACT_DIR, "train")
test_path = _locate_csv(EXTRACT_DIR, "test")
print("train:", train_path)
print("test :", test_path)

Extracted to /Users/harshilpatel/CODE/student_resource_extracted
train: /Users/harshilpatel/CODE/student_resource_extracted/student_resource/dataset/train.csv
test : /Users/harshilpatel/CODE/student_resource_extracted/student_resource/dataset/test.csv


In [None]:
import pandas as pd

# Read both competition splits from the CSV paths located by the previous cell.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
print("Loaded train/test rows:", train.shape[0], test.shape[0])
train.head()

Loaded train/test rows: 75000 75000


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [None]:
# Quick EDA: schema, per-column missing counts, and the price distribution.
print("Columns:", list(train.columns))
print("Missing (train):")
print(train.isna().sum())
print("Price stats:")
print(train["price"].describe())

Columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Missing (train):
sample_id          0
catalog_content    0
image_link         0
price              0
dtype: int64
Price stats:
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
25%          6.795000
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64


In [None]:
import re
import numpy as np


def extract_ipq(text):
    """Pull the first quantity-like token out of a catalog description.

    Three unit-aware patterns are tried in priority order, case-insensitively:
    pack/count words, ``N x M`` multipliers, then weight/volume units. If none
    matches, the fallback returns the first standalone integer that is not
    followed by another standalone integer on the same line.

    Args:
        text: Raw catalog text; any non-string value is treated as missing.

    Returns:
        The matched substring (e.g. ``"12 ct"``), or ``np.nan`` when ``text``
        is not a string or contains no usable number.
    """
    if isinstance(text, str):
        unit_patterns = (
            r"(\b\d+\s*(?:pcs|pieces|count|ct|pk|pack)\b)",
            r"(\b\d+\s*(?:x|X)\s*\d*\b)",
            r"(\b\d+\s*(?:ml|l|g|kg|gm|oz)\b)",
        )
        # Lazy generator: later patterns are only searched if earlier ones miss.
        hits = (re.search(p, text, flags=re.IGNORECASE) for p in unit_patterns)
        hit = next((h for h in hits if h), None)
        if hit is None:
            hit = re.search(r"(\b\d+\b)(?!.*\b\d+\b)", text)
        if hit is not None:
            return hit.group(0)
    return np.nan


# Attach the raw quantity token to both splits, then preview the most common values.
for split_df in (train, test):
    split_df["ipq_raw"] = split_df["catalog_content"].apply(extract_ipq)
train["ipq_raw"].value_counts().head(20)

ipq_raw
1          3975
12         2690
6          2663
2          2064
2 Pack     1854
5 oz       1791
4          1706
5          1678
3 Pack     1504
3          1461
0          1305
8 oz       1285
8          1188
16 oz       986
16          927
12 oz       892
4 oz        887
12 Pack     791
10          772
24          760
Name: count, dtype: int64

In [None]:
import re


def clean_text(s):
    """Normalise free text for vectorisation.

    Lower-cases the input, then replaces URLs, every character outside
    ``a-z0-9`` and whitespace, and runs of whitespace with a single space
    (in that order) before trimming both ends.

    Args:
        s: Text to clean; any non-string value yields an empty string.

    Returns:
        The cleaned string, containing only lower-case letters, digits and
        single spaces.
    """
    if isinstance(s, str):
        cleaned = s.lower()
        # Order matters: URLs must go before punctuation is stripped, and
        # whitespace is collapsed last to absorb the inserted spaces.
        for pattern in (r"http\S+", r"[^a-z0-9\s]", r"\s+"):
            cleaned = re.sub(pattern, " ", cleaned)
        return cleaned.strip()
    return ""


from sklearn.feature_extraction.text import TfidfVectorizer

# Normalise the catalog text of both splits.
for split_df in (train, test):
    split_df["catalog_clean"] = split_df["catalog_content"].apply(clean_text)

# The vocabulary and IDF weights are learned from train and test text together,
# then each split is projected into that shared feature space.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=TFIDF_MAX_FEATURES)
combined_corpus = pd.concat([train["catalog_clean"], test["catalog_clean"]])
vectorizer.fit(combined_corpus)
X_text_train = vectorizer.transform(train["catalog_clean"])
X_text_test = vectorizer.transform(test["catalog_clean"])
print("TF-IDF shapes:", X_text_train.shape, X_text_test.shape)

TF-IDF shapes: (75000, 10000) (75000, 10000)


### Optional: sentence-transformers embeddings

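This cell computes sentence-transformer embeddings for the cleaned training text. It assumes the `sentence-transformers` package is already installed (the cell does not install it), may be slow, and needs internet access the first time to download the model. If offline, skip this cell or load a local model.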


In [None]:
# Example (do not run if offline):
from sentence_transformers import SentenceTransformer

# Encode every cleaned training description with the "all-MiniLM-L6-v2" model.
# NOTE: the name `model` is re-bound by the LightGBM training cell further down.
model = SentenceTransformer("all-MiniLM-L6-v2")
catalog_sentences = train["catalog_clean"].tolist()
emb_train = model.encode(catalog_sentences, show_progress_bar=True, batch_size=64)
# Save or subsample embeddings for modeling

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

### Image download and basic features

Download images with retry logic. Compute width, height, aspect, mean pixel. Then compute CNN embeddings using ResNet50 (torchvision).


In [None]:
import requests
from PIL import Image
import numpy as np

# Directory under the extraction root where downloaded product images are saved.
IMG_DIR = os.path.join(EXTRACT_DIR, "images")
os.makedirs(IMG_DIR, exist_ok=True)  # exist_ok: re-running the cell is not an error


def download_image(url, dest, retries=2):
    """Fetch ``url`` and save the response body to ``dest``.

    Makes up to ``retries + 1`` attempts, pausing 0.5 s between attempts.
    Failures are swallowed on purpose so that one bad link cannot abort a
    bulk download.

    Args:
        url: Image URL to request.
        dest: Path of the file to write.
        retries: Number of extra attempts after the first one fails.

    Returns:
        True once a response with HTTP status 200 has been written to
        ``dest``; False if every attempt failed.
    """
    # Imported locally: the retry pause needs `time`, which no visible cell
    # imports, so the first network error used to raise NameError instead of
    # retrying.
    import time

    for attempt in range(retries + 1):
        try:
            r = requests.get(url, timeout=8)
            if r.status_code == 200:
                with open(dest, "wb") as f:
                    f.write(r.content)
                return True
        except Exception:
            # Best-effort: any failure (network, file write) counts as a failed attempt.
            pass
        # Back off before the next attempt, but do not sleep after the last one.
        if attempt < retries:
            time.sleep(0.5)
    return False


def image_stats(path):
    """Compute basic size and brightness statistics for one image file.

    Args:
        path: Location of the image to open.

    Returns:
        A dict with keys ``width``, ``height``, ``aspect`` (width / height,
        NaN when the height is 0) and ``mean_px`` (mean over all RGB pixel
        values). If anything goes wrong while opening or decoding the file,
        every value is NaN.
    """
    stat_keys = ("width", "height", "aspect", "mean_px")
    try:
        with Image.open(path) as im:
            width, height = im.size
            pixels = np.array(im.convert("RGB"))
            aspect = float(width) / height if height else np.nan
            values = (width, height, aspect, float(pixels.mean()))
            return dict(zip(stat_keys, values))
    except Exception:
        # Unreadable or corrupt image: report all statistics as missing.
        return dict.fromkeys(stat_keys, np.nan)


import pandas as pd

# Download sample
# Only the first IMAGE_SAMPLE_LIMIT distinct training URLs are fetched; files
# are named by their position in that list.
links = train["image_link"].drop_duplicates().tolist()[:IMAGE_SAMPLE_LIMIT]
img_meta = []
for idx, link in enumerate(links):
    local_path = os.path.join(IMG_DIR, f"image_{idx}.jpg")
    if not download_image(link, local_path):
        continue  # links that could not be fetched get no metadata row
    img_meta.append({**image_stats(local_path), "url": link, "file": local_path})

img_df = pd.DataFrame(img_meta)
img_df.head()

Unnamed: 0,width,height,aspect,mean_px,url,file
0,1000,1000,1.0,234.657214,https://m.media-amazon.com/images/I/51mo8htwTH...,/Users/harshilpatel/CODE/student_resource_extr...
1,1200,1200,1.0,171.139793,https://m.media-amazon.com/images/I/71YtriIHAA...,/Users/harshilpatel/CODE/student_resource_extr...
2,500,500,1.0,174.827907,https://m.media-amazon.com/images/I/51+PFEe-w-...,/Users/harshilpatel/CODE/student_resource_extr...
3,500,500,1.0,205.804781,https://m.media-amazon.com/images/I/41mu0HAToD...,/Users/harshilpatel/CODE/student_resource_extr...
4,1000,1000,1.0,222.876347,https://m.media-amazon.com/images/I/41sA037+Qv...,/Users/harshilpatel/CODE/student_resource_extr...


### Image embeddings using ResNet50 (PyTorch)

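This cell computes ResNet50 embeddings for the downloaded images. It assumes `torch` and `torchvision` are already installed (the cell does not install them). Run on GPU for speed; note that the code as written runs on the CPU, so the model and input tensors must be moved to a CUDA device to benefit from one.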


In [None]:
import torch
from torchvision import models, transforms

import numpy as np

# Pretrained ResNet50 with its last child module removed, so the forward pass
# yields a feature vector per image (flattened to 2048 values below) rather
# than the output of the final layer. eval() switches the network to
# inference mode.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed version before switching.
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1]).eval()
# define transforms
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# compute embeddings for img_df['file']
embs = []
for path in img_df["file"].tolist():
    try:
        # Context manager (as in image_stats) guarantees the file handle is
        # released even when decoding fails part-way through.
        with Image.open(path) as im:
            x = transform(im.convert("RGB")).unsqueeze(0)  # add batch dimension
        with torch.no_grad():
            out = resnet(x)
        embs.append(out.cpu().numpy().reshape(-1))
    except Exception:
        # Keep row alignment with img_df: failed images get an all-NaN vector.
        embs.append(np.full((2048,), np.nan))

# convert to DataFrame
emb_arr = np.stack(embs)
emb_df = pd.DataFrame(emb_arr, columns=[f"emb_{i}" for i in range(emb_arr.shape[1])])
emb_df["url"] = img_df["url"].values
emb_df.head()



Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_2039,emb_2040,emb_2041,emb_2042,emb_2043,emb_2044,emb_2045,emb_2046,emb_2047,url
0,0.844571,1.227987,0.025293,0.002584,0.237795,1.381954,0.293015,0.034165,0.160291,0.098058,...,0.50813,0.274645,0.14253,0.056105,0.018445,0.220828,0.016182,0.369143,0.275119,https://m.media-amazon.com/images/I/51mo8htwTH...
1,0.110833,1.911485,0.000412,0.008957,0.811973,0.044334,0.249721,0.014019,0.286661,0.080281,...,0.281596,0.586324,0.04083,0.0,0.061991,0.052233,0.250995,0.00255,0.198911,https://m.media-amazon.com/images/I/71YtriIHAA...
2,0.554837,0.463608,0.228232,0.179348,1.11433,0.47175,0.371559,0.214669,0.42619,0.074604,...,0.263382,0.827038,0.054977,0.268808,0.146072,0.644417,0.318873,0.316151,0.465631,https://m.media-amazon.com/images/I/51+PFEe-w-...
3,0.725837,1.186047,0.348569,0.0,0.693171,0.340718,0.795682,1.04303,0.670529,0.058493,...,0.332458,0.719062,0.095259,0.15866,0.021819,0.267496,0.080402,0.356998,0.812485,https://m.media-amazon.com/images/I/41mu0HAToD...
4,0.365849,1.481238,0.38243,0.028895,0.238223,1.978534,0.431884,0.068905,0.006629,0.090226,...,0.570752,0.333824,0.056835,0.125275,0.001117,0.075989,0.054212,0.62928,0.018297,https://m.media-amazon.com/images/I/41sA037+Qv...


In [None]:
# Merge features back to train/test
# Both image tables are keyed by "url", which is matched against "image_link".
# Left joins keep every training row; rows without a downloaded image get NaN.
train_feat = train
for image_table in (img_df, emb_df):
    train_feat = train_feat.merge(
        image_table, left_on="image_link", right_on="url", how="left"
    )

# fill numeric
# NOTE(review): no visible cell creates an "ipq_num" column, so it is skipped
# by the membership check below — confirm whether it was meant to be derived
# from "ipq_raw".
num_cols = ["ipq_num", "width", "height", "mean_px"]
num_cols += [c for c in train_feat.columns if c.startswith("emb_")]
for col in [c for c in num_cols if c in train_feat]:
    train_feat[col] = train_feat[col].fillna(-1)  # -1 marks "no image feature"

train_feat.shape

(75000, 2061)

In [19]:
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

# prepare matrices
# Numeric feature columns are taken from train_feat; the same list, in the
# same order, is then imposed on the test frame.
num_cols = [
    c
    for c in train_feat.columns
    if c in ["ipq_num", "width", "height", "mean_px"] or c.startswith("emb_")
]
X_num = train_feat[num_cols].values

test_feat = test.merge(img_df, left_on="image_link", right_on="url", how="left").merge(
    emb_df, left_on="image_link", right_on="url", how="left"
)
# reindex pins test to exactly the training columns (order and count), adding
# an all-NaN column for anything missing, so train/test matrices always line up.
X_num_test = test_feat.reindex(columns=num_cols).fillna(-1).values

# format="csr" guarantees a row-sliceable matrix for the fold indexing below,
# independent of which format hstack would pick by default.
X_sparse = hstack([X_text_train, csr_matrix(X_num)], format="csr")
X_sparse_test = hstack([X_text_test, csr_matrix(X_num_test)], format="csr")

# target
y = train_feat["price"].values

# log transform
y_log = np.log1p(y)

# Model hyper-parameters are identical for every fold, so build them once.
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 128,
    "verbosity": -1,
}

# CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pred_oof = np.zeros(len(y))  # out-of-fold predictions on the price scale
pred_test = np.zeros(X_sparse_test.shape[0])  # fold-averaged test predictions

for fold, (tr, val) in enumerate(kf.split(X_sparse)):
    print("Fold", fold + 1)
    train_set = lgb.Dataset(X_sparse[tr], y_log[tr])
    val_set = lgb.Dataset(X_sparse[val], y_log[val])

    # use callback-based early stopping and logging
    model = lgb.train(
        params,
        train_set,
        num_boost_round=2000,
        valid_sets=[val_set],
        callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=200)],
    )

    # safe selection of iteration to use for predict
    n_iter = getattr(model, "best_iteration", None)
    if n_iter is None or n_iter == 0:
        n_iter = model.num_trees()

    # Predictions are made in log1p space and mapped back with expm1.
    pred_oof[val] = np.expm1(model.predict(X_sparse[val], num_iteration=n_iter))
    pred_test += (
        np.expm1(model.predict(X_sparse_test, num_iteration=n_iter)) / kf.n_splits
    )


# SMAPE
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Where both values are zero the denominator is replaced by 1e-6, so that
    pair contributes 0 instead of causing a division by zero.

    Args:
        y_true: Ground-truth values (scalar, list or array).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The SMAPE as a float; between 0 and 200 for finite inputs.
    """
    # Coerce up front so plain lists and scalars behave like arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    # np.where (rather than in-place masking) also works for 0-d input.
    denom = np.where(denom == 0, 1e-6, denom)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100


print("CV SMAPE:", smape(y, pred_oof))

# Save submission
# Predictions are floored at 0.01 before being written out.
submission_prices = np.clip(pred_test, 0.01, None)
out = test[["sample_id"]].assign(price=submission_prices)
out.to_csv(OUTPUT_SUBMISSION, index=False)
print("Saved submission to", OUTPUT_SUBMISSION)

Fold 1
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.698096
[400]	valid_0's rmse: 0.691454
[600]	valid_0's rmse: 0.691162
Early stopping, best iteration is:
[511]	valid_0's rmse: 0.690949
Fold 2
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.68189
[400]	valid_0's rmse: 0.676292
[600]	valid_0's rmse: 0.67613
Early stopping, best iteration is:
[504]	valid_0's rmse: 0.675751
Fold 3
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.685319
[400]	valid_0's rmse: 0.68121
Early stopping, best iteration is:
[345]	valid_0's rmse: 0.681003
Fold 4
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.677954
[400]	valid_0's rmse: 0.674321
Early stopping, best iteration is:
[493]	valid_0's rmse: 0.674219
Fold 5
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.692652
[400]	valid_0's rmse: 0.688222
Early stopping, best i

## Methodology (one-page)

The cell below prints a summary of the final parameters and metrics, which can serve as the basis for a one-page markdown write-up, `methodology_one_page.md`, describing dataset, preprocessing, modeling, and improvements. Edit as needed.


In [20]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Calculate additional metrics
# MSE/RMSE are computed on the original price scale from the out-of-fold predictions.
mse = mean_squared_error(y, pred_oof)
rmse = np.sqrt(mse)
smape_score = smape(y, pred_oof)  # Already calculated in previous cell

# Long feature lists are summarised by their count instead of being spelled out.
numeric_summary = (
    ", ".join(num_cols) if len(num_cols) < 10 else f"{len(num_cols)} features"
)

# Each entry is (section title, lines printed under that title).
report_sections = [
    (
        "DATA PATHS",
        [
            f"ZIP_PATH: {ZIP_PATH}",
            f"EXTRACT_DIR: {EXTRACT_DIR}",
            f"OUTPUT_SUBMISSION: {OUTPUT_SUBMISSION}",
        ],
    ),
    (
        "FEATURE ENGINEERING PARAMETERS",
        [
            f"IMAGE_SAMPLE_LIMIT: {IMAGE_SAMPLE_LIMIT}",
            f"TFIDF_MAX_FEATURES: {TFIDF_MAX_FEATURES}",
            f"TEXT FEATURES SHAPE: {X_text_train.shape}",
            f"IMAGE DIR: {IMG_DIR}",
            f"NUMERICAL FEATURES: {numeric_summary}",
        ],
    ),
    (
        "MODEL PARAMETERS",
        [
            "MODEL: LightGBM",
            f"OBJECTIVE: {params['objective']}",
            f"METRIC: {params['metric']}",
            f"LEARNING RATE: {params['learning_rate']}",
            f"NUM_LEAVES: {params['num_leaves']}",
            "NUM_BOOST_ROUNDS: 2000",
            "EARLY_STOPPING_ROUNDS: 100",
        ],
    ),
    (
        "CROSS-VALIDATION PARAMETERS",
        [
            "CV METHOD: KFold",
            f"N_SPLITS: {kf.n_splits}",
            f"SHUFFLE: {kf.shuffle}",
            f"RANDOM_STATE: {kf.random_state}",
        ],
    ),
    (
        "EVALUATION METRICS",
        [
            f"MSE: {mse:.6f}",
            f"RMSE: {rmse:.6f}",
            f"SMAPE: {smape_score:.6f}",
        ],
    ),
    (
        "TRANSFORMATION",
        [
            "TARGET TRANSFORMATION: Log(1+y)",
            "PREDICTION TRANSFORMATION: exp(pred)-1",
        ],
    ),
]

# Print all final parameters and metrics
banner = "=" * 50
print(banner)
print("FINAL PARAMETERS AND METRICS SUMMARY")
print(banner)
for section_title, section_lines in report_sections:
    print(f"\n--- {section_title} ---")
    for report_line in section_lines:
        print(report_line)

FINAL PARAMETERS AND METRICS SUMMARY

--- DATA PATHS ---
ZIP_PATH: /Users/harshilpatel/CODE/Student Resource.zip
EXTRACT_DIR: /Users/harshilpatel/CODE/student_resource_extracted
OUTPUT_SUBMISSION: /Users/harshilpatel/CODE/test_out_full.csv

--- FEATURE ENGINEERING PARAMETERS ---
IMAGE_SAMPLE_LIMIT: 2000
TFIDF_MAX_FEATURES: 10000
TEXT FEATURES SHAPE: (75000, 10000)
IMAGE DIR: /Users/harshilpatel/CODE/student_resource_extracted/images
NUMERICAL FEATURES: 2051 features

--- MODEL PARAMETERS ---
MODEL: LightGBM
OBJECTIVE: regression
METRIC: rmse
LEARNING RATE: 0.05
NUM_LEAVES: 128
NUM_BOOST_ROUNDS: 2000
EARLY_STOPPING_ROUNDS: 100

--- CROSS-VALIDATION PARAMETERS ---
CV METHOD: KFold
N_SPLITS: 5
SHUFFLE: True
RANDOM_STATE: 42

--- EVALUATION METRICS ---
MSE: 746.014998
RMSE: 27.313275
SMAPE: 51.846265

--- TRANSFORMATION ---
TARGET TRANSFORMATION: Log(1+y)
PREDICTION TRANSFORMATION: exp(pred)-1
