In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-ml-challenge-25/train.csv
/kaggle/input/amazon-ml-challenge-25/test.csv


In [2]:
# Load the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv("/kaggle/input/amazon-ml-challenge-25/train.csv"),
    pd.read_csv("/kaggle/input/amazon-ml-challenge-25/test.csv"),
)

In [3]:
import re
def clean_text(text):
    """Normalize free text into lowercase alphanumeric tokens.

    Missing values (None / NaN) become an empty string. Any other non-string
    scalar (e.g. a number) is converted with ``str()`` first, instead of
    raising ``AttributeError`` on ``.lower()``.

    Every character outside ``a-z``, ``0-9`` and whitespace is replaced by a
    space (so "11.25" becomes "11 25"), then runs of whitespace are collapsed
    to a single space and the ends are stripped.

    Args:
        text: the raw value to clean.

    Returns:
        str: the cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""
    # str() makes numeric or other scalar cells safe to lowercase.
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a normalized copy of the raw catalog text to both splits.
for frame in (train, test):
    frame['catalog_content_clean'] = frame['catalog_content'].apply(clean_text)

In [4]:
def extract_value_unit(text):
    """Pull the numeric ``Value:`` and the ``Unit:`` token out of catalog text.

    The two fields are parsed independently, so a malformed number does not
    prevent the unit from being returned.

    Args:
        text: raw catalog string that may contain fragments such as
            ``Value: 72.0`` and ``Unit: Ounce``. Non-string input (None, NaN)
            is treated as having neither field.

    Returns:
        tuple[float, str]: ``(value, unit)``. ``value`` is 0.0 when no
        ``Value:`` field is found or its captured characters do not form a
        valid number; ``unit`` is the lowercased first word after ``Unit:``,
        or "" when absent.
    """
    value = 0.0
    unit = ""
    # Missing catalog entries arrive as None/NaN; there is nothing to search.
    if not isinstance(text, str):
        return value, unit

    value_match = re.search(r'Value:\s*([0-9\.]+)', text)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The pattern also accepts strings like "1.2.3" or "."; keep the
            # 0.0 default for those, but still go on to parse the unit.
            pass

    # Only the first word is captured, so a unit like "Fl Oz" yields "fl".
    unit_match = re.search(r'Unit:\s*(\w+)', text)
    if unit_match:
        unit = unit_match.group(1).lower()
    return value, unit

# Parse Value/Unit once per row and build both columns from the resulting
# tuples in a single DataFrame, rather than allocating a pd.Series per row
# inside .apply (which is much slower on large frames).
for frame in (train, test):
    parsed = frame['catalog_content'].map(extract_value_unit).tolist()
    frame[['Value', 'Unit']] = pd.DataFrame(
        parsed, columns=['Value', 'Unit'], index=frame.index
    )

In [5]:
# Peek at the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,sample_id,catalog_content,image_link,price,catalog_content_clean,Value,Unit
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,item name la victoria green taco sauce mild 12...,72.0,fl
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,item name salerno cookies the original butter ...,32.0,ounce
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,item name bear creek hearty soup bowl creamy c...,11.4,ounce
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,item name judee s blue cheese powder 11 25 oz ...,11.25,ounce
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,item name kedem sherry cooking wine 12 7 ounce...,12.0,count


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb


# One-hot encode the Unit column as a dense array. Categories seen only in the
# test set encode to an all-zero row (handle_unknown='ignore').
try:
    # scikit-learn >= 1.2 spells the dense/sparse switch `sparse_output`;
    # the older `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
train_unit_ohe = ohe.fit_transform(train[['Unit']])
test_unit_ohe = ohe.transform(test[['Unit']])




In [7]:
# Fit a unigram+bigram TF-IDF vocabulary (capped at 5000 terms) on the training
# text only, then project the test text onto that same vocabulary.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_text = tfidf.fit_transform(train['catalog_content_clean'])
X_test_text = tfidf.transform(test['catalog_content_clean'])


In [8]:
from scipy.sparse import hstack


# Dense side features: the parsed numeric Value followed by the one-hot Unit columns.
X_train_numeric = np.hstack([train[['Value']].values, train_unit_ohe])
X_test_numeric = np.hstack([test[['Value']].values, test_unit_ohe])

# Append the dense block to the sparse TF-IDF matrix. Without an explicit
# format, scipy's hstack may return a COO matrix, which does not support row
# indexing; request CSR so the result can be sliced/split directly without an
# implicit format conversion downstream.
X_train_final = hstack([X_train_text, X_train_numeric], format='csr')
X_test_final = hstack([X_test_text, X_test_numeric], format='csr')

y_train = train['price'].values

# Guard against silently misaligned features and targets.
assert X_train_final.shape[0] == y_train.shape[0], "feature/target row count mismatch"
assert X_train_final.shape[1] == X_test_final.shape[1], "train/test feature count mismatch"

In [9]:
# Hold out 10% of the training rows for validation (fixed seed for repeatability).
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_final,
    y_train,
    test_size=0.1,
    random_state=42,
)


In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error


# Gradient-boosted regressor on the combined TF-IDF + Value/Unit features.
lgb_model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=63,
    subsample=0.6,
    # LightGBM only performs row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.6 setting above would be silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)

# Train on the 90% split only; the held-out rows are used purely for scoring.
lgb_model.fit(X_train_split, y_train_split)

# Score the held-out rows with mean absolute error.
y_val_pred = lgb_model.predict(X_val)
val_mae = mean_absolute_error(y_val, y_val_pred)
print(f"Validation MAE: {val_mae:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.868069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 958291
[LightGBM] [Info] Number of data points in the train set: 67500, number of used features: 5011
[LightGBM] [Info] Start training from score 23.595307


In [None]:
# Predict test prices and floor them at 0.01 so no zero or negative price is
# written to the submission. np.clip does this in one vectorised pass instead
# of calling a Python lambda once per row.
test_pred = lgb_model.predict(X_test_final)
test['price'] = np.clip(test_pred, 0.01, None)

# Keep only the id and the predicted price, and write them without the index.
submission = test[['sample_id', 'price']]
submission.to_csv('test_out.csv', index=False)

submission.head()

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``.
    Where both the true and predicted value are 0 the denominator is 0; that
    element is counted as zero error instead of producing NaN.

    Args:
        y_true: array-like of ground-truth values (lists are accepted).
        y_pred: array-like of predictions, broadcastable against ``y_true``.

    Returns:
        float: the mean of the per-element ratios multiplied by 100.
    """
    # Coerce so plain Python lists work with the arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero; elsewhere keep the 0 from `out`.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Report SMAPE on the held-out validation predictions.
val_smape = smape(y_true=y_val, y_pred=y_val_pred)
print(f"Validation SMAPE: {val_smape:.2f}%")
