In [1]:
# IMPORTS AND DATA LOADING
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import urllib.request
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.sparse import csr_matrix, hstack
import pickle

In [2]:
# Loading data
# Read the train and test CSV files from the relative ../dataset directory.
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

# Print each frame's (rows, columns) shape as a quick sanity check.
print(" DATA SHAPES ")
print(f"Training data: {train_df.shape}")
print(f"Test data: {test_df.shape}")

 DATA SHAPES 
Training data: (75000, 4)
Test data: (75000, 3)


In [3]:
print(" DOWNLOADING IMAGES ")

def download_images_limited(image_links, download_folder, limit=1000):
    """Download up to ``limit`` images into ``download_folder``.

    Only the first ``limit`` entries of ``image_links`` are considered.
    Entries that are not strings are ignored, and a link whose base name
    already exists in ``download_folder`` is not fetched again.

    Each file is first written to ``<name>.part`` and renamed only after the
    transfer succeeded, so an interrupted download never leaves a truncated
    file that a later run would mistake for a finished one.

    Args:
        image_links: Sequence of image URLs (non-string entries are skipped).
        download_folder: Directory to store the images; created if missing.
        limit: Maximum number of leading links to process.

    Returns:
        None. Progress and a final summary are printed.
    """
    os.makedirs(download_folder, exist_ok=True)

    downloaded = 0
    already_present = 0
    failed = 0
    for image_link in image_links[:limit]:  # ONLY THE FIRST `limit` LINKS
        if not isinstance(image_link, str):
            continue

        filename = os.path.basename(image_link)
        image_save_path = os.path.join(download_folder, filename)
        if os.path.exists(image_save_path):
            already_present += 1
            continue

        partial_path = image_save_path + '.part'
        try:
            urllib.request.urlretrieve(image_link, partial_path)
            os.replace(partial_path, image_save_path)
        except Exception as ex:
            # Best effort: one bad link must not abort the batch, but the
            # failure is counted and the first few are shown.
            failed += 1
            if failed <= 5:
                print(f"Failed to download {image_link}: {ex}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written (or it is already gone) - nothing to clean up.
                pass
            continue

        downloaded += 1
        if downloaded % 100 == 0:
            print(f"Downloaded {downloaded}/{limit} images...")

    print(f"Image download completed! Downloaded {downloaded} images")
    if already_present or failed:
        print(f"Skipped {already_present} existing files; {failed} downloads failed")

 DOWNLOADING IMAGES 


In [4]:
# Fetch at most 1000 images for each split
download_images_limited(
    image_links=train_df['image_link'].tolist(),
    download_folder='../images_train',
    limit=1000,
)
download_images_limited(
    image_links=test_df['image_link'].tolist(),
    download_folder='../images_test',
    limit=1000,
)

Image download completed! Downloaded 0 images
Image download completed! Downloaded 0 images


In [5]:
print(" EXTRACTING ALL FEATURES ")

def extract_features(text):
    """Parse structured fields out of one catalog-content string.

    Args:
        text: Catalog text containing optional ``Value:``, ``Unit:``,
            ``Item Name:`` and ``Bullet Point N:`` entries. Anything that is
            not a string (e.g. a missing value) is treated as empty text.

    Returns:
        dict with keys ``value`` (float, 0.0 when absent or unparsable),
        ``unit`` (str, ``'unknown'`` when absent), ``item_name`` (str, ``''``
        when absent), ``bullet_count`` (int), ``text_length`` (int) and
        ``item_name_length`` (int).
    """
    if not isinstance(text, str):
        text = ''

    features = {}

    # Extracting Value (quantity)
    value_match = re.search(r'Value:\s*([0-9.]+)', text)
    try:
        features['value'] = float(value_match.group(1)) if value_match else 0.0
    except ValueError:
        # The pattern also matches strings such as "1.2.3" or "..." that are
        # not valid floats; treat those like a missing value.
        features['value'] = 0.0

    # Extracting Unit
    unit_match = re.search(r'Unit:\s*([^\n]+)', text)
    features['unit'] = unit_match.group(1).strip() if unit_match else 'unknown'

    # Extracting Item Name
    item_match = re.search(r'Item Name:\s*([^\n]+)', text)
    features['item_name'] = item_match.group(1).strip() if item_match else ''

    # Count bullet points
    features['bullet_count'] = len(re.findall(r'Bullet Point\s*\d+:', text))

    # Text length features
    features['text_length'] = len(text)
    features['item_name_length'] = len(features['item_name'])

    return features

 EXTRACTING ALL FEATURES 


In [6]:
def extract_image_features(image_path):
    """Return basic size metadata for the image at ``image_path``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        dict with ``image_width``, ``image_height``, ``image_aspect_ratio``
        (width / height, or 0 when the height is 0) and ``has_image`` (1).
        If the file cannot be opened or read for any reason, every value is
        0, including ``has_image``.
    """
    try:
        with Image.open(image_path) as img:
            return {
                'image_width': img.width,
                'image_height': img.height,
                'image_aspect_ratio': img.width / img.height if img.height > 0 else 0,
                'has_image': 1
            }
    except Exception:
        # Missing or unreadable images are expected; KeyboardInterrupt and
        # SystemExit are deliberately NOT swallowed so a long run can be stopped.
        return {'image_width': 0, 'image_height': 0, 'image_aspect_ratio': 0, 'has_image': 0}

In [7]:
def preprocess_features(features_df, value_cap=None):
    """Add capped/log value columns and coarse categorical buckets.

    Args:
        features_df: DataFrame with at least ``value``, ``unit`` and
            ``bullet_count`` columns. It is not modified.
        value_cap: Optional upper bound applied to ``value``. When ``None``
            (the default, and the original behaviour) the 99th percentile of
            ``features_df['value']`` is used. Pass the cap derived from the
            training frame when transforming other data so both are clipped
            at the same threshold.

    Returns:
        A copy of ``features_df`` with the extra columns ``value_capped``,
        ``value_log`` (``log1p`` of the capped value), ``unit_category`` and
        ``bullet_category``.
    """
    processed = features_df.copy()

    # Handling extreme values
    if value_cap is None:
        value_cap = features_df['value'].quantile(0.99)
    processed['value_capped'] = np.minimum(features_df['value'], value_cap)
    processed['value_log'] = np.log1p(processed['value_capped'])

    # Creating categories: units outside this list are pooled into 'Other'.
    # The comparison is case-sensitive, so 'Ounce' and 'ounce' stay distinct.
    common_units = ['Ounce', 'Count', 'Fl Oz', 'ounce', 'oz']
    processed['unit_category'] = processed['unit'].apply(
        lambda x: x if x in common_units else 'Other'
    )

    processed['bullet_category'] = processed['bullet_count'].apply(
        lambda x: 'None' if x == 0 else 'Few(1-4)' if x < 5 else 'Many(5+)'
    )

    return processed

In [8]:
# Parse the structured fields out of every training catalog entry
print("Extracting text features...")
train_features = list(map(extract_features, train_df['catalog_content']))
train_features_df = pd.DataFrame(train_features)
# Column-wise join of the raw rows with their parsed features
train_processed = pd.concat(
    [train_df, train_features_df],
    axis=1,
)

Extracting text features...


In [9]:
# Image metadata for every training row (all zeros when the file is absent)
print("Extracting image features...")
train_image_features = [
    extract_image_features(f'../images_train/{os.path.basename(link)}')
    for link in train_df.loc[train_processed.index, 'image_link']
]

train_image_df = pd.DataFrame(train_image_features, index=train_processed.index)
train_final = pd.concat([train_processed, train_image_df], axis=1)

Extracting image features...


In [10]:
# Preprocess
# preprocess_features returns a copy with value_capped, value_log,
# unit_category and bullet_category added; rebind train_final to that copy.
train_final = preprocess_features(train_final)

print("All features extracted and processed!")

All features extracted and processed!


In [11]:
print(" PREPARING TRAINING DATA ")

# Encoding categorical variables
# label_encoders will map column name -> fitted LabelEncoder so the same
# encoders can be reused on the test data later.
label_encoders = {}
# String-valued columns that need an integer encoding before modelling.
categorical_columns = ['unit_category', 'bullet_category']

 PREPARING TRAINING DATA 


In [12]:
# Fit one encoder per categorical column and keep it for later reuse
for col in categorical_columns:
    encoder = LabelEncoder()
    label_encoders[col] = encoder
    train_final[col + '_encoded'] = encoder.fit_transform(train_final[col])

In [13]:
# Model inputs: numeric text-derived columns, encoded categoricals, image metadata
feature_columns = (
    ['value_capped', 'value_log', 'text_length', 'item_name_length']
    + ['unit_category_encoded', 'bullet_category_encoded']
    + ['image_width', 'image_height', 'image_aspect_ratio', 'has_image']
)

In [14]:
# Design matrix (selected feature columns) and regression target (price)
X = train_final.loc[:, feature_columns]
y = train_final.loc[:, 'price']

In [15]:
# Splitting data
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Report the (rows, columns) shape of each side of the split.
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

Training set: (60000, 10)
Validation set: (15000, 10)


In [17]:
print(" ADDING TF-IDF FEATURES ")

# Item-name text for each side of the split, selected by row label
item_names_train = train_final['item_name'].loc[X_train.index]
item_names_val = train_final['item_name'].loc[X_val.index]

 ADDING TF-IDF FEATURES 


In [19]:
# Creating TF-IDF features
# Vocabulary is limited to 500 terms with English stop words removed.
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
# Fit on the training item names only, then reuse the fitted vectorizer on
# the validation names so no validation text influences the vocabulary.
tfidf_features_train = tfidf.fit_transform(item_names_train)
tfidf_features_val = tfidf.transform(item_names_val)

In [20]:
# Wrap the tabular feature blocks as CSR so they can be stacked with TF-IDF
existing_features_train = csr_matrix(X_train.to_numpy())
existing_features_val = csr_matrix(X_val.to_numpy())

In [21]:
# Place the tabular columns first and the TF-IDF columns after them, for both
# the training and the validation rows.
X_train_final = hstack([existing_features_train, tfidf_features_train])
X_val_final = hstack([existing_features_val, tfidf_features_val])

print(f"Final training features: {X_train_final.shape}")

Final training features: (60000, 510)


In [22]:
print(" BUILDING ENSEMBLE MODEL ")

# Convert to dense for ensemble 
# The stacked sparse matrices are materialised as dense arrays; these dense
# arrays are what the models below are fitted and validated on.
X_train_dense = X_train_final.toarray()  
X_val_dense = X_val_final.toarray()

 BUILDING ENSEMBLE MODEL 


In [23]:
# Defining models for ensemble
# All three regressors share random_state=42 for repeatable fits.

# Gradient-boosted trees (LightGBM): 500 rounds at learning rate 0.05.
lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    random_state=42
)

# Gradient-boosted trees (XGBoost): 300 rounds, trees limited to depth 6.
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

# Random forest: 100 trees, each limited to depth 10.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

In [24]:
# Creating ensemble
# No weights are passed, so the three named estimators are combined with
# VotingRegressor's default weighting.
ensemble = VotingRegressor([
    ('lightgbm', lgb_model),
    ('xgboost', xgb_model), 
    ('random_forest', rf_model)
])

print("Training ensemble model...")
# Fit on the dense training matrix; as the last expression of the cell the
# fitted estimator is also displayed.
ensemble.fit(X_train_dense, y_train)

Training ensemble model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85384
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 510
[LightGBM] [Info] Start training from score 23.598634


0,1,2
,estimators,"[('lightgbm', ...), ('xgboost', ...), ...]"
,weights,
,n_jobs,
,verbose,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Also training individual LightGBM for comparison
# NOTE: only n_estimators and random_state are set here, so learning_rate and
# num_leaves fall back to the library defaults (the recorded output shows 0.1
# and 31). This is therefore not the same configuration as the LightGBM
# member of the ensemble (0.05 / 63).
lgb_individual = lgb.LGBMRegressor(n_estimators=500, random_state=42)
lgb_individual.fit(X_train_dense, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.445693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85384
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 510
[LightGBM] [Info] Start training from score 23.598634


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [26]:
print(" MODEL EVALUATION ")

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
    A term whose true and predicted values are both zero is counted as zero
    error rather than producing ``0 / 0 = NaN``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
            Values are paired by position.

    Returns:
        The SMAPE as a float between 0 and 200.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = np.abs(y_true) + np.abs(y_pred)
    numerator = 2 * np.abs(y_pred - y_true)
    # Only divide where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100 * np.mean(ratio)

 MODEL EVALUATION 


In [28]:
# Ensemble predictions
# Score the fitted ensemble on the dense validation matrix.
y_pred_ensemble = ensemble.predict(X_val_dense)
smape_ensemble = smape(y_val, y_pred_ensemble)



In [29]:
# LightGBM alone predictions
# Score the stand-alone LightGBM model on the same dense validation matrix.
y_pred_lgb = lgb_individual.predict(X_val_dense)
smape_lgb = smape(y_val, y_pred_lgb)



In [30]:
# Lower SMAPE is better. The "improvement" is LightGBM's SMAPE minus the
# ensemble's, so a positive number means the ensemble scored lower and a
# negative number means LightGBM alone scored lower.
print(f"Ensemble SMAPE: {smape_ensemble:.2f}%")
print(f"LightGBM Alone SMAPE: {smape_lgb:.2f}%")
print(f"Ensemble Improvement: {smape_lgb - smape_ensemble:.2f}%")

Ensemble SMAPE: 63.09%
LightGBM Alone SMAPE: 60.80%
Ensemble Improvement: -2.29%


In [31]:
# Keep whichever candidate has the strictly lower validation SMAPE;
# on a tie the stand-alone LightGBM model is kept.
best_model, best_smape, model_type = (
    (ensemble, smape_ensemble, "ENSEMBLE")
    if smape_ensemble < smape_lgb
    else (lgb_individual, smape_lgb, "LIGHTGBM")
)

print(f"\n BEST MODEL: {model_type} with {best_smape:.2f}% SMAPE")


 BEST MODEL: LIGHTGBM with 60.80% SMAPE


In [32]:
print(" PREPARING TEST DATA ")

# Parse the same structured fields from every test catalog entry
test_features = list(map(extract_features, test_df['catalog_content']))
test_features_df = pd.DataFrame(test_features)
# Column-wise join of the raw rows with their parsed features
test_processed = pd.concat(
    [test_df, test_features_df],
    axis=1,
)

 PREPARING TEST DATA 


In [33]:
# Image metadata for every test row (all zeros when the file is absent)
test_image_features = [
    extract_image_features(f'../images_test/{os.path.basename(link)}')
    for link in test_df.loc[test_processed.index, 'image_link']
]

test_image_df = pd.DataFrame(test_image_features, index=test_processed.index)
test_final = pd.concat([test_processed, test_image_df], axis=1)

In [35]:
# Preprocessing test data
# NOTE(review): preprocess_features derives its 99th-percentile value cap from
# the frame it is given, so this call caps the test values at the test set's
# own quantile rather than the one used for training - confirm this is intended.
test_final = preprocess_features(test_final)

In [36]:
# Encoding test categorical variables
# Reuse the encoders fitted on the training data. A label the encoder never
# saw is mapped to len(classes_), i.e. one past the largest training code.
for col in categorical_columns:
    le = label_encoders[col]
    # Build the label -> code lookup once per column instead of calling
    # le.transform for every single row; the position in classes_ is exactly
    # the code LabelEncoder assigns.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    unseen_code = len(le.classes_)
    test_final[col + '_encoded'] = test_final[col].map(
        lambda x: code_by_label.get(x, unseen_code)
    )

In [37]:
# Assemble the test matrix: tabular columns first, then TF-IDF of the item
# names produced by the vectorizer fitted on the training names
X_test = test_final.loc[:, feature_columns]
test_item_names = test_final['item_name']
tfidf_features_test = tfidf.transform(test_item_names)
existing_features_test = csr_matrix(X_test.to_numpy())
X_test_final = hstack((existing_features_test, tfidf_features_test))

print(f"Test features shape: {X_test_final.shape}")

Test features shape: (75000, 510)


In [38]:
print(" MAKING FINAL PREDICTIONS ")

# Both candidate models were fitted and validated on dense arrays, so the test
# set is scored through the same dense representation whichever model won,
# instead of handing the sparse hstack result to the LightGBM branch.
X_test_dense = X_test_final.toarray()
final_predictions = best_model.predict(X_test_dense)

 MAKING FINAL PREDICTIONS 




In [40]:
# Floor every prediction at 0.1 so no price is zero or negative
final_predictions = np.clip(final_predictions, 0.1, None)

In [41]:
# Summarise the final predictions: range, mean and count.
print(" PREDICTION STATISTICS ")
print(f"Min: ${final_predictions.min():.2f}")
print(f"Max: ${final_predictions.max():.2f}") 
print(f"Mean: ${final_predictions.mean():.2f}")
print(f"Total predictions: {len(final_predictions)}")

 PREDICTION STATISTICS 
Min: $0.10
Max: $265.59
Mean: $23.70
Total predictions: 75000


In [42]:
print(" FINAL SUBMISSION ")

# Two-column submission frame: the test sample ids plus the predicted price
final_submission = test_df[['sample_id']].assign(
    price=final_predictions.astype(float)
)

 FINAL SUBMISSION 


In [43]:
# correct format
# Write the submission without the index column, with floats formatted to six
# decimal places.
final_submission.to_csv('../dataset/test_out.csv', index=False, float_format='%.6f')

print(" SUBMISSION CREATED: test_out.csv")
print(f"Shape: {final_submission.shape}")
print(f"Rows: {len(final_submission)}")

 SUBMISSION CREATED: test_out.csv
Shape: (75000, 2)
Rows: 75000


In [44]:
# Sanity-check the submission frame: preview, row count, strictly positive prices
print("\n VERIFICATION ")
print("First 5 predictions:")
print(final_submission.head())
print(f" All {len(final_submission)} predictions ready!")
print(f" All prices positive: {(final_submission['price'] > 0).all()}")


 VERIFICATION 
First 5 predictions:
   sample_id      price
0     100179  13.495586
1     245611  21.545522
2     146263  21.549744
3      95658  18.024891
4      36806  69.704904
 All 75000 predictions ready!
 All prices positive: True


In [46]:
print(" SAVING MODELS ")

os.makedirs('../models', exist_ok=True)

# Saving best model under a file name that matches the selected model type
if model_type == "ENSEMBLE":
    model_path = '../models/ensemble_model.pkl'
else:
    model_path = '../models/lightgbm_model.pkl'

with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# Saving preprocessing objects (needed to transform new data the same way)
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('../models/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

print("ALL MODELS SAVED!")
print(f" FINAL SOLUTION READY WITH {best_smape:.2f}% SMAPE!")
print("Files saved:")
# Report the path that was actually written rather than a fixed file name.
print(f"   - {model_path}")
print("   - ../models/tfidf_vectorizer.pkl")
print("   - ../models/label_encoders.pkl")

 SAVING MODELS 
ALL MODELS SAVED!
 FINAL SOLUTION READY WITH 60.80% SMAPE!
Files saved:
   - ../models/lightgbm_model.pkl
   - ../models/tfidf_vectorizer.pkl
   - ../models/label_encoders.pkl
