<a href="https://colab.research.google.com/github/jai3546/AI_ROCKERS/blob/main/Amazon_Ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
import os, re, cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Deep Learning for image feature extraction
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image



In [None]:
train=pd.read_csv('/content/train.csv')
test=pd.read_csv('/content/test.csv')
print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (75000, 4)
Test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [39]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train["clean_text"] = train["catalog_content"].apply(clean_text)
test["clean_text"] = test["catalog_content"].apply(clean_text)
# Extract numeric quantity from catalog_content
def extract_quantity(text):
    nums = re.findall(r'\d+', text)
    return float(nums[0]) if nums else 1.0

train['quantity'] = train['clean_text'].apply(extract_quantity)
test['quantity'] = test['clean_text'].apply(extract_quantity)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,  # increase features
    ngram_range=(1, 3)   # include trigrams
)

X_train_text = vectorizer.fit_transform(train['clean_text'])
X_test_text = vectorizer.transform(test['clean_text'])

from scipy.sparse import hstack

X_train_combined = hstack([X_train_text, np.array(train['quantity']).reshape(-1,1)])
X_test_combined  = hstack([X_test_text, np.array(test['quantity']).reshape(-1,1)])


In [44]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from scipy.sparse import csr_matrix # Import csr_matrix
import lightgbm as lgb # Import lightgbm

# Combine features from your previous cell
X = X_train_combined
y = np.log1p(train['price'].values)  # log-transform target

# Convert to csr_matrix for efficient slicing
X = csr_matrix(X)
X_test_combined = csr_matrix(X_test_combined)

# Epsilon-safe SMAPE
def smape(y_true, y_pred, epsilon=1e-6):
    return 100 * np.mean(
        np.abs(y_true - y_pred) / ((np.abs(y_true) + np.abs(y_pred) + epsilon) / 2)
    )

# K-Fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(y))
mae_scores = []
smape_scores = []

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=8,
        random_state=42,
        n_jobs=-1
    )

    # Define a custom callback to print metrics
    class PrintMetricsCallback:
        def __init__(self, eval_set, eval_metric):
            self.eval_set = eval_set
            self.eval_metric = eval_metric

        def __call__(self, env):
            if env.evaluation_result_list:
                print(f"Round {env.iteration}:")
                for data_name, eval_name, result, _ in env.evaluation_result_list:
                    print(f"\t{data_name}'s {eval_name}: {result}")

    # Use the custom callback and early stopping
    callbacks = [
        PrintMetricsCallback(eval_set=[(X_val, y_val)], eval_metric='mae'),
        lgb.early_stopping(stopping_rounds=50, verbose=False) # verbose in early_stopping controls its own messages
    ]

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mae',
        callbacks=callbacks
    )

    val_preds_log = model.predict(X_val)
    val_preds = np.expm1(val_preds_log)  # back-transform
    y_val_orig = np.expm1(y_val)

    oof_preds[val_idx] = val_preds
    mae_scores.append(mean_absolute_error(y_val_orig, val_preds))
    smape_scores.append(smape(y_val_orig, val_preds))

print("CV MAE:", np.mean(mae_scores))
print("CV SMAPE:", np.mean(smape_scores))

# Optional: predict on test set
test_preds_log = model.predict(X_test_combined)
test_preds = np.expm1(test_preds_log)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 10.366215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1057768
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 9994
[LightGBM] [Info] Start training from score 2.740904
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 10.308117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1055922
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 9993
[LightGBM] [Info] Start training from score 2.738173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 9.856453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [In

In [46]:
# Predict prices for test set
test_preds_log = model.predict(X_test_combined)

# Ensure no negative predictions
test_preds = np.expm1(test_preds_log) # Use expm1 to back-transform from log scale
test_preds[test_preds < 0] = 0 # Set negative predictions to 0

# Create submission file
submission = pd.DataFrame({
    "sample_id": test["sample_id"],
    "price": test_preds
})

submission.to_csv("test_out.csv", index=False)
print("✅ test_out.csv created successfully!")
display(submission.head())

✅ test_out.csv created successfully!


Unnamed: 0,sample_id,price
0,100179,17.335933
1,245611,15.608065
2,146263,22.060413
3,95658,14.516362
4,36806,28.810085


In [47]:
from google.colab import files
files.download("test_out.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>