In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset paths used by the
# following cells all live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Project locations on the mounted Drive.
csv_path = "/content/drive/MyDrive/amazonml/train.csv"  # training table
download_folder = "/content/drive/MyDrive/amazonml/product_images"  # downloaded product images

# Make sure the image folder is present; exist_ok makes this a no-op on re-runs.
os.makedirs(download_folder, exist_ok=True)


In [None]:
import pandas as pd

# Read the training table from Drive, then print its column index as a quick
# schema check before any feature work.
df = pd.read_csv(filepath_or_buffer=csv_path)
print(df.columns)


Index(['sample_id', 'catalog_content', 'image_link', 'price'], dtype='object')


In [None]:
# numpy is only imported in a later cell of this notebook, so import it here;
# otherwise this cell raises NameError on a fresh "Restart & Run All".
import numpy as np

# Regression target: log(1 + price). Predictions are mapped back to the price
# scale with np.expm1 at evaluation time.
df["price_log"] = np.log1p(df["price"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the table as a test set; the fixed seed keeps the split
# repeatable across runs.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
from pathlib import Path

# The images were downloaded into the Drive folder configured earlier.
image_folder = download_folder

# image_link -> bare file name -> expected local path -> "is it on disk?"
train_df["image_name"] = train_df["image_link"].map(lambda link: Path(link).name)
train_df["image_path"] = train_df["image_name"].map(
    lambda name: os.path.join(image_folder, name)
)
train_df["exists"] = train_df["image_path"].map(os.path.exists)

# Report how many training rows have no downloaded image.
missing = train_df.loc[~train_df["exists"]]
print(f"Missing images: {len(missing)}")

# Keep only rows backed by a file, and only the two columns the input
# pipeline needs. NOTE: this rebinds train_df and drops image_link, so the
# cell cannot be re-run without first re-running the split cell.
train_df = train_df.loc[train_df["exists"], ["image_path", "price_log"]].reset_index(drop=True)

train_df.head()


Missing images: 37264


Unnamed: 0,image_path,price_log
0,/content/drive/MyDrive/amazonml/product_images...,3.293983
1,/content/drive/MyDrive/amazonml/product_images...,2.707383
2,/content/drive/MyDrive/amazonml/product_images...,2.995232
3,/content/drive/MyDrive/amazonml/product_images...,4.727388
4,/content/drive/MyDrive/amazonml/product_images...,2.628285


In [None]:
# ==========================================================
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [None]:
# Hyperparameters shared by the input pipeline, the model and model.fit.
EPOCHS = 15
BATCH_SIZE = 32
IMG_SIZE = 160  # images are resized to IMG_SIZE x IMG_SIZE

# Carve 10% of the image-backed training rows off as a validation set.
# NOTE: this rebinds train_df, so re-running the cell splits the already
# reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    random_state=42,
    test_size=0.1,
)


In [None]:
def preprocess_image(path, label=None):
    """Load one image file and turn it into a model-ready tensor.

    Reads the file at ``path``, decodes it as a 3-channel image and resizes it
    to ``IMG_SIZE`` x ``IMG_SIZE``. Pixel values are deliberately left in the
    0-255 range: the Keras EfficientNet models include their own rescaling
    and expect 0-255 float inputs, so dividing by 255 here would feed the
    frozen ImageNet backbone values far outside what it was trained on.

    Args:
        path: File path of the image (string / scalar string tensor).
        label: Optional target value, passed through unchanged.

    Returns:
        The float32 image tensor when ``label`` is None, otherwise the tuple
        ``(image, label)``.
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    # No "/ 255.0" here: EfficientNetB0 normalises its input internally.
    img = tf.cast(img, tf.float32)
    if label is None:
        return img
    return img, label

In [None]:
def _image_price_dataset(frame, shuffle_buffer=None):
    """Build a batched, prefetched (image, price_log) dataset from a frame.

    ``frame`` must provide "image_path" and "price_log" columns. When
    ``shuffle_buffer`` is given, file paths are shuffled with that buffer size
    before the images are loaded.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (frame["image_path"].values, frame["price_log"].values)
    )
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(tf.data.AUTOTUNE)


# Training data is shuffled; validation data keeps its row order.
train_ds = _image_price_dataset(train_df, shuffle_buffer=2048)
val_ds = _image_price_dataset(val_df)


In [None]:
# Frozen ImageNet-pretrained EfficientNetB0 backbone without its classifier.
base_model = EfficientNetB0(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False,
    weights="imagenet",
)
base_model.trainable = False

# Regression head stacked on the backbone's feature map; layers are applied
# in list order.
head_layers = [
    GlobalAveragePooling2D(),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.3),
]
x = base_model.output
for layer in head_layers:
    x = layer(x)

# Single linear unit: the predicted price_log.
output = Dense(1)(x)

model = Model(inputs=base_model.input, outputs=output)


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
from tensorflow.keras.losses import Huber

# Huber loss: delta is the switch point between its L2 (small error) and
# L1 (large error) regimes.
huber_loss = Huber(delta=1.0)
optimizer = Adam(learning_rate=1e-3)

# NOTE(review): the saved training log in this notebook reports
# learning_rate 1.0000e-05 from the first epoch, which does not match the
# 1e-3 set here -- the stored output looks like it came from an earlier run;
# confirm before relying on those numbers.
model.compile(optimizer=optimizer, loss=huber_loss)


In [None]:
# Save the model whenever val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="efficientnet_price.keras",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)
# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
earlystop = EarlyStopping(
    monitor="val_loss",
    patience=4,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.3 after 2 stagnant epochs.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=2,
)

training_callbacks = [checkpoint, earlystop, reduce_lr]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)

Epoch 1/15
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4490s[0m 7s/step - loss: 5.6366 - val_loss: 2.6714 - learning_rate: 1.0000e-05
Epoch 2/15
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4250s[0m 7s/step - loss: 3.0275 - val_loss: 1.3817 - learning_rate: 1.0000e-05
Epoch 3/15
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4285s[0m 7s/step - loss: 2.6230 - val_loss: 1.3602 - learning_rate: 1.0000e-05
Epoch 4/15
[1m221/640[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m43:24[0m 6s/step - loss: 2.5331

In [None]:
# test_df still carries the raw CSV columns (plus price_log): the
# "image_path" column was only ever built for train_df, so indexing
# test_df["image_path"] raises KeyError. Derive the local paths here, into a
# new frame so this cell stays re-runnable.
test_eval_df = test_df.assign(
    image_path=test_df["image_link"].map(
        lambda link: os.path.join(image_folder, Path(link).name)
    )
)

# Drop rows whose image was never downloaded; tf.io.read_file would fail on them.
has_image = test_eval_df["image_path"].map(os.path.exists)
print(f"Missing test images: {int((~has_image).sum())}")
test_eval_df = test_eval_df[has_image].reset_index(drop=True)

test_ds = tf.data.Dataset.from_tensor_slices(
    (test_eval_df["image_path"].values, test_eval_df["price_log"].values)
)
test_ds = (
    test_ds.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Predict; the dataset is not shuffled, so predictions line up row-for-row
# with test_eval_df.
y_pred_log = model.predict(test_ds).flatten()
y_true_log = test_eval_df["price_log"].values

# Convert back to normal scale (inverse of the log1p applied to the target)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_true_log)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Pairs where both the true and the predicted value are 0 have a zero
    denominator; they count as a perfect prediction (0% error) instead of
    turning the whole score into NaN.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        The SMAPE score as a float in the range [0, 200].
    """
    # Accept lists / Series as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only divide where the denominator is non-zero; the rest stay at 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100

# Score the predictions on the original price scale and report the result in percent.
smape_score = smape(y_true=y_true, y_pred=y_pred)
print(f"✅ SMAPE on Test Data: {smape_score:.2f}%")