In [1]:
pip install pandas numpy opencv-python torch torchvision transformers


Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.7-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.5 MB 6.7 MB/s eta 0:00:02
   -------- ------------------------------- 2.1/9.5 MB 6.2 MB/s eta 0:00:02
   ------------------ --------------------- 4.5/9.5 MB 8.1 MB/s eta 0:00:01
   ------------------------- -------------- 6.0/9.5 MB 8.0 MB/s eta 0:00:01
   ------------------------------- -------- 7.3/9.5 MB 7.8 MB/s eta 0:00:01
   -------------

In [5]:
import pandas as pd
# Read the training table from the dataset folder.
train_data = pd.read_csv(filepath_or_buffer='dataset/train.csv')


In [6]:
# Local path of each training image: the file name is whatever follows the last "/" of its link.
train_data['image_path'] = 'train_images/' + train_data['image_link'].apply(
    lambda link: link.rsplit("/", 1)[-1]
)


In [7]:
# Read the test table from the dataset folder.
test_data = pd.read_csv(filepath_or_buffer='dataset/test.csv')

In [8]:
# Local path of each test image: the file name is whatever follows the last "/" of its link.
test_data['image_path'] = 'test_images/' + test_data['image_link'].apply(
    lambda link: link.rsplit("/", 1)[-1]
)


In [9]:
# Units accepted for each entity type, keyed by entity name.
entity_unit_map = dict(
    # Linear dimensions
    width={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    depth={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    height={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    # Weights
    item_weight={'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    maximum_weight_recommendation={'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    # Electrical quantities
    voltage={'kilovolt', 'millivolt', 'volt'},
    wattage={'kilowatt', 'watt'},
    # Volumes
    item_volume={
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
        'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
    },
)

In [10]:
def get_default_unit(entity_name, unit_map=None):
    """Return a deterministic fallback unit for an entity type.

    Sets have no stable "first" element: with string members the iteration
    order can change from one interpreter run to the next, so picking
    ``next(iter(...))`` gives a different default on different runs. The
    alphabetically smallest unit is returned instead, which is reproducible.

    Args:
        entity_name: Entity type to look up (e.g. ``'width'``).
        unit_map: Optional mapping of entity name -> collection of unit names.
            Defaults to the notebook-level ``entity_unit_map``.

    Returns:
        The alphabetically first unit for the entity, or ``""`` when the
        entity is unknown or has no units.
    """
    if unit_map is None:
        unit_map = entity_unit_map
    if entity_name not in unit_map:
        return ""
    # default="" covers an empty unit collection instead of raising.
    return min(unit_map[entity_name], default="")

In [11]:
# Give every test row a fallback unit derived from its entity type.
test_data['default_unit'] = [
    get_default_unit(entity) for entity in test_data['entity_name']
]

In [28]:
import torch
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
import pandas as pd


In [29]:
# Loading the model: torchvision ResNet-50 built with the 'IMAGENET1K_V1' weights.
model = models.resnet50(weights='IMAGENET1K_V1')
# eval() puts the module in inference mode; as the last expression of the cell it
# also echoes the module structure in the cell output.
# NOTE(review): the network is used as-is, so model(image) presumably returns the
# classifier's class scores rather than pooled embeddings — confirm that this is
# what the downstream regressor should consume as "image features".
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [30]:
# Defining image transformation: fixed-size resize, tensor conversion, per-channel normalisation
IMAGE_SIZE = (224, 224)
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

In [31]:
# Extracting features from images
def extract_image_features(image_path):
    """Run one image through the notebook-level ``model`` and return its output.

    The image is opened, converted to RGB, passed through ``transform`` and fed
    to ``model`` as a batch of one with gradient tracking disabled.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The model output as a numpy array with size-1 dimensions squeezed out,
        or ``None`` when the image cannot be opened, decoded or processed (the
        error is printed so that one bad file does not abort a long run).
    """
    try:
        # The context manager releases the file handle deterministically, including
        # when decoding a truncated or corrupt file raises part-way through.
        with Image.open(image_path) as raw_image:
            rgb_image = raw_image.convert('RGB')  # independent in-memory copy
        batch = transform(rgb_image).unsqueeze(0)  # Adding batch dimension
        with torch.no_grad():
            features = model(batch)
        return features.squeeze().numpy()  # Returning as numpy array
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller skip this row.
        print(f"Error processing {image_path}: {e}")
        return None

In [32]:
# Applying feature extraction to train data with progress tracking
def process_data(data):
    """Extract features for every entry of ``data['image_path']``.

    A tqdm progress bar labelled "Processing images" tracks the run. The
    returned list has one entry per path, in the same order, each being
    whatever ``extract_image_features`` returned for that path.
    """
    progress = tqdm(data['image_path'], desc="Processing images")
    return [extract_image_features(path) for path in progress]

In [33]:
# Applying feature extraction to train and test data (train first, then test)
for frame in (train_data, test_data):
    frame['image_features'] = process_data(frame)

Processing images:   0%|          | 1098/263859 [01:29<5:16:13, 13.85it/s]

Error processing train_images/41NH8WgeBOL.jpg: image file is truncated (2 bytes not processed)


Processing images:  27%|██▋       | 70597/263859 [1:38:42<4:10:48, 12.84it/s]

Error processing train_images/41cxahpsHuL.jpg: image file is truncated (5 bytes not processed)


Processing images:  27%|██▋       | 70609/263859 [1:38:43<4:28:26, 12.00it/s]

Error processing train_images/51mPh8XSf7L.jpg: cannot identify image file 'train_images/51mPh8XSf7L.jpg'


Processing images:  27%|██▋       | 70676/263859 [1:38:49<4:13:26, 12.70it/s]

Error processing train_images/6188w25R09L.jpg: image file is truncated (5 bytes not processed)


Processing images:  27%|██▋       | 70703/263859 [1:38:52<3:56:30, 13.61it/s]

Error processing train_images/81OPKq3nB1L.jpg: image file is truncated (3 bytes not processed)


Processing images:  27%|██▋       | 70709/263859 [1:38:52<4:00:25, 13.39it/s]

Error processing train_images/71lxEiQBHlL.jpg: image file is truncated (5 bytes not processed)


Processing images:  27%|██▋       | 70800/263859 [1:39:00<3:59:26, 13.44it/s]

Error processing train_images/71FoRnojWQL.jpg: image file is truncated (1 bytes not processed)


Processing images:  27%|██▋       | 70806/263859 [1:39:01<4:10:17, 12.86it/s]

Error processing train_images/71tusN0H4KL.jpg: cannot identify image file 'train_images/71tusN0H4KL.jpg'


Processing images:  27%|██▋       | 70828/263859 [1:39:03<4:32:46, 11.79it/s]

Error processing train_images/817EsPs2NyL.jpg: image file is truncated (6 bytes not processed)


Processing images:  27%|██▋       | 70830/263859 [1:39:03<4:05:27, 13.11it/s]

Error processing train_images/71A8rZ+KHzL.jpg: image file is truncated (3 bytes not processed)


Processing images:  27%|██▋       | 70841/263859 [1:39:04<3:31:08, 15.24it/s]

Error processing train_images/61tRf8na-LL.jpg: image file is truncated (3 bytes not processed)


Processing images:  29%|██▉       | 77113/263859 [1:48:28<3:45:21, 13.81it/s]

Error processing train_images/41Xvpup0p6L.jpg: image file is truncated (3 bytes not processed)


Processing images:  30%|██▉       | 78615/263859 [1:50:42<4:00:04, 12.86it/s]

Error processing train_images/61svIMKhh5L.jpg: image file is truncated (6 bytes not processed)


Processing images:  30%|██▉       | 78624/263859 [1:50:42<3:37:02, 14.22it/s]

Error processing train_images/61n07sirnkL.jpg: image file is truncated (6 bytes not processed)


Processing images:  30%|██▉       | 78651/263859 [1:50:45<4:06:19, 12.53it/s]

Error processing train_images/71K6ilMyX5L.jpg: image file is truncated (1 bytes not processed)


Processing images:  30%|██▉       | 78657/263859 [1:50:45<3:56:13, 13.07it/s]

Error processing train_images/71dWUPBZm2L.jpg: image file is truncated (5 bytes not processed)


Processing images:  30%|██▉       | 78662/263859 [1:50:46<4:00:51, 12.81it/s]

Error processing train_images/41CPXmRXMRL.jpg: cannot identify image file 'train_images/41CPXmRXMRL.jpg'
Error processing train_images/71zPE5j24DL.jpg: image file is truncated (6 bytes not processed)


Processing images:  30%|██▉       | 78669/263859 [1:50:46<3:09:44, 16.27it/s]

Error processing train_images/71ifSZhyAOL.jpg: image file is truncated (3 bytes not processed)
Error processing train_images/81eyV2nbHzL.jpg: image file is truncated (1 bytes not processed)


Processing images:  30%|██▉       | 78674/263859 [1:50:46<3:19:16, 15.49it/s]

Error processing train_images/81DjNcE+wbL.jpg: cannot identify image file 'train_images/81DjNcE+wbL.jpg'
Error processing train_images/81vfppR95cL.jpg: image file is truncated (5 bytes not processed)


Processing images:  30%|██▉       | 78679/263859 [1:50:47<2:45:19, 18.67it/s]

Error processing train_images/61neBx93yqL.jpg: image file is truncated (3 bytes not processed)
Error processing train_images/51LB3f5gPQL.jpg: image file is truncated (49 bytes not processed)


Processing images:  30%|██▉       | 78757/263859 [1:50:54<3:43:42, 13.79it/s]

Error processing train_images/51VbeouyLLL.jpg: cannot identify image file 'train_images/51VbeouyLLL.jpg'


Processing images:  30%|██▉       | 78847/263859 [1:51:02<3:47:49, 13.54it/s]

Error processing train_images/511kqDJia0L.jpg: image file is truncated (3 bytes not processed)


Processing images:  30%|██▉       | 78895/263859 [1:51:07<4:10:37, 12.30it/s]

Error processing train_images/81xwx6dnIfL.jpg: image file is truncated (2 bytes not processed)


Processing images:  30%|██▉       | 79007/263859 [1:51:17<3:44:52, 13.70it/s]

Error processing train_images/61f4qbv+n2L.jpg: image file is truncated (4 bytes not processed)


Processing images:  30%|██▉       | 79009/263859 [1:51:17<3:46:08, 13.62it/s]

Error processing train_images/81uppVWx2zL.jpg: image file is truncated (5 bytes not processed)


Processing images:  30%|██▉       | 79141/263859 [1:51:29<3:37:46, 14.14it/s]

Error processing train_images/51V54cHSNXL.jpg: cannot identify image file 'train_images/51V54cHSNXL.jpg'


Processing images:  30%|███       | 79168/263859 [1:51:31<3:27:26, 14.84it/s]

Error processing train_images/71uonw0ip1L.jpg: image file is truncated (35 bytes not processed)


Processing images:  30%|███       | 79178/263859 [1:51:32<4:10:09, 12.30it/s]

Error processing train_images/71cMzRJQWeL.jpg: image file is truncated (3 bytes not processed)


Processing images:  30%|███       | 79383/263859 [1:51:51<4:17:45, 11.93it/s]

Error processing train_images/71dV--QHHML.jpg: image file is truncated (4 bytes not processed)


Processing images:  30%|███       | 79676/263859 [1:52:18<3:59:18, 12.83it/s]

Error processing train_images/71GqL7TL7cL.jpg: image file is truncated (1 bytes not processed)


Processing images:  30%|███       | 79861/263859 [1:52:35<3:54:43, 13.07it/s]

Error processing train_images/71duK-OC0bL.jpg: image file is truncated (6 bytes not processed)


Processing images:  30%|███       | 79921/263859 [1:52:40<4:03:48, 12.57it/s]

Error processing train_images/71cCskAMOnL.jpg: image file is truncated (0 bytes not processed)


Processing images:  30%|███       | 79938/263859 [1:52:42<3:59:41, 12.79it/s]

Error processing train_images/71yOV79bIeL.jpg: image file is truncated (13 bytes not processed)


Processing images:  34%|███▎      | 88961/263859 [2:06:17<4:10:14, 11.65it/s]

Error processing train_images/71oMhQuO6sL.jpg: image file is truncated (6 bytes not processed)


Processing images:  34%|███▍      | 89172/263859 [2:06:36<3:22:55, 14.35it/s]

Error processing train_images/71vQYlTxTqL.jpg: image file is truncated (0 bytes not processed)


Processing images:  39%|███▊      | 101667/263859 [2:25:28<3:32:50, 12.70it/s]

Error processing train_images/61Dj8OQCKcL.jpg: image file is truncated (4 bytes not processed)


Processing images:  40%|████      | 105781/263859 [2:31:43<3:08:42, 13.96it/s]

Error processing train_images/61tYP-bE5HL.jpg: image file is truncated (1 bytes not processed)
Error processing train_images/61tYP-bE5HL.jpg: image file is truncated (1 bytes not processed)


Processing images:  42%|████▏     | 110516/263859 [2:38:53<3:50:23, 11.09it/s]

Error processing train_images/81md9nEfQoS.jpg: image file is truncated (6 bytes not processed)


Processing images:  45%|████▍     | 118198/263859 [2:50:21<2:40:14, 15.15it/s]

Error processing train_images/61gq-o7nMlL.jpg: image file is truncated (131 bytes not processed)


Processing images:  45%|████▍     | 118429/263859 [2:50:42<3:01:58, 13.32it/s]

Error processing train_images/619B0fCA2eL.jpg: image file is truncated (2 bytes not processed)


Processing images:  45%|████▌     | 118843/263859 [2:51:19<3:00:08, 13.42it/s]

Error processing train_images/51x1b7eFCeL.jpg: image file is truncated (1 bytes not processed)


Processing images:  46%|████▌     | 121428/263859 [2:55:15<3:23:29, 11.67it/s]

Error processing train_images/71HWrBerepL.jpg: image file is truncated (5 bytes not processed)


Processing images:  47%|████▋     | 124197/263859 [2:59:34<3:14:20, 11.98it/s]

Error processing train_images/71basklnF8L.jpg: image file is truncated (3 bytes not processed)


Processing images:  49%|████▉     | 129737/263859 [3:08:25<2:40:48, 13.90it/s]

Error processing train_images/41hO04updoL.jpg: image file is truncated (3 bytes not processed)


Processing images: 100%|██████████| 263859/263859 [5:50:29<00:00, 12.55it/s]  
Processing images:  39%|███▉      | 51323/131187 [1:05:52<1:24:21, 15.78it/s]

Error processing test_images/51XReKLiOML.jpg: [Errno 2] No such file or directory: 'test_images/51XReKLiOML.jpg'


Processing images:  72%|███████▏  | 94047/131187 [2:03:42<43:57, 14.08it/s]  

Error processing test_images/617t4ItBmSS.jpg: [Errno 2] No such file or directory: 'test_images/617t4ItBmSS.jpg'


Processing images:  98%|█████████▊| 128610/131187 [2:56:17<04:10, 10.28it/s]

Error processing test_images/81eAv-jpiOL.jpg: [Errno 2] No such file or directory: 'test_images/81eAv-jpiOL.jpg'


Processing images: 100%|█████████▉| 130757/131187 [3:00:45<00:44,  9.61it/s]

Error processing test_images/91Y5t1M1BTL.jpg: image file is truncated (2 bytes not processed)


Processing images: 100%|██████████| 131187/131187 [3:01:41<00:00, 12.03it/s]


In [34]:
#Training a model on the extracted image features
from sklearn.ensemble import GradientBoostingRegressor

In [56]:
import numpy as np
# Preparing the training data: rows whose feature extraction produced None are left out
rows_with_features = train_data['image_features'].notnull()
X_train = np.array([vec for vec in train_data['image_features'] if vec is not None])  # Features
y_train = train_data.loc[rows_with_features, 'entity_value']  # Labels

In [58]:
import re

def extract_numeric_value(value_with_unit):
    """Parse the leading number out of a ``"<number> <unit>"`` string.

    Args:
        value_with_unit: Text such as ``"10.5 gram"``. Missing values
            (None/NaN) and non-string inputs are accepted.

    Returns:
        The leading number as a float (no unit conversion is applied), or NaN
        when the input is missing, is not a string, does not start with a
        number followed by a unit word, or the numeric part is malformed.
    """
    # Covers None/NaN as well as stray non-string cells, which re.match would reject
    # with a TypeError.
    if not isinstance(value_with_unit, str):
        return np.nan
    match = re.match(r"([0-9.]+)\s*([a-zA-Z]+)", value_with_unit)
    if match is None:
        return np.nan
    try:
        return float(match.group(1))
    except ValueError:
        # The pattern accepts any run of digits and dots, e.g. "1.2.3" or ".",
        # which is not a valid float.
        return np.nan

# Numeric part of every label, without unit conversion.
# NOTE: a later cell assigns this same column again from convert_value.
train_data['numeric_value'] = [
    extract_numeric_value(label) for label in train_data['entity_value']
]


In [59]:
# Multipliers that turn a value in the given weight unit into grams.
# Built once instead of on every call.
_GRAM_CONVERSION_FACTORS = {
    'kilogram': 1000,
    'gram': 1,
    'milligram': 0.001,
    'microgram': 1e-6,  # was missing, so microgram values were treated as grams
    'ounce': 28.3495,
    'pound': 453.592,
    'ton': 1e6
}


def convert_to_gram(value, unit):
    """Convert a weight to grams.

    Args:
        value: Numeric amount expressed in ``unit``.
        unit: Unit name, e.g. ``'kilogram'`` or ``'ounce'``.

    Returns:
        ``value`` multiplied by the unit's gram factor. Units that are not in
        the table (including non-weight units) fall back to a factor of 1, so
        the value is returned unchanged.
    """
    return value * _GRAM_CONVERSION_FACTORS.get(unit, 1)


def convert_value(value_with_unit):
    """Parse a ``"<number> <unit>"`` string and pass it to ``convert_to_gram``.

    Args:
        value_with_unit: Text such as ``"2 kilogram"``. Missing values
            (None/NaN) and non-string inputs are accepted.

    Returns:
        The result of ``convert_to_gram(number, unit)``, or NaN when the input
        is missing, is not a string, does not start with a number followed by
        a unit word, or the numeric part is malformed.
    """
    # Covers None/NaN as well as stray non-string cells, which re.match would reject
    # with a TypeError.
    if not isinstance(value_with_unit, str):
        return np.nan
    match = re.match(r"([0-9.]+)\s*([a-zA-Z]+)", value_with_unit)
    if match is None:
        return np.nan
    number_text, unit = match.groups()
    try:
        value = float(number_text)
    except ValueError:
        # The pattern accepts any run of digits and dots, e.g. "1.2.3" or ".",
        # which is not a valid float.
        return np.nan
    return convert_to_gram(value, unit)  # Converting to grams

# Parsed label values after convert_value's unit handling.
train_data['numeric_value'] = [
    convert_value(label) for label in train_data['entity_value']
]


In [61]:
# Keep only rows that have both a parsed numeric target and extracted image features
required_columns = ['numeric_value', 'image_features']
train_data = train_data.dropna(subset=required_columns)

# One feature vector per remaining row, plus the targets in the same row order
X_train = np.array(list(train_data['image_features']))
y_train = train_data['numeric_value'].values


In [69]:
pip install joblib


Note: you may need to restart the kernel to use updated packages.


In [70]:
import joblib
import logging
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [71]:
# Configuring logging: root logger at INFO level, so the logger.info() calls
# made by later cells are shown.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; later cells log through it.
logger = logging.getLogger(__name__)

In [72]:
# Ensuring X_train and y_train have the same length; log and abort otherwise
n_feature_rows, n_target_rows = len(X_train), len(y_train)
if n_feature_rows != n_target_rows:
    mismatch_message = "Feature and target data lengths do not match."
    logger.error(mismatch_message)
    raise ValueError(mismatch_message)

In [74]:
# Ensuring that X_train stays aligned with y_train after dropping NaNs.
# The same boolean mask is applied to BOTH arrays: filtering only the features
# would leave y_train longer than X_train whenever a NaN target is present.
valid_indices = ~np.isnan(y_train)
X_train = np.array(X_train)[valid_indices]
y_train = np.asarray(y_train)[valid_indices]

In [75]:
# Checking the dimensions again; log and abort if features and targets diverged
n_feature_rows, n_target_rows = len(X_train), len(y_train)
if n_feature_rows != n_target_rows:
    mismatch_message = "Mismatch in lengths of X_train and y_train after cleaning."
    logger.error(mismatch_message)
    raise ValueError(mismatch_message)

In [76]:
# Initializing and training the Gradient Boosting Regressor.
# random_state is pinned so that re-running the notebook rebuilds the same model.
regressor = GradientBoostingRegressor(random_state=42)
try:
    logger.info("Training the model...")
    regressor.fit(X_train, y_train)
    logger.info("Model training completed successfully.")
except Exception as e:
    # Record the failure in the log, then re-raise so the cell still stops with the
    # original traceback.
    logger.error(f"An error occurred during model training: {e}")
    raise

INFO:__main__:Training the model...
INFO:__main__:Model training completed successfully.


In [124]:
# Ensuring test_data has valid image features for prediction.
# One mask is computed once and used both to select the inputs and to write the
# predictions back, so the two can never disagree.
has_features = test_data['image_features'].notna()
X_test = test_data.loc[has_features, 'image_features'].tolist()

# Predicting the values using the trained regressor only for rows with valid image features
# (skipped entirely when no row has features, since predict needs at least one sample)
predictions = regressor.predict(X_test) if X_test else np.array([], dtype=float)

# Creating the 'value_pred' column with NaN as the "no prediction" marker: this keeps the
# column numeric instead of mixing "" strings with floats.
test_data['value_pred'] = np.nan

# Assigning predictions back to the original test data (only where image features are valid)
if len(predictions):
    test_data.loc[has_features, 'value_pred'] = predictions


In [131]:
def format_prediction(value, unit):
    """Format a predicted value as ``"<value> <unit>"`` with two decimals.

    Args:
        value: Predicted number, or a missing marker (NaN/None/``""``).
        unit: Unit name to append.

    Returns:
        ``""`` when the value is missing or when no usable unit is available
        (a bare number with a trailing space is not a "<value> <unit>" string);
        otherwise the absolute value with two decimals, a space, and the unit.
    """
    if pd.isna(value) or value == "":
        return ""
    if not isinstance(unit, str) or not unit.strip():
        return ""
    # The sign is dropped: the prediction is emitted as a magnitude.
    formatted_value = abs(value)

    return f"{formatted_value:.2f} {unit}"

# Applying the formatting function to the test data, one (value, unit) pair per row
test_data['prediction'] = [
    format_prediction(predicted_value, unit)
    for predicted_value, unit in zip(test_data['value_pred'], test_data['default_unit'])
]


In [132]:
# Ensuring the output contains 'index' and 'prediction' columns, matching the expected format
submission_columns = ['index', 'prediction']
output_df = test_data[submission_columns]

# Saving the predictions to a CSV file named 'test_out.csv' (DataFrame row labels are not written)
output_df.to_csv(path_or_buf='test_out.csv', index=False)


In [134]:
# Sanity checking: run the project's src/sanity.py script through the shell, passing it
# dataset/test.csv as the test file and the generated test_out.csv as the output file.
!python src/sanity.py --test_filename dataset/test.csv --output_filename test_out.csv


Parsing successfull for file: test_out.csv
