<a href="https://colab.research.google.com/github/karthikeya-kar/amazon-ml/blob/main/amazon_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import requests

# Paths
TRAIN_CSV = '/content/train.csv'  # Path to your train.csv file
OUTPUT_FOLDER = '/content/output_2'  # Path to store organized dataset

# Map of entities and units
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint",
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Function to download an image from a URL
def download_image(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {url} to {save_path}")
    except Exception as e:
        print(f"Failed to download {url}. Error: {e}")

# Function to organize and download images
def organize_and_download_images(num_rows=5000):
    try:
        # Read the CSV file
        df = pd.read_csv(TRAIN_CSV)

        # Limit the DataFrame to the first `num_rows` rows
        df_subset = df.head(num_rows)

        for _, row in df_subset.iterrows():
            entity = row['entity_name']
            entity_value = row['entity_value']
            group_id = row['group_id']
            image_url = row['image_link']

            # Check if required fields are not empty
            if pd.isna(entity) or pd.isna(entity_value) or pd.isna(group_id) or pd.isna(image_url):
                continue  # Skip empty entries

            try:
                # Determine the unit from the entity_value (splitting by space)
                value, unit = entity_value.split()
            except ValueError:
                # If the value doesn't split correctly, skip the row
                print(f"Skipping row: {entity_value}")
                continue

            # Only process if the entity is in our map and unit is valid
            if entity in entity_unit_map and unit in entity_unit_map[entity]:
                # Create target directory based on entity and unit
                target_dir = os.path.join(OUTPUT_FOLDER, entity, unit)
                os.makedirs(target_dir, exist_ok=True)

                # Image file name format is [group_id]_[index].jpg
                image_name = f"{group_id}_{row.name}.jpg"  # Using row index as part of the image name
                image_target_path = os.path.join(target_dir, image_name)

                # Download the image
                download_image(image_url, image_target_path)

    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function to organize and download images
organize_and_download_images()


Downloaded https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg to /content/output_2/item_weight/gram/748919_0.jpg
Downloaded https://m.media-amazon.com/images/I/71gSRbyXmoL.jpg to /content/output_2/item_volume/cup/916768_1.jpg
Downloaded https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg to /content/output_2/item_weight/gram/459516_2.jpg
Downloaded https://m.media-amazon.com/images/I/612mrlqiI4L.jpg to /content/output_2/item_weight/gram/459516_3.jpg
Downloaded https://m.media-amazon.com/images/I/617Tl40LOXL.jpg to /content/output_2/item_weight/milligram/731432_4.jpg
Downloaded https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg to /content/output_2/item_weight/milligram/731432_5.jpg
Downloaded https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg to /content/output_2/item_weight/milligram/731432_6.jpg
Downloaded https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg to /content/output_2/item_weight/milligram/731432_7.jpg
Downloaded https://m.media-amazon.com/images/I/91Cma3RzseL.jpg to /co

In [None]:
import os
import numpy as np
from PIL import Image

# Define paths and parameters
data_dir = '/content/output'  # Path to your dataset
img_height, img_width = 224, 224  # Desired image dimensions

# Function to preprocess a single image
def preprocess_image(image_path):
    """
    Load an image file, resize it, and normalize it.
    """
    try:
        with Image.open(image_path) as img:
            # Resize image
            img = img.resize((img_width, img_height))

            # Convert image to numpy array
            img_array = np.array(img)

            # Ensure the image has 3 channels
            if img_array.ndim == 2:
                img_array = np.stack([img_array] * 3, axis=-1)
            elif img_array.shape[2] == 4:
                img_array = img_array[:, :, :3]

            # Normalize image to range [0, 1]
            img_array = img_array / 255.0

            return img_array
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

def preprocess_images_from_directory(directory):
    """
    Preprocess all images in a directory, organized by class subdirectories.
    """
    images = []
    labels = []

    print(f"Processing directory: {directory}")

    for entity in os.listdir(directory):
        entity_dir = os.path.join(directory, entity)

        if os.path.isdir(entity_dir):
            print(f"Processing entity directory: {entity_dir}")

            for unit in os.listdir(entity_dir):
                unit_dir = os.path.join(entity_dir, unit)

                if os.path.isdir(unit_dir):
                    print(f"Processing unit directory: {unit_dir}")

                    for file_name in os.listdir(unit_dir):
                        file_path = os.path.join(unit_dir, file_name)

                        if os.path.isfile(file_path):
                            print(f"Processing file: {file_path}")

                            img_array = preprocess_image(file_path)

                            if img_array is not None:
                                if img_array.shape == (img_height, img_width, 3):
                                    images.append(img_array)
                                    labels.append(f"{entity}_{unit}")
                                else:
                                    print(f"Image {file_path} has invalid shape {img_array.shape}")
                            else:
                                print(f"Failed to preprocess image: {file_path}")
                        else:
                            print(f"Skipping non-file: {file_path}")
                else:
                    print(f"Skipping non-directory: {unit_dir}")
        else:
            print(f"Skipping non-directory: {entity_dir}")

    if images:
        images = np.array(images)
        labels = np.array(labels)
        print(f"Processed {len(images)} images with shape {images.shape}")
        print(f"Labels: {np.unique(labels)}")
    else:
        print("No images processed.")

    return images, labels

# Example of using preprocess_images_from_directory
images, labels = preprocess_images_from_directory(data_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing file: /content/output/item_weight/gram/299791_4983.jpg
Processing file: /content/output/item_weight/gram/630869_2406.jpg
Processing file: /content/output/item_weight/gram/731432_3781.jpg
Processing file: /content/output/item_weight/gram/601746_2726.jpg
Processing file: /content/output/item_weight/gram/558374_8107.jpg
Processing file: /content/output/item_weight/gram/281678_15.jpg
Processing file: /content/output/item_weight/gram/767202_4681.jpg
Processing file: /content/output/item_weight/gram/750220_5692.jpg
Processing file: /content/output/item_weight/gram/895549_9658.jpg
Processing file: /content/output/item_weight/gram/993359_9273.jpg
Processing file: /content/output/item_weight/gram/630869_2783.jpg
Processing file: /content/output/item_weight/gram/396159_5666.jpg
Processing file: /content/output/item_weight/gram/523149_3650.jpg
Processing file: /content/output/item_weight/gram/957185_5831.jpg
Processing fi

In [13]:
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import ImageFile

# Enable loading truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Define paths and parameters
data_dir = '/content/output'  # Path to your dataset
img_height, img_width = 224, 224  # Input size for MobileNetV2
batch_size = 32  # Batch size
validation_split = 0.2  # 20% of data for validation

# Data augmentation and preprocessing for training
train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,  # Normalize pixel values to range [0, 1]
    rotation_range=20,  # Randomly rotate images
    width_shift_range=0.2,  # Randomly shift images horizontally
    height_shift_range=0.2,  # Randomly shift images vertically
    shear_range=0.2,  # Shear transformations
    zoom_range=0.2,  # Zoom in/out
    horizontal_flip=True,  # Randomly flip images horizontally
    fill_mode='nearest',  # Fill any empty pixels after transformations
    validation_split=validation_split  # Split the dataset into train/validation
)

# Create training data generator
train_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',  # We are doing classification
    subset='training'  # This is the training data
)

# Create validation data generator
validation_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',  # We are doing classification
    subset='validation'  # This is the validation data
)

# Check number of classes
num_classes = train_generator.num_classes
print(f"Number of classes: {num_classes}")


Found 7628 images belonging to 8 classes.
Found 1902 images belonging to 8 classes.
Number of classes: 8


In [14]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model

# Load the base model (MobileNetV2) without the top layer
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(img_height, img_width, 3))

# Freeze the base model to prevent training (can be unfrozen later if fine-tuning is needed)
base_model.trainable = False

# Add custom layers on top of MobileNetV2
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Reduces the feature dimensions
x = Dropout(0.3)(x)  # Dropout layer to prevent overfitting
x = Dense(128, activation='relu')(x)  # Fully connected layer
x = Dropout(0.3)(x)  # Another dropout for regularization
predictions = Dense(num_classes, activation='softmax')(x)  # Final classification layer




In [15]:


# Create the complete model
model = Model(inputs=base_model.input, outputs=predictions)




In [16]:
# Print model summary
model.summary()

In [17]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [18]:
import tensorflow as tf

# Define callbacks with the proper file extension
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')



In [19]:
# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // batch_size,
    epochs=10,  # You can increase the number of epochs if needed
    callbacks=[early_stopping, checkpoint]
)


Epoch 1/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 1s/step - accuracy: 0.7798 - loss: 0.8960 - val_accuracy: 0.8226 - val_loss: 0.6348
Epoch 2/10
[1m  1/238[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 40ms/step - accuracy: 0.9688 - loss: 0.1626

  self.gen.throw(typ, value, traceback)


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.9688 - loss: 0.1626 - val_accuracy: 0.8571 - val_loss: 0.5741
Epoch 3/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 1s/step - accuracy: 0.8281 - loss: 0.6250 - val_accuracy: 0.8236 - val_loss: 0.6385
Epoch 4/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7500 - loss: 0.7993 - val_accuracy: 0.9286 - val_loss: 0.8949
Epoch 5/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 1s/step - accuracy: 0.8322 - loss: 0.5675 - val_accuracy: 0.8204 - val_loss: 0.6311


In [20]:
# Save the final model
model.save('final_model.keras')

print("Training complete, and model saved as 'final_model.keras'.")

Training complete, and model saved as 'final_model.keras'.


In [22]:
# Load the saved model
from tensorflow.keras.models import load_model

model = load_model('final_model.keras')

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(train_generator, steps=train_generator.samples // batch_size)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 1s/step - accuracy: 0.8267 - loss: 0.5732
Test Loss: 0.5822
Test Accuracy: 0.8243


In [27]:

import pandas as pd

# Load the CSV file
test_df = pd.read_csv('test.csv')

# Check the first few rows
print(test_df.head())

   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   

  entity_name  
0      height  
1       width  
2      height  
3       depth  
4       depth  


In [23]:

import requests
from PIL import Image
from io import BytesIO
import numpy as np

# Function to download and preprocess an image
def preprocess_image(image_url):
    try:
        # Download image
        response = requests.get(image_url)
        response.raise_for_status()  # Check if the request was successful
        img = Image.open(BytesIO(response.content)).convert('RGB')  # Open image and convert to RGB

        # Resize image
        img = img.resize((224, 224))  # Resize to model's expected input size

        # Convert image to numpy array
        img_array = np.array(img)

        # Normalize image to range [0, 1]
        img_array = img_array / 255.0

        return img_array
    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return None


In [24]:
# Define the mapping based on the provided entity_unit_map
entity_unit_map = {
    "width": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "depth": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "height": ["centimetre", "foot", "millimetre", "metre", "inch", "yard"],
    "item_weight": ["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"],
    "maximum_weight_recommendation": ["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"],
    "voltage": ["millivolt", "kilovolt", "volt"],
    "wattage": ["kilowatt", "watt"],
    "item_volume": ["cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint",
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"]
}

def map_index_to_unit(entity_name, index):
    """
    Map an index to a unit based on the entity type.

    Args:
    - entity_name (str): The name of the entity (e.g., "width", "height").
    - index (int): The index of the unit.

    Returns:
    - str: The unit name corresponding to the index.
    """
    if entity_name in entity_unit_map:
        units = entity_unit_map[entity_name]
        if 0 <= index < len(units):
            return units[index]
    return 'Unknown'

In [25]:

def predict_image(image_url, entity_name):
    img_array = preprocess_image(image_url)

    if img_array is None:
        return None, None

    # Add batch dimension and make prediction
    img_array = np.expand_dims(img_array, axis=0)
    predictions = model.predict(img_array)

    # Assuming the model outputs two predictions: value and unit index
    predicted_value = predictions[0][0]
    predicted_unit_index = int(predictions[0][1])  # Convert to index if needed

    predicted_unit = map_index_to_unit(entity_name, predicted_unit_index)

    return predicted_value, predicted_unit


In [26]:
print(map_index_to_unit("width", 0))  # Should print: 'centimetre'
print(map_index_to_unit("item_weight", 3))  # Should print: 'gram'
print(map_index_to_unit("voltage", 2))  # Should print: 'volt'
print(map_index_to_unit("wattage", 1))  # Should print: 'watt'

centimetre
gram
volt
watt


In [None]:
results = []

for index, row in test_df.iterrows():

    image_url = row['image_link']
    group_id = row['group_id']
    entity_name = row['entity_name']

    # Get the predictions
    predicted_value, predicted_unit = predict_image(image_url, entity_name)

    if predicted_value is not None:
        # Combine predicted_value and predicted_unit into a single string
        predictions = f"{predicted_value} {predicted_unit}" if predicted_unit else str(predicted_value)
        results.append({

            'predictions': predictions  # Store in a single column
        })
    else:
        print(f"Prediction failed for image: {image_url}")

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('test_out.csv', index=False)
print("Predictions saved to test_out.csv")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms