# Submission 1 of Team AIvengers

By Akshay Kumar, Anubroto Ghose and Devina Goel

# Testing if CUDA is configured properly

In [1]:
import torch
torch.cuda.is_available() 

True

# This is the main function

In this iteration of our solution, the Width Height and Depth predictions have been excluded. We will attempt that later

In [1]:
import os
import random
import pandas as pd
import nltk
import re
import easyocr 
import argparse
import cv2
from PIL import Image
import numpy as np
reader = easyocr.Reader(['en'],gpu=True)

entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

# Separate functions for each key in the entity_unit_map

def handle_width(image, category_id):
    unit_map = {
        'centimetre': ['cm', 'centimetre', 'centimeter'],
        'foot': ['ft', 'foot'],
        'inch': ['in', 'inch', '"'],
        'metre': ['m', 'metre', 'meter'],
        'millimetre': ['mm', 'millimetre', 'millimeter'],
        'yard': ['yd', 'yard']
    }    
    return 0

def handle_depth(image, category_id):
    keywords = {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
    return 0

def handle_height(image, category_id):
    keywords = {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
    return 0

def handle_item_weight(image_location, category_id):
    unit_map = {
        'gram': ['g', 'gram', 'gm'],
        'kilogram': ['kg', 'kilogram'],
        'microgram': ['mcg', 'microgram', 'μg'],
        'milligram': ['mg', 'milligram'],
        'ounce': ['oz', 'ounce'],
        'pound': ['ibs', 'Ibs', 'lb', 'lbs', 'pound'],
        'ton': ['ton', 't']
    }

    unit_map_reversed = {unit: full_form for full_form, units in unit_map.items() for unit in units}

    unit_patterns = '|'.join([re.escape(u) for u in unit_map_reversed.keys()])

    img = Image.open(image_location).convert('L')
    result = reader.readtext(np.asarray(img))
    
    text = ""
    for (bbox, t, prob) in result:
        text = text + " " + t
    
    #print(text)  

    tokens = nltk.word_tokenize(text)

    pattern = rf'(\d+\.?\d*)\s*({unit_patterns})'
    
    matches = re.findall(pattern, text, re.IGNORECASE)

    result = []
    for match in matches:
        value, unit = match[0], match[1].lower()
        
        if float(value) == 0:
            continue
        
        if unit in unit_map_reversed:
            full_form = unit_map_reversed[unit]
            result.append(f"{value} {full_form}")

    if result:
        return result[0]
    else:
        return 0

def handle_maximum_weight_recommendation(image_location, category_id):
    unit_map = {
        'gram': ['g', 'gram', 'gm'],
        'kilogram': ['kg', 'kilogram', 'kcs'],
        'microgram': ['mcg', 'microgram', 'μg'],
        'milligram': ['mg', 'milligram'],
        'ounce': ['oz', 'ounce'],
        'pound': ['ibs', 'Ibs', 'lb', 'lbs', 'pound'],
        'ton': ['ton', 't']
    }

    unit_map_reversed = {unit: full_form for full_form, units in unit_map.items() for unit in units}

    unit_patterns = '|'.join([re.escape(u) for u in unit_map_reversed.keys()])

    def extract_text(image):
        img = Image.open(image).convert('L')
        result = reader.readtext(np.asarray(img))

        text = " ".join([t for (_, t, _) in result])
        
        return text

    def find_max_weight(text):
        pattern = rf'(\d+\.?\d*)\s*({unit_patterns})'
        
        matches = re.findall(pattern, text, re.IGNORECASE)

        max_value = None
        max_unit = None

        for match in matches:
            value, unit = match[0], match[1].lower()

            value = float(value)

            
            if unit in unit_map_reversed:
                full_form = unit_map_reversed[unit]

                
                if max_value is None or value > max_value:
                    max_value = value
                    max_unit = full_form

        return max_value, max_unit

    
    text = extract_text(image_location)
    max_value, max_unit = find_max_weight(text)

    
    if max_value is None:
        img = Image.open(image_location)
        
        img_resized = img.resize((img.width * 2, img.height * 2), Image.Resampling.LANCZOS)

        
        result = reader.readtext(np.asarray(img_resized))

        
        text = " ".join([t for (_, t, _) in result])

        
        max_value, max_unit = find_max_weight(text)

    
    if max_value is not None:
        return f"{max_value} {max_unit}"
    else:
        return 0

def handle_voltage(image_location, category_id):
    
    unit_map = {
        'kilovolt': ['kV', 'kilovolt', 'kv'],
        'millivolt': ['mV', 'millivolt', 'mv'],
        'volt': ['V', 'volt', 'v']
    }

    
    unit_map_reversed = {unit: full_form for full_form, units in unit_map.items() for unit in units}

    
    unit_patterns = '|'.join([re.escape(u) for u in unit_map_reversed.keys()])

    
    img = Image.open(image_location).convert('L')
    result = reader.readtext(np.asarray(img))
    
    text = ""
    for (bbox, t, prob) in result:
        text = text + " " + t
        

    
    tokens = nltk.word_tokenize(text)

    
    pattern = rf'(\d+\.?\d*)\s*({unit_patterns})'

    
    matches = re.findall(pattern, text, re.IGNORECASE)

    
    result = []
    for match in matches:
        value, unit = match[0], match[1].lower()

        
        if unit in unit_map_reversed:
            full_form = unit_map_reversed[unit]
            result.append(f"{value} {full_form}")

    
    if result:
        return result[0]
    else:
        return 0

def handle_wattage(image_location, category_id):
    
    unit_map = {
        'kilowatt': ['kW', 'kilowatt', 'kw'],
        'watt': ['W', 'watt', 'w']
    }

    
    unit_map_reversed = {unit: full_form for full_form, units in unit_map.items() for unit in units}

    
    unit_patterns = '|'.join([re.escape(u) for u in unit_map_reversed.keys()])

    
    img = Image.open(image_location).convert('L')
    result = reader.readtext(np.asarray(img))
    text = ""
    for (bbox, t, prob) in result:
        text = text + " " + t
        
    tokens = nltk.word_tokenize(text)

    
    pattern = rf'(\d+\.?\d*)\s*({unit_patterns})'

    
    matches = re.findall(pattern, text, re.IGNORECASE)

    
    result = []
    for match in matches:
        value, unit = match[0], match[1].lower()

        
        if unit in unit_map_reversed:
            full_form = unit_map_reversed[unit]
            result.append(f"{value} {full_form}")

    
    if result:
        return result[0]
    else:
        return 0

def handle_item_volume(image_location, category_id):
    
    unit_map = {
        'centilitre': ['cl', 'centilitre'],
        'cubic foot': ['cubic foot', 'ft³'],
        'cubic inch': ['cubic inch', 'in³'],
        'cup': ['cup'],
        'decilitre': ['dl', 'decilitre'],
        'fluid ounce': ['Fl','Fl 0z','fl oz','fl. oz','fi. oz', 'fluid ounce', 'oz'],
        'gallon': ['gallon', 'gal'],
        'imperial gallon': ['imperial gallon', 'imp gal'],
        'litre': ['litre', 'liter', 'l'],
        'microlitre': ['microlitre', 'µl'],
        'millilitre': ['millilitre', 'ml'],
        'pint': ['pint', 'pt'],
        'quart': ['quart', 'qt']
    }

    
    unit_patterns = '|'.join([f"({'|'.join(units)})" for units in unit_map.values()])

    def extract_text(image):
        
        img = Image.open(image).convert('L')
        result = reader.readtext(np.asarray(img))

        
        text = " ".join([t for (_, t, _) in result])
        return text

    def find_volume(text):
        
        pattern = rf'(\d+\.?\d*)\s*({unit_patterns})'

        
        matches = re.findall(pattern, text, re.IGNORECASE)

        
        result = []
        for match in matches:
            value, unit = match[0], match[1].lower()

            
            for full_form, short_forms in unit_map.items():
                if unit in [u.lower() for u in short_forms]:
                    unit = full_form
                    break

            result.append(f"{value} {unit}")

        return result

    
    text = extract_text(image_location)
    result = find_volume(text)

    
    if not result:
        
        filtered_text = text  
        result = find_volume(filtered_text)

    
    if not result:
        img = Image.open(image_location)
        
        img_resized = img.resize((img.width * 2, img.height * 2), Image.Resampling.LANCZOS)
        
        resized_image_location = "resized_image.jpg"
        img_resized.save(resized_image_location)
        
        text = extract_text(resized_image_location)
        result = find_volume(text)

    
    if result:
        return result[0]
    else:
        return 0


def predictor(image_link, category_id, entity_name):
    '''
    Call your model/approach here
    '''
    
    if entity_name == 'width':
        return handle_width(image_link, category_id)
    elif entity_name == 'depth':
        return handle_depth(image_link, category_id)
    elif entity_name == 'height':
        return handle_height(image_link, category_id)
    elif entity_name == 'item_weight':
        return handle_item_weight(image_link, category_id)
    elif entity_name == 'maximum_weight_recommendation':
        return handle_maximum_weight_recommendation(image_link, category_id)
    elif entity_name == 'voltage':
        return handle_voltage(image_link, category_id)
    elif entity_name == 'wattage':
        return handle_wattage(image_link, category_id)
    elif entity_name == 'item_volume':
        return handle_item_volume(image_link, category_id)
    else:
        return "Invalid entity name"

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


# Preliminary Testing

In [2]:
handle_item_volume("Image1.jpg",3333)

'16 fluid ounce'

In [3]:
handle_item_volume("VolumeTest.jpg",3333)

'4 fluid ounce'

In [4]:
print(handle_width("Width.jpg",123))

0


In [5]:
print(handle_wattage("Wattage.jpg",222))

800 watt


In [6]:
print(handle_wattage("Wattage2.jpg",222))

150 watt


In [7]:
print(handle_voltage("Wattage.jpg",444))

36 volt


In [8]:
print(handle_item_weight("Weight.jpg",444))

500 gram


In [9]:
print(handle_item_weight("Test2.jpg",444))

0


In [10]:
print(handle_maximum_weight_recommendation("MaxWeight.jpg",444))

15.0 kilogram


In [11]:
print(handle_maximum_weight_recommendation("MaxWeight2.jpg",444))

350.0 pound


In [12]:
print(handle_maximum_weight_recommendation("MaxWeight3.jpg",444))

120.0 kilogram


In [13]:
print(handle_maximum_weight_recommendation("MaxWeight4.jpg",444))

0.0 milligram


In [14]:
print(handle_maximum_weight_recommendation("WeightTest.jpg",444))

0


# The runner code to download the image, and run it through the predictor

In [17]:
import pandas as pd
import os
import urllib.request
import time
from pathlib import Path
from tqdm import tqdm  
from PIL import Image
import glob  

def create_placeholder_image(image_path, width=100, height=100):
    """
    Create a black placeholder image if the original image fails to download.

    Parameters:
    - image_path: The path where the placeholder image will be saved.
    - width: The width of the placeholder image (default is 100 pixels).
    - height: The height of the placeholder image (default is 100 pixels).
    """
    placeholder = Image.new('RGB', (width, height), color='black')
    
    placeholder.save(image_path)

def download_image(image_link, save_folder, retries=3, delay=3):
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return image_save_path

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return image_save_path
        except:
            time.sleep(delay)
    
    create_placeholder_image(image_save_path)  
    return image_save_path

results_df = pd.DataFrame(columns=['index', 'prediction'])

def download_and_predict(image_link, save_folder, category_id, entity_name, retries=3, delay=3):
    image_save_path = download_image(image_link, save_folder, retries, delay)
    prediction = predictor(image_save_path, category_id, entity_name)
    return prediction

def process_images_from_csv(csv_path, download_folder, output_csv_path, save_interval=5000):
    df = pd.read_csv(csv_path)
    display(df)
    print(df.shape)

    image_links = df['image_link'].tolist()
    categories = df['group_id'].tolist()
    entities = df['entity_name'].tolist()
    indices = df['index'].tolist()  

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    results = []

    for i, (image_link, category_id, entity_name, index) in enumerate(tqdm(zip(image_links, categories, entities, indices), total=len(image_links), desc="Processing Images")):
        
        prediction = download_and_predict(image_link, download_folder, category_id, entity_name)
        print(f'index: {index}, prediction: {prediction}')
        results.append({'index': index, 'prediction': prediction})

        if (i + 1) % save_interval == 0:
            results_df = pd.DataFrame(results)
            
            partial_output_csv = output_csv_path.replace(".csv", f"_part_{(i + 1) // save_interval}.csv")
            results_df.to_csv(partial_output_csv, index=False)
            print(f"Saved progress after {i + 1} predictions to {partial_output_csv}")
            
            results.clear()

    
    if results:
        results_df = pd.DataFrame(results)
        partial_output_csv = output_csv_path.replace(".csv", f"_part_final.csv")
        results_df.to_csv(partial_output_csv, index=False)
        print(f"Final save: Remaining {len(results)} predictions saved to {partial_output_csv}")

        concatenate_results(output_csv_path)

def concatenate_results(final_output_csv):
    """
    Concatenate all partial CSV files into one final file.
    
    Parameters:
    - final_output_csv: The path where the final concatenated CSV will be saved.
    """
    
    partial_csvs = glob.glob(final_output_csv.replace(".csv", "_part_*.csv"))
    
    partial_csvs.sort()

    combined_df = pd.concat([pd.read_csv(csv_file) for csv_file in partial_csvs])

    combined_df.to_csv('resultshalf.csv', index=False)
    print(f"All partial files concatenated into resultshalf.csv")

process_images_from_csv('sample_test.csv', 'download_folder', 'sampletestresult.csv')


Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,width
1,1,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,depth
2,2,https://m.media-amazon.com/images/I/417NJrPEk+...,939426,maximum_weight_recommendation
3,3,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,voltage
4,4,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,wattage
...,...,...,...,...
83,83,https://m.media-amazon.com/images/I/81IYdOV0mV...,721522,maximum_weight_recommendation
84,84,https://m.media-amazon.com/images/I/81PG3ea0MO...,240413,voltage
85,85,https://m.media-amazon.com/images/I/81aZ2ozp1G...,805279,maximum_weight_recommendation
86,86,https://m.media-amazon.com/images/I/81qUmRUUTT...,603688,maximum_weight_recommendation


(88, 4)


Processing Images:   0%|                                                                        | 0/88 [00:00<?, ?it/s]

index: 0, prediction: 0
index: 1, prediction: 0


Processing Images:   3%|██▏                                                             | 3/88 [00:00<00:22,  3.71it/s]

index: 2, prediction: 0


Processing Images:   5%|██▉                                                             | 4/88 [00:01<00:29,  2.87it/s]

index: 3, prediction: 0


Processing Images:   6%|███▋                                                            | 5/88 [00:01<00:34,  2.44it/s]

index: 4, prediction: 2100 watt


Processing Images:   8%|█████                                                           | 7/88 [00:02<00:24,  3.30it/s]

index: 5, prediction: 0
index: 6, prediction: 0


Processing Images:  11%|███████▏                                                       | 10/88 [00:02<00:13,  5.62it/s]

index: 7, prediction: 55 watt
index: 8, prediction: 0
index: 9, prediction: 0
index: 10, prediction: 0
index: 11, prediction: 0
index: 12, prediction: 0
index: 13, prediction: 0


Processing Images:  17%|██████████▋                                                    | 15/88 [00:03<00:09,  7.64it/s]

index: 14, prediction: 6.75 pound
index: 15, prediction: 0
index: 16, prediction: 0
index: 17, prediction: 0
index: 18, prediction: 0
index: 19, prediction: 0
index: 20, prediction: 0
index: 21, prediction: 0
index: 22, prediction: 0
index: 23, prediction: 0


Processing Images:  28%|█████████████████▉                                             | 25/88 [00:03<00:03, 16.05it/s]

index: 24, prediction: 12 volt


Processing Images:  31%|███████████████████▎                                           | 27/88 [00:03<00:04, 13.97it/s]

index: 25, prediction: 0
index: 26, prediction: 0
index: 27, prediction: 0
index: 28, prediction: 0
index: 29, prediction: 0
index: 30, prediction: 0
index: 31, prediction: 0
index: 32, prediction: 0
index: 33, prediction: 0


Processing Images:  42%|██████████████████████████▍                                    | 37/88 [00:04<00:03, 15.23it/s]

index: 34, prediction: 75.0 pound
index: 35, prediction: 50.0 ton
index: 36, prediction: 0
index: 37, prediction: 0
index: 38, prediction: 0
index: 39, prediction: 0
index: 40, prediction: 0
index: 41, prediction: 0


Processing Images:  49%|██████████████████████████████▊                                | 43/88 [00:04<00:02, 16.84it/s]

index: 42, prediction: 43.0 pound
index: 43, prediction: 684 volt


Processing Images:  51%|████████████████████████████████▏                              | 45/88 [00:05<00:04,  9.22it/s]

index: 44, prediction: 0


Processing Images:  53%|█████████████████████████████████▋                             | 47/88 [00:05<00:04,  9.09it/s]

index: 45, prediction: 0
index: 46, prediction: 0
index: 47, prediction: 0


Processing Images:  56%|███████████████████████████████████                            | 49/88 [00:06<00:05,  7.14it/s]

index: 48, prediction: 71 ton


Processing Images:  57%|███████████████████████████████████▊                           | 50/88 [00:06<00:07,  5.37it/s]

index: 49, prediction: 0


Processing Images:  58%|████████████████████████████████████▌                          | 51/88 [00:07<00:08,  4.27it/s]

index: 50, prediction: 0


Processing Images:  59%|█████████████████████████████████████▏                         | 52/88 [00:07<00:09,  3.87it/s]

index: 51, prediction: 0
index: 52, prediction: 0
index: 53, prediction: 0
index: 54, prediction: 0
index: 55, prediction: 0


Processing Images:  65%|████████████████████████████████████████▊                      | 57/88 [00:08<00:06,  5.06it/s]

index: 56, prediction: 120.0 volt


Processing Images:  66%|█████████████████████████████████████████▌                     | 58/88 [00:08<00:08,  3.69it/s]

index: 57, prediction: 88 watt


Processing Images:  67%|██████████████████████████████████████████▏                    | 59/88 [00:09<00:09,  3.17it/s]

index: 58, prediction: 0
index: 59, prediction: 0
index: 60, prediction: 0
index: 61, prediction: 0
index: 62, prediction: 0


Processing Images:  73%|█████████████████████████████████████████████▊                 | 64/88 [00:10<00:06,  3.68it/s]

index: 63, prediction: 5 gram


Processing Images:  74%|██████████████████████████████████████████████▌                | 65/88 [00:11<00:08,  2.61it/s]

index: 64, prediction: 5.0 gram


Processing Images:  75%|███████████████████████████████████████████████▎               | 66/88 [00:11<00:08,  2.74it/s]

index: 65, prediction: 0


Processing Images:  76%|███████████████████████████████████████████████▉               | 67/88 [00:12<00:09,  2.15it/s]

index: 66, prediction: 0


Processing Images:  77%|████████████████████████████████████████████████▋              | 68/88 [00:13<00:10,  1.94it/s]

index: 67, prediction: 500 watt


Processing Images:  78%|█████████████████████████████████████████████████▍             | 69/88 [00:14<00:10,  1.83it/s]

index: 68, prediction: 0


Processing Images:  80%|██████████████████████████████████████████████████             | 70/88 [00:14<00:10,  1.73it/s]

index: 69, prediction: 0


Processing Images:  81%|██████████████████████████████████████████████████▊            | 71/88 [00:18<00:24,  1.47s/it]

index: 70, prediction: 0


Processing Images:  82%|███████████████████████████████████████████████████▌           | 72/88 [00:19<00:21,  1.37s/it]

index: 71, prediction: 0


Processing Images:  83%|████████████████████████████████████████████████████▎          | 73/88 [00:20<00:18,  1.20s/it]

index: 72, prediction: 0


Processing Images:  84%|████████████████████████████████████████████████████▉          | 74/88 [00:21<00:16,  1.20s/it]

index: 73, prediction: 0


Processing Images:  85%|█████████████████████████████████████████████████████▋         | 75/88 [00:22<00:14,  1.09s/it]

index: 74, prediction: 110 watt


Processing Images:  86%|██████████████████████████████████████████████████████▍        | 76/88 [00:23<00:12,  1.07s/it]

index: 75, prediction: 10.0 ton


Processing Images:  88%|███████████████████████████████████████████████████████▏       | 77/88 [00:24<00:10,  1.06it/s]

index: 76, prediction: 10 ton


Processing Images:  89%|███████████████████████████████████████████████████████▊       | 78/88 [00:26<00:11,  1.15s/it]

index: 77, prediction: 4.7 gram


Processing Images:  90%|████████████████████████████████████████████████████████▌      | 79/88 [00:27<00:09,  1.10s/it]

index: 78, prediction: 60 watt


Processing Images:  91%|█████████████████████████████████████████████████████████▎     | 80/88 [00:28<00:08,  1.09s/it]

index: 79, prediction: 265 volt


Processing Images:  92%|█████████████████████████████████████████████████████████▉     | 81/88 [00:28<00:06,  1.05it/s]

index: 80, prediction: 0


Processing Images:  93%|██████████████████████████████████████████████████████████▋    | 82/88 [00:30<00:06,  1.06s/it]

index: 81, prediction: 2.1 kilogram


Processing Images:  94%|███████████████████████████████████████████████████████████▍   | 83/88 [00:31<00:05,  1.06s/it]

index: 82, prediction: 660.0 pound


Processing Images:  95%|████████████████████████████████████████████████████████████▏  | 84/88 [00:32<00:05,  1.27s/it]

index: 83, prediction: 1500.0 pound


Processing Images:  97%|████████████████████████████████████████████████████████████▊  | 85/88 [00:34<00:04,  1.42s/it]

index: 84, prediction: 0


Processing Images:  98%|█████████████████████████████████████████████████████████████▌ | 86/88 [00:38<00:04,  2.05s/it]

index: 85, prediction: 500.0 pound


Processing Images:  99%|██████████████████████████████████████████████████████████████▎| 87/88 [00:39<00:01,  1.87s/it]

index: 86, prediction: 350.0 pound


Processing Images: 100%|███████████████████████████████████████████████████████████████| 88/88 [00:40<00:00,  2.19it/s]

index: 87, prediction: 350 pound
Final save: Remaining 88 predictions saved to sampletestresult_part_final.csv
All partial files concatenated into resultshalf.csv





# Use this to concatenate the partial results

In [31]:
import os
import pandas as pd
import glob

folder_path = os.getcwd()

csv_files = glob.glob(os.path.join(folder_path, "halfresults_part_*.csv"))

df_list = []

for file_path in sorted(csv_files):  
    print(f"Processing {file_path}")
    df = pd.read_csv(file_path)
    df_list.append(df)

if df_list:
    concatenated_df = pd.concat(df_list, ignore_index=True)
    
    # Save the concatenated dataframe to a new CSV file
    output_file = os.path.join(folder_path, "combined_results.csv")
    concatenated_df.to_csv(output_file, index=False)
    print(f"Concatenated file saved as {output_file}")
else:
    print("No matching CSV files found.")


Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_1.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_2.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_3.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_4.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_5.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_6.csv
Processing C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\halfresults_part_7.csv
Concatenated file saved as C:\Users\Akshay Kumar\Pyth\Amazon ML Challenge\combined_results.csv


# Use this to create the final submission file

In [30]:
import pandas as pd

test_df = pd.read_csv("test.csv")
results_df = pd.read_csv("combined_results.csv")

merged_df = pd.merge(test_df, results_df, on='index', how='left')
merged_df.replace("0","",inplace=True)
display(merged_df)
submission_df = merged_df[['index', 'prediction']]

submission_df.to_csv("submission.csv", index=False)

print("Submission file created as submission.csv")


Unnamed: 0,index,image_link,group_id,entity_name,prediction
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,
...,...,...,...,...,...
131182,131283,https://m.media-amazon.com/images/I/A1rVsIzEtk...,721522,maximum_weight_recommendation,
131183,131284,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,item_weight,
131184,131285,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,maximum_weight_recommendation,
131185,131286,https://m.media-amazon.com/images/I/A1tnTUPyr7...,853009,item_weight,


Submission file created as submission.csv
