In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV into `df`; the bare `df` on the last line makes the
# notebook render the frame as this cell's output.
df = pd.read_csv('/content/train.csv')
df

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [None]:
# Number of rows per distinct 'entity_name' value, most frequent first.
df['entity_name'].value_counts()

Unnamed: 0_level_0,count
entity_name,Unnamed: 1_level_1
item_weight,102786
depth,45127
width,44183
height,43597
voltage,9466
wattage,7755
item_volume,7682
maximum_weight_recommendation,3263


In [None]:
import pandas as pd
import re


# Allowed units for each entity type, keyed by entity name.
# Entities that measure the same kind of quantity accept the same units, so
# those unit lists are written once and copied into an independent set per
# entity.
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')

entity_unit_map = {
    'width': set(_length_units),
    'depth': set(_length_units),
    'height': set(_length_units),
    'item_weight': set(_weight_units),
    'maximum_weight_recommendation': set(_weight_units),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

def extract_unit(entity_value):
    """Return the trailing unit text of ``entity_value``.

    The unit is the final run of ASCII letters and whitespace in the
    whitespace-stripped value, itself stripped (e.g. ``'1.0 fluid ounce'``
    gives ``'fluid ounce'``).

    Returns ``None`` when ``entity_value`` is not a string or when the
    stripped value does not end in a letter.
    """
    # Non-string cells (e.g. NaN floats from missing data) carry no unit.
    if not isinstance(entity_value, str):
        return None

    trailing_text = re.search(r'[a-zA-Z\s]+$', entity_value.strip())
    return trailing_text.group(0).strip() if trailing_text else None

def is_valid_row(row):
    """Tell whether a row's ``entity_value`` ends in a unit allowed for its entity.

    ``row`` must support ``row['entity_name']`` and ``row['entity_value']``.
    Returns ``True`` only when the entity name is a key of ``entity_unit_map``
    and the unit found by ``extract_unit`` is in that entity's unit set;
    otherwise returns ``False``.
    """
    entity_name = row['entity_name']
    entity_value = row['entity_value']

    # Unknown entity names can never be valid.
    if entity_name not in entity_unit_map:
        return False

    unit = extract_unit(entity_value)
    # A missing/empty unit is rejected before the membership check.
    return bool(unit) and unit in entity_unit_map[entity_name]

# Evaluate every row once, then keep only the rows whose unit is valid.
valid_row_mask = df.apply(is_valid_row, axis=1)

# Drop any rows still missing an entity name or value after the unit filter.
filtered_df = df[valid_row_mask].dropna(subset=['entity_name', 'entity_value'])

# Persist the cleaned rows (without the index) and report how many survived.
filtered_df.to_csv('filtered_dataset.csv', index=False)

print(f"Rows after filtering: {len(filtered_df)}")
print("Filtered dataset saved to 'filtered_dataset.csv'.")


Rows after filtering: 263071
Filtered dataset saved to 'filtered_dataset.csv'.


In [None]:
# Load '/content/dataset_30k.csv' into `df_new` and display it.
# NOTE(review): no cell in this notebook writes a file with this name (the
# filtering cell saves 'filtered_dataset.csv'), and the displayed index runs
# past 263,000 rows, so this looks like the full filtered dataset rather than
# a 30k sample -- confirm where the file comes from.
df_new = pd.read_csv('/content/dataset_30k.csv')
df_new

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263066,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263067,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263068,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263069,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [None]:
# Number of rows per distinct 'entity_name' value in df_new, most frequent first.
df_new['entity_name'].value_counts()

Unnamed: 0_level_0,count
entity_name,Unnamed: 1_level_1
item_weight,102583
depth,45127
width,44183
height,43597
voltage,9466
wattage,7485
item_volume,7378
maximum_weight_recommendation,3252


In [None]:
# Target size of the balanced subset.
total_samples = 30000

# Rows available per entity class (largest class first) and the class names.
class_counts = df_new['entity_name'].value_counts()
classes = class_counts.index.tolist()

# Start from an even share per class, capped at what the class actually has.
samples_per_class = {}
for entity, count in class_counts.items():
    samples_per_class[entity] = min(total_samples // len(classes), count)

# Whatever the small classes could not supply is handed to the largest
# classes, in descending size order, until the shortfall is used up.
remaining_samples = total_samples - sum(samples_per_class.values())

for entity, available in class_counts.items():
    if remaining_samples <= 0:
        break
    # A class can absorb at most the rows it has not already contributed.
    extra_samples = min(remaining_samples, available - samples_per_class[entity])
    samples_per_class[entity] += extra_samples
    remaining_samples -= extra_samples

# Draw the planned number of rows from each class with a fixed seed.
dfs = []
for entity in class_counts.index:
    class_rows = df_new[df_new['entity_name'] == entity]
    dfs.append(class_rows.sample(n=samples_per_class[entity], random_state=42))

# Stack the per-class samples, then shuffle so classes are interleaved.
sampled_df = pd.concat(dfs)
sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the balanced subset to disk (index omitted).
sampled_df.to_csv('/content/sampled_filtered_train.csv', index=False)

# Show the resulting number of samples per class.
print(sampled_df['entity_name'].value_counts())

entity_name
item_weight                      4248
wattage                          3750
item_volume                      3750
width                            3750
depth                            3750
voltage                          3750
height                           3750
maximum_weight_recommendation    3252
Name: count, dtype: int64


In [None]:
# Draw 3200 rows (without replacement, fixed seed) from each 'entity_name'
# group of df. Sampling each group separately and concatenating keeps every
# sampled row's original df index label, which is needed to exclude exactly
# those rows below. A group with fewer than 3200 rows raises ValueError.
# NOTE(review): this samples from the unfiltered `df`, not from the
# unit-filtered data -- confirm that is intended.
sampled_rows = pd.concat(
    [group.sample(n=3200, random_state=42, replace=False)
     for _, group in df.groupby('entity_name')]
)

# Remove the sampled rows from the original dataframe. This must use the
# original index labels: resetting the index first would make the mask drop
# the rows labelled 0..N-1 instead of the rows that were actually sampled.
# Assumes df.index is unique (true for the default index from read_csv).
remaining_df = df[~df.index.isin(sampled_rows.index)]

# Only now renumber the sample 0..N-1 for saving.
sampled_3200 = sampled_rows.reset_index(drop=True)

# Extract 300 samples from the remaining data for each 'entity_name' and reset index
#sampled_300 = remaining_df.groupby('entity_name').apply(lambda x: x.sample(n=300, random_state=42, replace=False)).reset_index(drop=True)

# Save the samples to separate CSV files
sampled_3200.to_csv('data_train_25k.csv', index=False)
#sampled_300.to_csv('data_test.csv', index=False)

In [None]:
# Load '/content/filtered_dataset_30k.csv' and display it. This rebinds
# `df_new`, replacing the frame loaded earlier under the same name.
# NOTE(review): no cell in this notebook writes a file with this name (the
# balanced-sampling cell saves '/content/sampled_filtered_train.csv') --
# confirm this is that file renamed.
df_new = pd.read_csv('/content/filtered_dataset_30k.csv')
df_new


Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/81tBy5VCw0...,558374,item_weight,5.0 milligram
1,https://m.media-amazon.com/images/I/71nuXof1ce...,752266,wattage,60.0 watt
2,https://m.media-amazon.com/images/I/21g9GpirXB...,459516,item_volume,100.0 millilitre
3,https://m.media-amazon.com/images/I/818s35ccBE...,120569,item_volume,1.0 fluid ounce
4,https://m.media-amazon.com/images/I/71cXMrwRAA...,558374,item_weight,1.0 kilogram
...,...,...,...,...
29995,https://m.media-amazon.com/images/I/61T3be0kmo...,801829,maximum_weight_recommendation,150 kilogram
29996,https://m.media-amazon.com/images/I/51Afww11B6...,392533,depth,160.0 centimetre
29997,https://m.media-amazon.com/images/I/71kKKdwbyi...,630869,item_weight,57.0 gram
29998,https://m.media-amazon.com/images/I/711T5P1civ...,752266,voltage,"[0.0, 10.0] volt"
