In [1]:
import pandas as pd

# Build an (image id, integer label) training table from the product and
# image CSVs, where each label encodes the product's root category.

# --- Load inputs ---------------------------------------------------------
# Both files are parsed with an explicit '\n' line terminator.
df_pdt = pd.read_csv('data/Cleaned_Products.csv', lineterminator='\n')
df_img = pd.read_csv('data/Images.csv', lineterminator='\n')

# --- Derive the root category --------------------------------------------
# The root category is whatever precedes the first '/' in `category`,
# with surrounding whitespace removed.
category_parts = df_pdt['category'].str.split('/')
df_pdt['root_category'] = category_parts.str[0].str.strip()

# --- Build the label mappings --------------------------------------------
# Labels are assigned in order of first appearance in the products table,
# so the numbering depends on the row order of the input file.
unique_categories = df_pdt['root_category'].unique().tolist()

# encoder: category name -> integer label
encoder = dict(zip(unique_categories, range(len(unique_categories))))
# decoder: integer label -> category name (exact inverse of `encoder`)
decoder = dict(zip(encoder.values(), encoder.keys()))

# Attach the integer label to every product row.
df_pdt['encoded_category'] = df_pdt['root_category'].map(encoder)

# --- Join images to their products ---------------------------------------
# Inner join: only images whose `product_id` matches a product `id` survive.
merged_df = pd.merge(
    df_img,
    df_pdt,
    how='inner',
    left_on='product_id',
    right_on='id',
)

# Each image inherits the label of the product it belongs to.
merged_df['labels'] = merged_df['encoded_category']

# --- Select and persist the training table -------------------------------
# `id_x` is the left-hand (images) frame's `id` column after the merge;
# presumably the image id -- confirm against the Images.csv schema.
output_columns = ['id_x', 'labels']
training_data = merged_df[output_columns]

training_data.to_csv('data/training_data.csv', index=False)

# --- Report --------------------------------------------------------------
# Confirm the save and show both mappings for reference.
print("Training data has been saved as 'training_data.csv'.")
print("Encoder (Category to Label):", encoder)
print("Decoder (Label to Category):", decoder)

# Notebook preview of the first few rows of the training table.
training_data.head()


Training data has been saved as 'training_data.csv'.
Encoder (Category to Label): {'Home & Garden': 0, 'Baby & Kids Stuff': 1, 'DIY Tools & Materials': 2, 'Music, Films, Books & Games': 3, 'Phones, Mobile Phones & Telecoms': 4, 'Clothes, Footwear & Accessories': 5, 'Other Goods': 6, 'Health & Beauty': 7, 'Sports, Leisure & Travel': 8, 'Appliances': 9, 'Computers & Software': 10, 'Office Furniture & Equipment': 11, 'Video Games & Consoles': 12}
Decoder (Label to Category): {0: 'Home & Garden', 1: 'Baby & Kids Stuff', 2: 'DIY Tools & Materials', 3: 'Music, Films, Books & Games', 4: 'Phones, Mobile Phones & Telecoms', 5: 'Clothes, Footwear & Accessories', 6: 'Other Goods', 7: 'Health & Beauty', 8: 'Sports, Leisure & Travel', 9: 'Appliances', 10: 'Computers & Software', 11: 'Office Furniture & Equipment', 12: 'Video Games & Consoles'}


Unnamed: 0,id_x,labels
0,912bb259-3ad9-457b-9db1-ce1da9016057,0
1,b166d305-b852-4bdd-83f4-465b20da94fa,0
2,68f5a29d-0075-4d60-81c1-ab684a82e50c,0
3,f6a309d7-d247-446a-9b5e-aceefdd4334d,0
4,2c2b3a6f-15b3-4289-937a-15482d9f5781,0
