In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Importing the raw data

In [3]:
index = pd.read_csv('./datasets/data/raw_dataset/index.csv')
index.head()

Unnamed: 0,path,class_id
0,marvel/0001/001.jpg,1
1,marvel/0001/002.jpg,1
2,marvel/0001/003.jpg,1
3,marvel/0001/004.jpg,1
4,marvel/0001/005.jpg,1


In [4]:
test = pd.read_csv('./datasets/data/raw_dataset/test.csv')
test.head()

Unnamed: 0,path,class_id
0,test/001.jpg,32
1,test/002.jpg,32
2,test/003.jpg,32
3,test/004.jpg,32
4,test/005.jpg,1


In [5]:
metadata = pd.read_csv('./datasets/data/raw_dataset/metadata.csv', encoding='cp1252')
metadata.head()

Unnamed: 0,class_id,lego_ids,lego_names,minifigure_name
0,1,[76115],['Spider Mech vs. Venom'],SPIDER-MAN
1,2,[76115],['Spider Mech vs. Venom'],VENOM
2,3,[76115],['Spider Mech vs. Venom'],AUNT MAY
3,4,[76115],['Spider Mech vs. Venom'],GHOST SPIDER
4,5,[75208],"[""Yoda's Hut""]",YODA


## Cleaning and merging the data

In [6]:
def clean_metadata(intput: str) -> str:
	return intput.replace('[', '').replace(']', '').replace("'", '').replace('"', '')

In [7]:
metadata['lego_ids'] = metadata['lego_ids'].apply(clean_metadata)
metadata['lego_ids'] = metadata['lego_ids'].astype(int)
metadata['lego_names'] = metadata['lego_names'].apply(clean_metadata)

In [8]:
metadata.sample(10, random_state=99)

Unnamed: 0,class_id,lego_ids,lego_names,minifigure_name
14,15,76128,Molten Man Battle,MYSTERIO
27,28,76125,Iron Man Hall of Armor,IRON MAN MK 1
21,22,76162,Black Widows Helicopter Chase,TASKMASTER
28,29,76125,Iron Man Hall of Armor,IRON MAN MK 5
34,35,75183,Darth Vader Transformation,ANAKIN SKYWALKER
13,14,75254,AT-ST Raider from The Mandalorian,KLATOOINIAN RAIDER 2
19,20,76162,Black Widows Helicopter Chase,BLACK WIDOW
24,25,76123,Captain America: Outriders Attack,OUTRIDER 2
25,26,75926,Pteranodon Chase,OWEN GRADY
15,16,76128,Molten Man Battle,FIREFIGHTER


In [9]:
dataset = pd.merge(index, metadata, on='class_id')
dataset.sample(10, random_state=99)

Unnamed: 0,path,class_id,lego_ids,lego_names,minifigure_name
309,star-wars/0011/002.jpg,32,75273,Poe Damerons X-wing Fighter,JANNAH
49,star-wars/0001/010.jpg,5,75208,Yodas Hut,YODA
126,star-wars/0009/006.jpg,13,75254,AT-ST Raider from The Mandalorian,KLATOOINIAN RAIDER 1
211,marvel/0010/009.jpg,22,76162,Black Widows Helicopter Chase,TASKMASTER
81,star-wars/0004/006.jpg,8,75199,General Grievous Combat Speeder,MACE WINDU
61,star-wars/0002/010.jpg,6,75208,Yodas Hut,LUKE SKYWALKER
255,jurassic-world/0001/012.jpg,26,75926,Pteranodon Chase,OWEN GRADY
84,star-wars/0004/009.jpg,8,75199,General Grievous Combat Speeder,MACE WINDU
171,harry-potter/0002/002.jpg,19,75950,Aragogs Lair,RON WEASLEY
50,star-wars/0001/011.jpg,5,75208,Yodas Hut,YODA


In [10]:
test = pd.merge(test, metadata, on='class_id')
test.sample(10, random_state=99)

Unnamed: 0,path,class_id,lego_ids,lego_names,minifigure_name
25,test/026.jpg,27,75926,Pteranodon Chase,TRACKER TRAQUEUR RASTREADOR
33,test/034.jpg,21,76162,Black Widows Helicopter Chase,YELENA BELOVA
4,test/005.jpg,1,76115,Spider Mech vs. Venom,SPIDER-MAN
14,test/015.jpg,7,75208,Yodas Hut,R2-D2
63,test/064.jpg,33,7104,Desert Skiff,HAN SOLO
10,test/011.jpg,4,76115,Spider Mech vs. Venom,GHOST SPIDER
53,test/054.jpg,8,75199,General Grievous Combat Speeder,MACE WINDU
51,test/052.jpg,31,76125,Iron Man Hall of Armor,IRON MAN MK 50
6,test/007.jpg,2,76115,Spider Mech vs. Venom,VENOM
61,test/062.jpg,13,75254,AT-ST Raider from The Mandalorian,KLATOOINIAN RAIDER 1


## Splitting train/val

In [11]:
labels = dataset['minifigure_name'].unique()

In [12]:
for label in labels:
	sub_dataset = dataset[dataset['minifigure_name'] == label]
	train, val = train_test_split(sub_dataset, test_size=0.2, shuffle=True, random_state=99)
	for idx in val.index.values:
		dataset.loc[idx, 'validation'] = True

In [13]:
dataset['validation'].value_counts()/len(dataset)

validation
True    0.237197
Name: count, dtype: float64

In [14]:
dataset['validation'].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['validation'].fillna(False, inplace=True)
  dataset['validation'].fillna(False, inplace=True)


In [15]:
dataset.sample(10, random_state=99)

Unnamed: 0,path,class_id,lego_ids,lego_names,minifigure_name,validation
309,star-wars/0011/002.jpg,32,75273,Poe Damerons X-wing Fighter,JANNAH,False
49,star-wars/0001/010.jpg,5,75208,Yodas Hut,YODA,False
126,star-wars/0009/006.jpg,13,75254,AT-ST Raider from The Mandalorian,KLATOOINIAN RAIDER 1,True
211,marvel/0010/009.jpg,22,76162,Black Widows Helicopter Chase,TASKMASTER,False
81,star-wars/0004/006.jpg,8,75199,General Grievous Combat Speeder,MACE WINDU,True
61,star-wars/0002/010.jpg,6,75208,Yodas Hut,LUKE SKYWALKER,False
255,jurassic-world/0001/012.jpg,26,75926,Pteranodon Chase,OWEN GRADY,False
84,star-wars/0004/009.jpg,8,75199,General Grievous Combat Speeder,MACE WINDU,False
171,harry-potter/0002/002.jpg,19,75950,Aragogs Lair,RON WEASLEY,False
50,star-wars/0001/011.jpg,5,75208,Yodas Hut,YODA,True


## Moving the pictures in a YOLO-like manner 

In [16]:
for i in tqdm(range(len(dataset))):
	minifigure_name = dataset.iloc[i]['minifigure_name']
	path = dataset.iloc[i]['path']
	title = path.split('/')[-1]
	validation = dataset.iloc[i]['validation']
	os.makedirs(f'./data/YOLOv8_dataset/{'Val' if validation else 'Train'}/{minifigure_name}', exist_ok=True)
	shutil.copyfile(f'./datasets/data/raw_dataset/{path}', f'./data/YOLOv8_dataset/{'Val' if validation else 'Train'}/{minifigure_name}/{title}')

100%|██████████| 371/371 [00:01<00:00, 349.99it/s]


In [17]:
shutil.copytree('./datasets/data/raw_dataset/test', './data/YOLOv8_dataset/Test')
test.to_csv('./data/YOLOv8_dataset/test.csv', index=False)

I also manually created a last class called "Iron Man Keyring" using picture found on the internet, to test my model on my personal keyring.