In [89]:
import pandas as pd

In [90]:
df_sets_brickset = pd.read_csv('sets-brickset.csv')

# change number to bricklink format
df_sets_brickset['Number'] = [
    f'{number}-{variant}' for number, variant 
    in zip(df_sets_brickset['Number'], df_sets_brickset['Variant'])
]

# change column names, filter columns
df_sets_brickset = df_sets_brickset.rename(
    columns={col: col.lower() for col in list(df_sets_brickset)}
)[['number', 'theme', 'year', 'name', 'minifigs', 'pieces', 'usprice']]

# filter out sets with fewer than 25 pieces
df_sets_brickset = df_sets_brickset[df_sets_brickset['pieces'] > 25]

# filter out sets with NaN prices
df_sets_brickset = df_sets_brickset.dropna(subset=['usprice'])

# make NaN minifigs 0.0
df_sets_brickset = df_sets_brickset.fillna(value={'minifigs': 0.0})

df_sets_brickset

Unnamed: 0,number,theme,year,name,minifigs,pieces,usprice
0,497-1,Space,1979,Galaxy Explorer,4.0,338.0,32.00
1,1072-1,Dacta,1979,Supplementary LEGO Set,0.0,304.0,16.30
6,341-2,Fabuland,1979,Catherine Cat's House and Mortimer Mouse,2.0,123.0,3.00
91,8858-2,Technic,1980,Auto Engines,0.0,460.0,69.00
92,744-1,Basic,1980,"Universal Building Set with Motor, 7+",0.0,537.0,59.00
...,...,...,...,...,...,...,...
14386,41387-1,Friends,2019,Olivia's Summer Heart Box,2.0,93.0,7.99
14387,41388-1,Friends,2019,Mia's Summer Heart Box,1.0,85.0,7.99
14414,853906-1,Seasonal,2019,LEGO Greeting Card,0.0,40.0,4.99
14430,30362-1,City,2019,Sky Police Jetpack,2.0,33.0,3.99


In [91]:
df_sets_bricklink = pd.read_csv('sets-bricklink.tsv', sep='\t')

# rename and filter columns
df_sets_bricklink = df_sets_bricklink.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'set_weight',
    }
)[['number', 'set_weight']]

# filter out sets without weight data
df_sets_bricklink = df_sets_bricklink[df_sets_bricklink['set_weight'] != '?']

df_sets_bricklink

Unnamed: 0,number,set_weight
8,041-2,569
11,101-1,74
15,1029-1,142
17,1030-1,1189
18,1031-1,277
...,...,...
15568,75964-21,13
15569,75964-22,7
15570,75964-23,12
15571,75964-24,5


In [92]:
df_boxes = pd.read_csv('boxes-bricklink.tsv', sep='\t')

# rename and filter columns
df_boxes = df_boxes.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'box_weight'
    }
)[['number', 'box_weight']]

# filter out sets without weight data
df_boxes = df_boxes[df_boxes['box_weight'] != '?']

df_boxes

Unnamed: 0,number,box_weight
3,367-1,134
5,2164-1,0.45
8,217-2,41
9,603-2,3
11,293-1,30
...,...,...
15420,911951-1,1.5
15422,BIL01-1,14
15441,11920-1,2
15442,111903-2,1.5


In [93]:
df_instructions = pd.read_csv('instructions-bricklink.tsv', sep='\t')

# filter out instructions without weight
df_instructions = df_instructions[df_instructions['Weight (in Grams)'] != '?']

# rename and filter columns
df_instructions.rename(
    columns={
        'Number': 'number',
        'Weight (in Grams)': 'instruction_weight'
    }
)[['number', 'instruction_weight']]

Unnamed: 0,number,instruction_weight
0,8470-1,102
2,691-1,3.4
3,367-1,20
8,293-1,3.4
9,645-2,3.4
...,...,...
9231,col19-14,2.38
9232,col19-15,2.38
9233,col19-16,2.38
9235,GA11NoDk-99,9


In [None]:
# join it all together!

appendix: merge brickset year csvs (not necessary to run analysis)

In [37]:
import os

brickset_path = 'sets-brickset'
list_dfs_brickset = []

for filename in os.listdir(brickset_path):
    list_dfs_brickset.append(
        pd.read_csv(os.path.join(brickset_path, filename))
    )

df_sets_brickset = pd.concat(list_dfs_brickset, ignore_index=True)

In [32]:
df_sets_brickset.to_csv('sets-brickset.csv', index=False)