In [1]:
import json

import pandas as pd
import requests

from PythonScripts.keys import KEY_TWO
import PythonScripts.data_clean as dc

In [2]:
# Fetch the list of LEGO themes from the Brickset API and load it into a DataFrame.
# The API key goes through `params` so requests URL-encodes it, the timeout keeps the
# cell from hanging on an unresponsive server, and raise_for_status() surfaces an HTTP
# error here rather than as a confusing JSON/KeyError in the lines below.
themes = requests.get(
    'https://brickset.com/api/v3.asmx/getThemes',
    params={'apiKey': KEY_TWO},
    timeout=30,
)
themes.raise_for_status()
data = themes.json()
# Each entry under the 'themes' key of the response becomes one row.
theme_df = pd.json_normalize(data, 'themes')

print('Shape: ',theme_df.shape)
theme_df.head()

Shape:  (153, 5)


Unnamed: 0,theme,setCount,subthemeCount,yearFrom,yearTo
0,4 Juniors,24,5,2003,2004
1,Action Wheelers,9,0,2000,2001
2,Advanced models,35,12,2000,2012
3,Adventurers,72,4,1998,2003
4,Agents,13,0,2008,2009


In [3]:
# Remove themes that started before 1999, have fewer than 50 sets, ended before 2022,
# or are the Collectable Minifigures / Miscellaneous themes.
print('Starting theme_df shape: ',theme_df.shape)

too_old = theme_df['yearFrom'] < 1999
too_small = theme_df['setCount'] < 50
discontinued = theme_df['yearTo'] < 2022
excluded_theme = theme_df['theme'].isin(['Collectable Minifigures', 'Miscellaneous'])

# Index labels of every row matching at least one exclusion rule.
mask = theme_df.loc[too_old | too_small | discontinued | excluded_theme].index
theme_df.drop(mask, inplace=True)

print('Resulting theme_df shape: ',theme_df.shape)

Starting theme_df shape:  (153, 5)
Resulting theme_df shape:  (17, 5)


In [4]:
# Randomly pick three of the remaining themes and join them into one comma-separated
# string that is used as the theme parameter of the 2nd API call.
theme_list = theme_df['theme'].sample(3).tolist()
param_string = ", ".join(theme_list)
print('Theme List: ',param_string)

Theme List:  Speed Champions, Seasonal, Super Mario


In [5]:
# 2nd API call: fetch the set list for the themes sampled above and convert it to a
# DataFrame.
# The search parameters are serialized with json.dumps and every query argument is
# handed to requests via `params`, so the value is valid JSON and is URL-encoded.
# Interpolating the dict into the URL sent its Python repr unescaped, which corrupts
# the request for theme names containing characters such as '&' or an apostrophe.
# NOTE(review): confirm the API honors a pageSize of 2500; results may be capped lower.
parameters = {'theme' : param_string, 'pageSize' : 2500}
set_list = requests.get(
    'https://brickset.com/api/v3.asmx/getSets',
    params={
        'apiKey': KEY_TWO,
        'userHash': '',
        'params': json.dumps(parameters),
    },
    timeout=30,
)
# Fail here on an HTTP error instead of on the JSON parsing below.
set_list.raise_for_status()
set_data = set_list.json()
# Each entry under the 'sets' key of the response becomes one row.
set_df = pd.json_normalize(set_data,'sets')
print('set_df shape: ',set_df.shape)

set_df shape:  (368, 44)


In [6]:
# Drop columns using helper function.
# NOTE(review): the return value is ignored, so dc.drop_columns presumably modifies
# set_df in place — confirm in PythonScripts/data_clean. Which columns it removes is
# decided by the helper, not by this cell.
dc.drop_columns(set_df)

# Report the shape after the helper call and preview the first rows.
print('set_df shape: ',set_df.shape)
set_df.head()

set_df shape:  (368, 8)


Unnamed: 0,setID,number,name,year,theme,subtheme,pieces,rating
0,166,1127,Santa,1999,Seasonal,Christmas,39.0,0.0
1,167,1128,Santa on Skis,1999,Seasonal,Christmas,21.0,0.0
2,168,1129,Santa on Reindeer,1999,Seasonal,Christmas,34.0,0.0
3,214,1263,Easter Bunny,2000,Seasonal,Easter,27.0,0.0
4,215,1264,Easter Chicks,2000,Seasonal,Easter,26.0,0.0


In [7]:
# Sets without a rating carry a rating of 0; remove those rows in place.
unrated = set_df['rating'].eq(0)
mask_two = set_df.loc[unrated].index
set_df.drop(mask_two, inplace=True)
print('set_df shape: ',set_df.shape)

set_df shape:  (200, 8)


In [8]:
# Remove sets whose piece count is missing (NaN), but only when at least one such
# row exists, then report the resulting shape.
pieces_null = set_df['pieces'].isna().to_numpy().any()
if pieces_null:
    set_df.dropna(subset=['pieces'], inplace=True)
print('set_df shape: ',set_df.shape)
set_df.head()

set_df shape:  (185, 8)


Unnamed: 0,setID,number,name,year,theme,subtheme,pieces,rating
24,29777,30342,Lamborghini Huracán Super Trofeo EVO,2020,Speed Champions,Lamborghini,70.0,3.6
25,31053,30343,McLaren Elva,2021,Speed Champions,McLaren,85.0,3.4
26,30196,30385,Super Mushroom Surprise,2020,Super Mario,Expansion Set,18.0,3.6
27,31397,30389,Fuzzy & Mushroom Platform,2021,Super Mario,Expansion Set,39.0,2.9
28,32510,30434,Aston Martin Valkyrie AMR Pro,2022,Speed Champions,Aston Martin,97.0,3.3


In [9]:
# Cast the pieces column from float to pandas' nullable 64-bit integer dtype.
set_df['pieces'] = set_df['pieces'].astype('Int64')
set_df.head()

Unnamed: 0,setID,number,name,year,theme,subtheme,pieces,rating
24,29777,30342,Lamborghini Huracán Super Trofeo EVO,2020,Speed Champions,Lamborghini,70,3.6
25,31053,30343,McLaren Elva,2021,Speed Champions,McLaren,85,3.4
26,30196,30385,Super Mushroom Surprise,2020,Super Mario,Expansion Set,18,3.6
27,31397,30389,Fuzzy & Mushroom Platform,2021,Super Mario,Expansion Set,39,2.9
28,32510,30434,Aston Martin Valkyrie AMR Pro,2022,Speed Champions,Aston Martin,97,3.3


In [10]:
# Replace the raw API field names with presentation-friendly column labels.
rename_dict = dict(
    setID='Set ID',
    number='Set Number',
    name='Set Name',
    year='Release Year',
    theme='Theme',
    subtheme='Subtheme',
    pieces='Number of Pieces',
    rating='Brickset Rating',
)

# Show the labels before and after the in-place rename.
print(f'Starting column labels: {set_df.columns}')
set_df.rename(columns=rename_dict, inplace=True)
print(f'\nResulting column labels: {set_df.columns}')
set_df.shape

Starting column labels: Index(['setID', 'number', 'name', 'year', 'theme', 'subtheme', 'pieces',
       'rating'],
      dtype='object')

Resulting column labels: Index(['Set ID', 'Set Number', 'Set Name', 'Release Year', 'Theme', 'Subtheme',
       'Number of Pieces', 'Brickset Rating'],
      dtype='object')


(185, 8)

In [11]:
# Reduce set_df to a random sample of 100 sets (set_df itself is overwritten).
# The check uses `>=` so a frame that already holds exactly 100 rows is accepted:
# with `>` a 100-row frame, including the one this cell produces when it is re-run,
# fell through to the "try again" message even though the sample size was met.
if len(set_df.index) >= 100:
    set_df = set_df.sample(100)
else:
    # Fewer than 100 rated sets came back for the randomly chosen themes.
    print('Restart notebook kernel and try again for a better sample')
set_df.shape

(100, 8)

In [12]:
# Save sample DataFrame to .csv for visualization in Tableau.
# NOTE(review): dc.csv_path presumably resolves the file name to the project's output
# location — confirm in PythonScripts/data_clean.
file_path = dc.csv_path('theme_sample_set_list.csv')
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
set_df.to_csv(file_path)