In [1]:
import json

import pandas as pd
import requests

import PythonScripts.data_clean as dc
from PythonScripts.keys import KEY_TWO


In [2]:
# API call to get the list of LEGO themes and convert it to a dataframe.
# The key goes through `params` so requests URL-encodes it, and the timeout keeps
# the cell from hanging indefinitely if the service does not answer.
themes = requests.get(
    'https://brickset.com/api/v3.asmx/getThemes',
    params={'apiKey': KEY_TWO},
    timeout=30,
)
themes.raise_for_status()  # surface HTTP errors here rather than as a JSON decode failure
data = themes.json()
if 'themes' not in data:
    # json_normalize below needs the 'themes' record path; report the payload instead
    # of failing with a bare KeyError.
    raise RuntimeError(f'getThemes response has no "themes" entry: {data}')
theme_df = pd.json_normalize(data, 'themes')
print('Shape: ',theme_df.shape)
theme_df.head()


Shape:  (153, 5)


Unnamed: 0,theme,setCount,subthemeCount,yearFrom,yearTo
0,4 Juniors,24,5,2003,2004
1,Action Wheelers,9,0,2000,2001
2,Advanced models,35,12,2000,2012
3,Adventurers,72,4,1998,2003
4,Agents,13,0,2008,2009


In [3]:
# Remove themes that should not be analysed. A theme is dropped when ANY of these hold:
#   - it started before 1999
#   - it has fewer than 50 sets
#   - its last year is before 2022 (no longer in production)
#   - it is the 'Collectable Minifigures' or 'Miscellaneous' theme
started_before_1999 = theme_df['yearFrom'] < 1999
fewer_than_50_sets = theme_df['setCount'] < 50
ended_before_2022 = theme_df['yearTo'] < 2022
excluded_by_name = theme_df['theme'].isin(['Collectable Minifigures', 'Miscellaneous'])

unwanted = started_before_1999 | fewer_than_50_sets | ended_before_2022 | excluded_by_name
mask = theme_df.index[unwanted]  # index labels of the rows to remove
theme_df.drop(index=mask, inplace=True)
print('theme_df shape: ',theme_df.shape)


theme_df shape:  (17, 5)


In [4]:
# Generate sample theme list to use in 2nd API call. Convert list to string for API parameters.
# The sample is seeded so Restart & Run All picks the same themes every time; change
# THEME_SAMPLE_SEED to draw a different set. The size is capped at the number of
# remaining themes so the cell does not raise when fewer than three survive the filter.
THEME_SAMPLE_SIZE = 3
THEME_SAMPLE_SEED = 42
theme_list = (
    theme_df['theme']
    .sample(n=min(THEME_SAMPLE_SIZE, len(theme_df)), random_state=THEME_SAMPLE_SEED)
    .tolist()
)
param_string = ", ".join(theme_list)
print('Theme List: ',param_string)

Theme List:  BrickHeadz, Ninjago, Harry Potter


In [5]:
# 2nd API call to get a full set list for themes in the theme list generated by first API call.
# Convert to a data frame.
# The search options are serialised with json.dumps (interpolating the dict into the URL
# sends its Python repr, which is not JSON) and every query value is passed via `params`
# so requests URL-encodes spaces, commas, quotes and '&' in theme names.
parameters = {'theme': param_string, 'pageSize': 2500}
set_list = requests.get(
    'https://brickset.com/api/v3.asmx/getSets',
    params={
        'apiKey': KEY_TWO,
        'userHash': '',  # sent empty, as in the original query string
        'params': json.dumps(parameters),
    },
    timeout=60,
)
set_list.raise_for_status()  # surface HTTP errors here rather than as a JSON decode failure
set_data = set_list.json()
if 'sets' not in set_data:
    # json_normalize below needs the 'sets' record path; report the payload instead
    # of failing with a bare KeyError.
    raise RuntimeError(f'getSets response has no "sets" entry: {set_data}')
set_df = pd.json_normalize(set_data,'sets')
print('set_df shape: ',set_df.shape)


set_df shape:  (716, 44)


In [6]:
# Drop columns using helper function
# NOTE(review): drop_columns is defined in PythonScripts.data_clean and is not visible here.
# Its return value is ignored, so this cell relies on the helper modifying set_df in
# place — confirm against the helper's implementation.
dc.drop_columns(set_df)
# Preview the first rows of the frame after the helper has run.
set_df.head()

Unnamed: 0,setID,number,name,year,theme,themeGroup,subtheme,pieces,rating,ageRange.min,ageRange.max
0,8283,2111,Kai,2011,Ninjago,Action/Adventure,Spinners,19.0,3.6,6.0,14.0
1,8284,2112,Cole,2011,Ninjago,Action/Adventure,Spinners,19.0,3.7,6.0,14.0
2,8285,2113,Zane,2011,Ninjago,Action/Adventure,Spinners,19.0,4.0,6.0,14.0
3,8286,2114,Chopov,2011,Ninjago,Action/Adventure,Spinners,20.0,3.6,6.0,14.0
4,8287,2115,Bonezai,2011,Ninjago,Action/Adventure,Spinners,21.0,3.4,6.0,14.0


In [7]:
# Discard sets without a rating; a rating of exactly 0 is treated as "not rated".
is_unrated = set_df['rating'].eq(0)
mask_two = set_df.index[is_unrated]  # index labels of the unrated rows
set_df.drop(index=mask_two, inplace=True)
print('set_df shape: ',set_df.shape)


set_df shape:  (542, 11)


In [8]:
# Remove rows whose piece count is missing (NaN). Nothing is dropped when the
# 'pieces' column has no missing values.
pieces_null = set_df['pieces'].isna().any()
if pieces_null:
    set_df.dropna(subset=['pieces'], inplace=True)
print('set_df shape: ',set_df.shape)


set_df shape:  (542, 11)


In [9]:
# Cast piece count and both age-range bounds from float to pandas' nullable
# integer dtype ('Int64'), which keeps missing values as <NA>.
for int_column in ('pieces', 'ageRange.min', 'ageRange.max'):
    set_df[int_column] = set_df[int_column].astype('Int64')

In [10]:
# Map the raw API column names to human-readable headers for the exported data.
rename_dict = {
    'setID': 'Set ID',
    'number': 'Set Number',
    'name': 'Set Name',
    'year': 'Release Year',
    'theme': 'Theme',
    'themeGroup': 'Theme Group',
    'subtheme': 'Subtheme',
    'pieces': 'Number of Pieces',
    'rating': 'Brickset Rating',
    'ageRange.min': 'Min Age Range',
    'ageRange.max': 'Max Age Range',
}

# Apply the new headers to set_df itself, then preview the result.
set_df.rename(rename_dict, axis='columns', inplace=True)
set_df.head()

Unnamed: 0,Set ID,Set Number,Set Name,Release Year,Theme,Theme Group,Subtheme,Number of Pieces,Brickset Rating,Min Age Range,Max Age Range
0,8283,2111,Kai,2011,Ninjago,Action/Adventure,Spinners,19,3.6,6,14
1,8284,2112,Cole,2011,Ninjago,Action/Adventure,Spinners,19,3.7,6,14
2,8285,2113,Zane,2011,Ninjago,Action/Adventure,Spinners,19,4.0,6,14
3,8286,2114,Chopov,2011,Ninjago,Action/Adventure,Spinners,20,3.6,6,14
4,8287,2115,Bonezai,2011,Ninjago,Action/Adventure,Spinners,21,3.4,6,14


In [11]:
# Keep a random sample of at most SET_SAMPLE_SIZE sets for the export.
# The size is capped at the number of available rows because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement), and the draw
# is seeded so a fresh Restart & Run All reproduces the same sample.
SET_SAMPLE_SIZE = 100
SET_SAMPLE_SEED = 42
set_df = set_df.sample(n=min(SET_SAMPLE_SIZE, len(set_df)), random_state=SET_SAMPLE_SEED)

In [12]:
# Save data to CSV for visualization in Tableau
# NOTE(review): dc.csv_path is defined in PythonScripts.data_clean (not visible here);
# presumably it turns the file name into the full output path — confirm.
file_path = dc.csv_path('theme_sample_set_list.csv')
# to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed column of the file.
set_df.to_csv(file_path)