In [1]:
import json

import pandas as pd
import requests

from keys import KEY_TWO


In [2]:
# Fetch the list of LEGO themes from the Brickset API and load it into a DataFrame.
# The key is passed through `params` so requests URL-encodes it, and the call fails
# loudly (timeout / HTTP error) instead of surfacing later as a JSON or KeyError.
themes = requests.get(
    'https://brickset.com/api/v3.asmx/getThemes',
    params={'apiKey': KEY_TWO},
    timeout=30,
)
themes.raise_for_status()
data = themes.json()
# One row per entry of the 'themes' array in the response payload.
df = pd.json_normalize(data, 'themes')


In [3]:
# Keep only themes that started in 1999 or later, contain at least 50 sets and
# have a yearTo of 2022 or later; the minifigure and miscellaneous themes are
# excluded by name.
excluded_themes = ['Collectable Minifigures', 'Miscellaneous']
unwanted = (
    df['yearFrom'].lt(1999)
    | df['setCount'].lt(50)
    | df['yearTo'].lt(2022)
    | df['theme'].isin(excluded_themes)
)
# Index labels of every row that matches at least one exclusion rule.
mask = df.loc[unwanted].index
df.drop(index=mask, inplace=True)
df.shape

(17, 5)

In [4]:
# Pick a random sample of themes for the second API call and join them into the
# string sent as the `theme` parameter.
# A fixed random_state makes Restart & Run All request the same themes every time;
# min() keeps .sample() from raising when fewer than 5 themes are left in df.
theme_list = df['theme'].sample(n=min(5, len(df)), random_state=42).tolist()
param_string = ", ".join(theme_list)
param_string

'Disney, Harry Potter, Speed Champions, Dots, Seasonal'

In [5]:
# Second API call: fetch the sets belonging to the sampled themes and load them
# into a DataFrame.
parameters = {'theme': param_string, 'pageSize': 2500}
set_list = requests.get(
    'https://brickset.com/api/v3.asmx/getSets',
    params={
        'apiKey': KEY_TWO,
        'userHash': '',
        # Serialize as real JSON: interpolating the dict into the URL sent its
        # Python repr (single quotes, unencoded spaces) instead.
        'params': json.dumps(parameters),
    },
    timeout=60,
)
# Stop here on an HTTP error rather than failing later while parsing the body.
set_list.raise_for_status()
set_data = set_list.json()
# One row per entry of the 'sets' array; nested objects become dotted column names.
set_df = pd.json_normalize(set_data, 'sets')
set_df.shape


(583, 44)

In [6]:
# Columns that play no part in the analysis; they are removed from set_df in place.
drop_list = [
    'numberVariant', 'released', 'category', 'bricksetURL', 'reviewCount',
    'packagingType', 'availability', 'instructionsCount',
    'additionalImageCount', 'lastUpdated',
    'image.thumbnailURL', 'image.imageURL',
    'collections.ownedBy', 'collections.wantedBy',
    'dimensions.height', 'dimensions.width', 'dimensions.depth',
    'LEGOCom.US.retailPrice',
    'LEGOCom.US.dateFirstAvailable',
    'LEGOCom.US.dateLastAvailable',
    'LEGOCom.UK.retailPrice',
    'LEGOCom.UK.dateFirstAvailable',
    'LEGOCom.UK.dateLastAvailable',
    'LEGOCom.CA.retailPrice',
    'LEGOCom.CA.dateFirstAvailable',
    'LEGOCom.CA.dateLastAvailable',
    'dimensions.weight',
    'barcode.EAN', 'barcode.UPC',
    'minifigs',
    'LEGOCom.DE.retailPrice',
    'LEGOCom.DE.dateFirstAvailable',
    'LEGOCom.DE.dateLastAvailable',
]
set_df.drop(drop_list, axis='columns', inplace=True)
set_df

Unnamed: 0,setID,number,name,year,theme,themeGroup,subtheme,pieces,rating,ageRange.min,ageRange.max
0,166,1127,Santa,1999,Seasonal,Miscellaneous,Christmas,39.0,0.0,,
1,167,1128,Santa on Skis,1999,Seasonal,Miscellaneous,Christmas,21.0,0.0,,
2,168,1129,Santa on Reindeer,1999,Seasonal,Miscellaneous,Christmas,34.0,0.0,,
3,214,1263,Easter Bunny,2000,Seasonal,Miscellaneous,Easter,27.0,0.0,,
4,215,1264,Easter Chicks,2000,Seasonal,Miscellaneous,Easter,26.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...
578,26644,TRULUMIERE,Lumiere,2016,Disney,Licensed,Beauty and the Beast,23.0,0.0,,
579,26323,TRUPODIUM,Winner's Podium,2016,Speed Champions,Licensed,Promotional,,0.0,,
580,25726,TRUSANTA,Santa at the Beach,2014,Seasonal,Miscellaneous,Christmas,43.0,0.0,,
581,30211,TRUWEASLEYCAR,The Weasley's car,2018,Harry Potter,Licensed,Promotional,36.0,0.0,,


In [7]:
# Sets whose rating is 0 are treated as unrated and removed from set_df.
unrated = set_df['rating'].eq(0)
mask_two = set_df.loc[unrated].index
set_df.drop(index=mask_two, inplace=True)
set_df.shape

(344, 11)

In [8]:
# Report whether the pieces column has any missing values and, if so, remove
# the affected rows from set_df.
pieces_null = set_df['pieces'].isna().to_numpy().any()
print(pieces_null)
if pieces_null:
    set_df.dropna(subset=['pieces'], inplace=True)
set_df.shape

False


(344, 11)

In [16]:
# Convert pieces, minimum age and maximum age from floats to integers.
# The nullable Int64 dtype is used so that missing values (NaN in the float
# columns) are kept as <NA> instead of making the cast fail.
for int_column in ('pieces', 'ageRange.min', 'ageRange.max'):
    set_df[int_column] = set_df[int_column].astype(pd.Int64Dtype())

In [17]:
# Write the cleaned set list to a CSV file for visualization in Tableau.
output_path = './set_list.csv'
set_df.to_csv(output_path)