In [1]:
import pandas as pd
import requests
from keys import KEY_TWO

In [2]:
# Configure URL for pd.read_csv
# Full sheet URL == https://docs.google.com/spreadsheets/d/1xw7y9yawF6i35BTfP9M1uUawJvwpacz01Xq4MEZszBs/edit#gid=0
workbook_id = "1xw7y9yawF6i35BTfP9M1uUawJvwpacz01Xq4MEZszBs"
sheet_name = "Sheet1"
url = f"https://docs.google.com/spreadsheets/d/{workbook_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

In [3]:
# Read Google Sheet and convert to a Dataframe
year_df = pd.read_csv(url, parse_dates=['Release_Date'])


In [4]:
# Format Date column to display as the year
year_df['Release_Date'] = year_df['Release_Date'].dt.strftime('%Y')
year_df


Unnamed: 0,Title,Is_Movie,Is_TV,Release_Date,Tomatometer
0,Episode IV – A New Hope,Y,N,1977,93%
1,Episode V – The Empire Strikes Back,Y,N,1980,94%
2,Episode VI – Return of the Jedi,Y,N,1983,83%
3,Episode I – The Phantom Menace,Y,N,1999,51%
4,Episode II – Attack of the Clones,Y,N,2002,66%
5,Episode III – Revenge of the Sith,Y,N,2005,79%
6,Episode VII – The Force Awakens,Y,N,2015,93%
7,Episode VIII – The Last Jedi,Y,N,2017,91%
8,Episode IX – The Rise of Skywalker,Y,N,2019,52%
9,Star Wars: The Clone Wars,Y,N,2008,18%


In [5]:
# Clean extra text out the Title column

for item, str in year_df['Title'].items():
        head, sep, tail = str.partition(' – ')
        year_df['Title'].replace(to_replace=str, value = head, inplace=True)


# Function for cleaning a series by partition
def part_colon(column_label: pd.Series) -> pd.Series:
        for item, value in column_label.items():
           if ': ' in value:
                head, sep, tail = value.partition(': ')
                column_label.replace(to_replace=value, value = tail, inplace=True)
        return pd.Series



# Run cleaning function on Title column
part_colon(year_df['Title'])

year_df.drop(index=9, inplace=True)
# Remove Clone Wars movie which was kick off of Star Wars TV Show
# mask_three = year_df[(year_df['Title'] == "The Clone Wars")].index
# year_df.drop(mask_three, inplace=True)
# year_df

#& (year_df['Is_Movie'] == 'Y')


In [6]:
# API call for information for sets in Star Wars theme and convert to dataframe. 
parameters = {'theme' : 'Star Wars', 'pageSize' : 900}
sw_set_list = requests.get(f"https://brickset.com/api/v3.asmx/getSets?apiKey={KEY_TWO}&userHash=&params={parameters}")
sw_data = sw_set_list.json()
sw_df = pd.json_normalize(sw_data,'sets')


In [7]:
# Drop unneeded columns from the dataframe
drop_list = [
            'numberVariant',
            'released',
            'category',
            'bricksetURL',
            'reviewCount',
            'packagingType',
            'availability',
            'instructionsCount',
            'additionalImageCount',
            'lastUpdated',
            'image.thumbnailURL',
            'image.imageURL',
            'collections.ownedBy',
            'collections.wantedBy',
            'dimensions.height',
            'dimensions.width',
            'dimensions.depth',
            'LEGOCom.US.retailPrice',
            'LEGOCom.US.dateFirstAvailable',
            'LEGOCom.US.dateLastAvailable',
            'LEGOCom.UK.retailPrice',
            'LEGOCom.UK.dateFirstAvailable',
            'LEGOCom.UK.dateLastAvailable',
            'LEGOCom.CA.retailPrice',
            'LEGOCom.CA.dateFirstAvailable',
            'LEGOCom.CA.dateLastAvailable',
            'dimensions.weight',
            'barcode.EAN',
            'barcode.UPC',
            'minifigs',
            'LEGOCom.DE.retailPrice',
            'LEGOCom.DE.dateFirstAvailable',
            'LEGOCom.DE.dateLastAvailable'
            ]
sw_df.drop(columns=drop_list, inplace=True)

# Replace certain values with values matching first data frame
sw_df['subtheme'].replace(to_replace={'The Clone Wars' : 'Star Wars: The Clone Wars', 
                                      'The Force Awakens' : 'Episode IX', 
                                      'The Last Jedi' : 'Episode X', 
                                      'The Rise of Skywalker' : 'Episode XI' }, inplace=True)



In [8]:
# Drop any rows where the set has not been rated and where there is NaN for number of pieces.
mask_two = sw_df[sw_df['rating'] == 0].index
sw_df.drop(mask_two, inplace=True)
pieces_null = sw_df.isnull().values.any()
if pieces_null == True:
    sw_df.dropna(subset=['pieces'], inplace=True)

# Convert columns with numeric columns to Int64 Type 
sw_df['pieces'] = sw_df['pieces'].astype(pd.Int64Dtype())
sw_df['ageRange.min'] = sw_df['ageRange.min'].astype(pd.Int64Dtype())
sw_df['ageRange.max'] = sw_df['ageRange.max'].astype(pd.Int64Dtype())


In [9]:
# Run clean via partition function on the subtheme column of the second dataframe
part_colon(sw_df['subtheme'])

pandas.core.series.Series

In [10]:
# Create a data frame that is the count of sets and average rating of each subtheme 
lego_set_count = sw_df.groupby(['subtheme'])['number'].count()
rating_avg=sw_df.groupby(['subtheme'])['rating'].mean().round(2)
agg_df = pd.concat([lego_set_count, rating_avg], axis=1)
agg_df


Unnamed: 0_level_0,number,rating
subtheme,Unnamed: 1_level_1,Unnamed: 2_level_1
Battlefront,4,3.9
Book Parts,1,3.6
Boost,1,3.7
Buildable Figures,29,3.88
Diorama Collection,3,4.47
Episode I,40,3.86
Episode II,20,4.08
Episode III,48,4.02
Episode IV,49,4.01
Episode IX,26,3.79


In [11]:
# Create third data frame that is a combination of the first two cleaned dataframes, based on subtheme
merged_df = year_df.merge(agg_df, left_on='Title', right_on='subtheme')
merged_df

Unnamed: 0,Title,Is_Movie,Is_TV,Release_Date,Tomatometer,number,rating
0,Episode IV,Y,N,1977,93%,49,4.01
1,Episode V,Y,N,1980,94%,43,4.04
2,Episode VI,Y,N,1983,83%,41,3.93
3,Episode I,Y,N,1999,51%,40,3.86
4,Episode II,Y,N,2002,66%,20,4.08
5,Episode III,Y,N,2005,79%,48,4.02
6,Episode IX,Y,N,2019,52%,26,3.79
7,Rogue One,Y,N,2016,84%,12,4.12
8,Solo,Y,N,2018,69%,12,3.89
9,The Bad Batch,N,Y,2021,85%,1,4.1


In [12]:

merged_df.to_csv('./sw_set_list.csv')