In [None]:
import json
from urllib.parse import quote

import pandas as pd
import requests

import PythonScripts.data_clean as dc
from PythonScripts.keys import KEY_TWO

In [None]:
# Configure URL for pd.read_csv
# Full sheet URL == https://docs.google.com/spreadsheets/d/1xw7y9yawF6i35BTfP9M1uUawJvwpacz01Xq4MEZszBs/
workbook_id = "1xw7y9yawF6i35BTfP9M1uUawJvwpacz01Xq4MEZszBs"
sheet_name = "Tomato"
# Percent-encode the sheet name so that names containing spaces, '&' or '#'
# still yield a valid query string; a plain name such as "Tomato" is unchanged.
url = f"https://docs.google.com/spreadsheets/d/{workbook_id}/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe='')}"

In [None]:
# Read the Google Sheet through its CSV export URL into a DataFrame.
# parse_dates asks pandas to parse the 'Release_Date' column as datetimes
# while loading, so the .dt accessor can be used on it afterwards.
ip_df = pd.read_csv(url, parse_dates=['Release_Date'])
ip_df

In [None]:
# Format Date column to display as the year (a four-digit string such as '1977').
# pd.to_datetime makes the cell safe to re-run: once the column holds year
# strings the bare .dt accessor would raise, and it also covers the case where
# read_csv could not parse the column and left it as plain text.
ip_df['Release_Date'] = pd.to_datetime(ip_df['Release_Date']).dt.strftime('%Y')
ip_df

In [None]:
# Clean extra text out of the Title column: keep only the text that precedes
# the first ' – ' separator (titles without the separator are unchanged).
# A single vectorised assignment replaces the row-by-row loop, which shadowed
# the built-in `str` for the rest of the notebook and relied on a chained
# inplace replace that does not reach ip_df under pandas Copy-on-Write.
ip_df['Title'] = ip_df['Title'].str.partition(' – ')[0]


# Function for cleaning a series by partition
def part_colon(column_label: pd.Series) -> pd.Series:
    """Strip everything up to and including the first ': ' from each entry.

    For example ``'Star Wars: The Clone Wars'`` becomes ``'The Clone Wars'``.
    Entries that do not contain ``': '`` and non-string entries (NaN/None)
    are left untouched instead of raising.

    Args:
        column_label: Series of text values to clean. It is modified in place
            through ``Series.replace(..., inplace=True)``.

    Returns:
        The same Series object that was passed in, so the result can also be
        assigned back to a DataFrame column
        (``df[col] = part_colon(df[col])``). Assigning back is required for
        the change to reach the parent frame when pandas Copy-on-Write is
        active.
    """
    # Build the old -> new mapping once from the distinct values, then apply
    # it with a single replace rather than one full-column replace per row.
    replacements = {
        value: value.partition(': ')[2]
        for value in column_label.unique()
        if isinstance(value, str) and ': ' in value
    }
    if replacements:
        column_label.replace(to_replace=replacements, inplace=True)
    return column_label
        
# Run cleaning function on Title column, then display the column.
# The return value is not used: the call depends on part_colon editing the
# Series it is handed.
# NOTE(review): that edit only reaches ip_df when pandas Copy-on-Write is
# off; with Copy-on-Write enabled the column selected here is a separate
# copy — confirm against the pandas version in use.
part_colon(ip_df['Title'])
ip_df['Title']

In [None]:
# Remove the row labelled 9: the theatrical release of The Clone Wars (the
# first 3 episodes of the TV show), which is both a duplicate value and an
# outlier. Afterwards renumber the index from 0 and discard the old labels.
ip_df = ip_df.drop(index=9).reset_index(drop=True)
ip_df

In [None]:
# API call for information for sets in the Star Wars theme, converted to a
# DataFrame built from the 'sets' records of the JSON response.
parameters = {'theme' : 'Star Wars', 'pageSize' : 900}
# NOTE(review): Brickset's getSets documentation appears to cap pageSize at
# 500 — confirm whether this request is truncated and, if so, page through
# the results with 'pageNumber'.
sw_set_list = requests.get(
    "https://brickset.com/api/v3.asmx/getSets",
    params={
        'apiKey': KEY_TWO,
        'userHash': '',
        # `params` must be a JSON string. json.dumps emits valid JSON, and
        # requests URL-encodes it; formatting the dict straight into the URL
        # sent its Python repr (single quotes, unencoded spaces) instead.
        'params': json.dumps(parameters),
    },
    timeout=30,  # do not let a stalled connection hang the notebook
)
# Surface HTTP-level failures here rather than as a confusing parse error.
sw_set_list.raise_for_status()
sw_data = sw_set_list.json()
if 'sets' not in sw_data:
    # Show the payload so an API-level error (bad key, bad params) is visible.
    raise RuntimeError(f"Brickset getSets response has no 'sets' key: {sw_data}")
sw_df = pd.json_normalize(sw_data, 'sets')
print(f'sw_df shape: {sw_df.shape}')
sw_df.head()

In [None]:
# Trim sw_df with the project helper, then report the resulting shape.
# NOTE(review): the return value is discarded, so dc.drop_columns presumably
# removes columns from sw_df in place — confirm in PythonScripts/data_clean.py.
dc.drop_columns(sw_df)
print(f'sw_df shape: {sw_df.shape}')

In [None]:
# Replace certain values with values matching first data frame
# Show the distinct subthemes before renaming.
subthemes = sw_df['subtheme'].sort_values().unique()
print(f'Subthemes: {subthemes}')

# Assign the result back to the column: an inplace replace on the selected
# column is a chained operation that pandas warns about and that leaves sw_df
# unchanged when Copy-on-Write is active.
sw_df['subtheme'] = sw_df['subtheme'].replace(to_replace={'The Clone Wars' : 'Star Wars: The Clone Wars',
                                                          'The Force Awakens' : 'Episode VII',
                                                          'The Last Jedi' : 'Episode VIII',
                                                          'The Rise of Skywalker' : 'Episode IX' })
# Show the distinct subthemes again to confirm the renames.
subthemes = sw_df['subtheme'].sort_values().unique()
print(f'\nRenamed Subthemes: {subthemes}')

In [None]:
# Drop any rows where the set has not been rated (rating == 0) and where
# there is NaN for number of pieces.
mask_two = sw_df[sw_df['rating'] == 0].index
sw_df.drop(mask_two, inplace=True)

# Test the 'pieces' column itself; the previous check looked for nulls
# anywhere in the frame, so the flag did not describe 'pieces'.
pieces_null = sw_df['pieces'].isnull().any()
if pieces_null:
    sw_df.dropna(subset=['pieces'], inplace=True)

print(f'sw_df shape: {sw_df.shape}')
sw_df.head()

In [None]:
# Cast the piece counts to pandas' nullable integer dtype ('Int64')
sw_df['pieces'] = sw_df['pieces'].astype('Int64')
sw_df.head()

In [None]:
# Run clean via partition function on the subtheme column of the second dataframe.
# The return value is not used: the call depends on part_colon editing the
# Series it is handed.
# NOTE(review): that edit only reaches sw_df when pandas Copy-on-Write is
# off — confirm against the pandas version in use.
part_colon(sw_df['subtheme'])

In [None]:
# Number of sets per subtheme: count the non-null 'number' entries in each group
lego_set_count = sw_df.groupby('subtheme')['number'].count()
lego_set_count

In [None]:
# Mean rating per subtheme, rounded to two decimal places
rating_avg = sw_df.groupby('subtheme')['rating'].mean().round(2)
rating_avg

In [None]:
# Put the per-subtheme set count and average rating side by side as the
# columns of one DataFrame (rows are aligned on the subtheme index)
agg_df = pd.concat([lego_set_count, rating_avg], axis='columns')
agg_df

In [None]:
# Attach the set count and average rating to each Star Wars property.
# Left join: every row of ip_df is kept, matched where its Title equals a subtheme.
merged_df = pd.merge(ip_df, agg_df, how='left', left_on='Title', right_on='subtheme')
merged_df

In [None]:
# Write the merged DataFrame to .csv for visualization in Tableau.
# The destination comes from the project helper dc.csv_path.
# to_csv is called with its defaults, so the DataFrame index is written as
# the first column of the file.
file_path = dc.csv_path('star_wars_set_list.csv')
merged_df.to_csv(file_path)