In [None]:
import pandas as pd 
import requests
from PythonScripts.keys import KEY_ONE
import PythonScripts.data_clean as dc
from pathlib import Path

In [None]:
# Load each source CSV from the local drive into its own DataFrame.
# Only the columns named in `usecols` are read; everything else in the
# files is ignored.

# Inventories: inventory id and the set number it is attached to.
inv_path = dc.csv_path('inventories.csv')
inv_df = pd.read_csv(
    inv_path,
    usecols=['id', 'set_num'],
)

# Inventory parts: which part/colour appears in which inventory, and how many.
inv_parts_path = dc.csv_path('inventory_parts.csv')
inv_parts_df = pd.read_csv(
    inv_parts_path,
    usecols=['inventory_id', 'part_num', 'color_id', 'quantity'],
)

# Sets: descriptive columns keyed by set number.
sets_path = dc.csv_path('sets.csv')
set_df = pd.read_csv(
    sets_path,
    usecols=['set_num', 'name', 'year', 'theme_id', 'num_parts'],
)

# Parts: part number and its name.
parts_path = dc.csv_path('parts.csv')
parts_df = pd.read_csv(
    parts_path,
    usecols=['part_num', 'name'],
)

# Quick sanity check: shape and first three rows of every frame.
print(f'inv_df shape: {inv_df.shape}')
print(inv_df.head(3))
print(f'\ninv_parts_df shape: {inv_parts_df.shape}')
print(inv_parts_df.head(3))
print(f'\nset_df shape: {set_df.shape}')
print(set_df.head(3))
print(f'\nparts_df shape: {parts_df.shape}')
print(parts_df.head(3))

In [None]:
# Relabel columns in inv_df and parts_df so the later merges line up cleanly.
print('Original column labels:')
print(f'inv_df: {inv_df.columns}')
print(f'parts_df: {parts_df.columns}')

# 'id' -> 'inventory_id' matches the key column read into inv_parts_df.
# 'set_num' maps to itself, i.e. it is left unchanged.
inv_rename_dict = dict(id='inventory_id', set_num='set_num')
inv_df = inv_df.rename(columns=inv_rename_dict)

# 'name' -> 'part_name' keeps the part name distinct from the 'name'
# column that set_df also carries. 'part_num' maps to itself.
parts_rename_dict = dict(part_num='part_num', name='part_name')
parts_df = parts_df.rename(columns=parts_rename_dict)

print('\nRenamed column labels:')
print(f'inv_df: {inv_df.columns}')
print(f'parts_df: {parts_df.columns}')

In [None]:
# Combine the four frames into a single wide DataFrame, printing the shape
# after every step so unexpected row growth or loss is easy to spot.
print(f'Original inv_df shape: {inv_df.shape}')

# Step 1: attach part rows to their inventory (inner join on inventory_id,
# so inventories without parts and parts without an inventory are dropped).
all_merged_df = pd.merge(inv_df, inv_parts_df, on='inventory_id', how='inner')
print(f'Post-first merge shape: {all_merged_df.shape}')

# Step 2: add the set columns (left join on set_num, so every row from
# step 1 is kept even when set_df has no matching set).
all_merged_df = pd.merge(all_merged_df, set_df, on='set_num', how='left')
print(f'Post-second merge shape: {all_merged_df.shape}')

# Step 3: add the part name (inner join on part_num, so rows whose part is
# not present in parts_df are dropped).
all_merged_df = pd.merge(all_merged_df, parts_df, on='part_num', how='inner')
print(f'current all_merged_df shape: {all_merged_df.shape}')

In [None]:
# Restrict the DataFrame to rows whose theme is one of the Star Wars themes.

# NOTE(review): these theme ids are hard-coded; confirm them against the
# themes table before reusing this notebook with a newer data dump.
sw_theme_ids = [18, 158, 171, 209, 261]
print(f'Star Wars Theme IDs: {sw_theme_ids}')

# Rows with a missing theme_id never match and are therefore removed too.
is_star_wars = all_merged_df['theme_id'].isin(sw_theme_ids)
all_merged_df = all_merged_df.loc[is_star_wars]

# Show which theme ids actually remain after filtering.
unique_themes = all_merged_df['theme_id'].unique()
print(f'Unique values in theme_id column in DataFrame: {unique_themes}')

In [None]:
# Identify the set number whose part rows add up to the largest quantity.

# Total quantity per set_num, summed over every part row of that set.
# NOTE(review): if one set_num is linked to more than one inventory_id in
# inv_df, its quantities are summed across all of them - confirm intended.
quantity_by_set = all_merged_df.groupby('set_num')['quantity']
piece_count = quantity_by_set.sum()

# idxmax returns the index label of the largest total, so despite its name
# `max_count` holds the set number, not the piece count itself.
max_count = piece_count.idxmax()
print(f'Set with highest count: {max_count}')

In [None]:
# Keep only the rows that belong to the set with the most pieces
# (`max_count` holds that set's set_num).

print(f'Current all_merged_df shape: {all_merged_df.shape}')

# Select the wanted rows with a boolean mask instead of collecting the index
# labels of the unwanted rows and dropping them in place. The result is the
# same when index labels are unique, but this form does not depend on that
# and does not mutate the frame an earlier cell produced.
mask = all_merged_df['set_num'] == max_count
# .copy() gives later cells an independent frame to assign columns on.
all_merged_df = all_merged_df.loc[mask].copy()

print(f'Final all_merged_df shape: {all_merged_df.shape}')
print(f'\n{all_merged_df.head(n=3)}')

In [None]:
# Cast the year, theme_id and num_parts columns to pandas' nullable integer
# dtype (Int64), which can hold whole numbers alongside missing values.

nullable_int = pd.Int64Dtype()
int_columns = ['year', 'theme_id', 'num_parts']

# One astype call with a column -> dtype mapping converts all three columns.
all_merged_df = all_merged_df.astype(dict.fromkeys(int_columns, nullable_int))

In [None]:
# Make a new DataFrame with just the 10 rows that have the highest quantity.
#
# DataFrame.nlargest picks exactly 10 rows; with keep='first', ties are
# resolved in favour of the rows that occur first. The previous approach took
# the 10 largest quantity *values* and then kept every row whose quantity
# equalled any of them, so tied quantities could return more than 10 rows.
top_ten_df = (
    all_merged_df
    .nlargest(n=10, columns='quantity', keep='first')
    # Restore the original row order (nlargest sorts by quantity, descending)
    # so the output matches the earlier behaviour whenever there are no ties.
    .sort_index()
    .reset_index(drop=True)
)
top_ten_df

In [None]:
# Export the top-parts table as a .csv on the local drive so it can be used
# in the Tableau visualization.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an unnamed first column - confirm the Tableau workbook expects that.
file_path = dc.csv_path('top_ten_parts.csv')
top_ten_df.to_csv(path_or_buf=file_path)