In [1]:
import pandas as pd 
import requests
from PythonScripts.keys import KEY_ONE
import PythonScripts.data_clean as dc
import PythonScripts.JPGFun as jf
from pathlib import Path

In [2]:
# Load the four source CSV files from the local drive into DataFrames,
# reading only the columns that the later cells use.
# NOTE(review): dc.csv_path presumably maps a file name to its location on
# disk -- confirm in PythonScripts.data_clean.

inv_path = dc.csv_path('inventories.csv')
inv_parts_path = dc.csv_path('inventory_parts.csv')
sets_path = dc.csv_path('sets.csv')
parts_path = dc.csv_path('parts.csv')

inv_df = pd.read_csv(inv_path, usecols=['id', 'set_num'])
inv_parts_df = pd.read_csv(
    inv_parts_path,
    usecols=['inventory_id', 'part_num', 'color_id', 'quantity'],
)
set_df = pd.read_csv(
    sets_path,
    usecols=['set_num', 'name', 'year', 'theme_id', 'num_parts'],
)
parts_df = pd.read_csv(parts_path, usecols=['part_num', 'name'])

# Sanity check: shape and first three rows of every frame that was loaded.
print(f'inv_df shape: {inv_df.shape}')
print(inv_df.head(n=3))
print(f'\ninv_parts_df shape: {inv_parts_df.shape}')
print(inv_parts_df.head(n=3))
print(f'\nset_df shape: {set_df.shape}')
print(set_df.head(n=3))
print(f'\nparts_df shape: {parts_df.shape}')
print(parts_df.head(n=3))


inv_df shape: (33221, 2)
   id set_num
0   1  7922-1
1   3  3931-1
2   4  6942-1

inv_parts_df shape: (1041633, 4)
   inventory_id        part_num  color_id  quantity
0             1        48379c01        72         1
1             1           48395         7         1
2             1  stickerupn0077      9999         1

set_df shape: (19561, 5)
  set_num                        name  year  theme_id  num_parts
0   001-1                       Gears  1965         1         43
1  0011-2           Town Mini-Figures  1979        67         12
2  0011-3  Castle 2 for 1 Bonus Offer  1987       199          0

parts_df shape: (47792, 2)
  part_num                                        name
0   003381                 Sticker Sheet for Set 663-1
1   003383         Sticker Sheet for Sets 618-1, 628-2
2   003402  Sticker Sheet for Sets 310-3, 311-1, 312-3


In [3]:
# Relabel columns so the frames share join keys and do not collide after
# merging: inv_df 'id' -> 'inventory_id', parts_df 'name' -> 'part_name'.
# The identity entries in each mapping leave those columns unchanged.
print('Original column labels:')
print(f'inv_df: {inv_df.columns}')
print(f'parts_df: {parts_df.columns}')

inv_rename_dict = dict(id='inventory_id', set_num='set_num')
parts_rename_dict = dict(part_num='part_num', name='part_name')

# rename() returns a relabelled frame; rebind the notebook names to it.
inv_df = inv_df.rename(columns=inv_rename_dict)
parts_df = parts_df.rename(columns=parts_rename_dict)

print('\nRenamed column labels:')
print(f'inv_df: {inv_df.columns}')
print(f'parts_df: {parts_df.columns}')

Original column labels:
inv_df: Index(['id', 'set_num'], dtype='object')
parts_df: Index(['part_num', 'name'], dtype='object')

Renamed column labels:
inv_df: Index(['inventory_id', 'set_num'], dtype='object')
parts_df: Index(['part_num', 'part_name'], dtype='object')


In [4]:
# Combine the four frames into one wide DataFrame, printing the shape after
# every join so unexpected row growth or loss is visible.
print(f'Original inv_df shape: {inv_df.shape}')

# inventories -> inventory parts: one row per part/colour line of an inventory.
all_merged_df = inv_df.merge(inv_parts_df, how='inner', on='inventory_id')
print(f'Post-first merge shape: {all_merged_df.shape}')

# Left join keeps inventory rows even when set_num has no match in set_df;
# the set columns are NaN for those rows.
all_merged_df = all_merged_df.merge(set_df, how='left', on='set_num')
print(f'Post-second merge shape: {all_merged_df.shape}')

# Attach the human-readable part name.
all_merged_df = all_merged_df.merge(parts_df, how='inner', on='part_num')
print(f'current all_merged_df shape: {all_merged_df.shape}')



Original inv_df shape: (33221, 2)
Post-first merge shape: (1041633, 5)
Post-second merge shape: (1041633, 9)
current all_merged_df shape: (1041633, 10)


In [5]:
# Keep only the rows whose theme_id is one of the Star Wars theme IDs.

sw_theme_ids = [18, 158, 171, 209, 261]
print(f'Star Wars Theme IDs: {sw_theme_ids}')

is_star_wars = all_merged_df['theme_id'].isin(sw_theme_ids)
all_merged_df = all_merged_df.loc[is_star_wars]

# Confirm that nothing outside the Star Wars list survived the filter.
unique_themes = all_merged_df['theme_id'].unique()
print(f'Unique values in theme_id column in DataFrame: {unique_themes}')


Star Wars Theme IDs: [18, 158, 171, 209, 261]
Unique values in theme_id column in DataFrame: [158. 171. 209. 261.  18.]


In [6]:
# Identify the set whose part rows add up to the largest total quantity.
# Note: despite its name, max_count holds the winning set number (the index
# label returned by idxmax), not the piece total itself.

quantity_by_set = all_merged_df.groupby('set_num')['quantity']
piece_count = quantity_by_set.sum()
max_count = piece_count.idxmax()
print(f'Set with highest count: {max_count}')


Set with highest count: 75192-1


In [7]:
# Reduce the large DataFrame to the rows of the set found above (max_count).

print(f'Current all_merged_df shape: {all_merged_df.shape}')

# Index labels of every row that belongs to some other set.
belongs_elsewhere = (all_merged_df['set_num'] != max_count).to_numpy()
mask = all_merged_df.index[belongs_elsewhere]
all_merged_df = all_merged_df.drop(index=mask)

print(f'Final all_merged_df shape: {all_merged_df.shape}')
print(f'\n{all_merged_df.head(n=3)}')


Current all_merged_df shape: (86526, 10)
Final all_merged_df shape: (730, 10)

      inventory_id  set_num part_num  color_id  quantity               name  \
5970         19670  75192-1     3003         0         3  Millennium Falcon   
5971         19670  75192-1     3003        73         1  Millennium Falcon   
5972         19670  75192-1     3003        71         4  Millennium Falcon   

        year  theme_id  num_parts    part_name  
5970  2017.0     171.0     7541.0  Brick 2 x 2  
5971  2017.0     171.0     7541.0  Brick 2 x 2  
5972  2017.0     171.0     7541.0  Brick 2 x 2  


In [8]:
# Cast year, theme_id and num_parts to pandas' nullable integer dtype; they
# arrive as floats from the earlier left merge (see the preview above).

nullable_int = pd.Int64Dtype()
all_merged_df = all_merged_df.astype({
    'year': nullable_int,
    'theme_id': nullable_int,
    'num_parts': nullable_int,
})

In [9]:
# Make a new DataFrame with just the ten part/colour rows of highest quantity.
#
# nlargest(keep='first') returns at most ten rows, ties included. Rows are
# selected by the index labels of that result rather than by quantity value:
# matching on values would also pull in any other row that merely shares a
# quantity with one of the top ten, giving more than ten rows on a tie.
# Selecting through index.isin keeps the rows in their original order.
# Assumes all_merged_df has unique index labels (it does after the merges and
# filters above, which never duplicate labels).
top_ten_parts = all_merged_df['quantity'].nlargest(n=10, keep='first')
in_top_ten = all_merged_df.index.isin(top_ten_parts.index)

# reset_index on the selection returns a fresh frame, so nothing is mutated
# in place on a slice of all_merged_df.
top_ten_df = all_merged_df.loc[in_top_ten].reset_index(drop=True)
top_ten_df



Unnamed: 0,inventory_id,set_num,part_num,color_id,quantity,name,year,theme_id,num_parts,part_name
0,19670,75192-1,3023,71,73,Millennium Falcon,2017,171,7541,Plate 1 x 2
1,19670,75192-1,3023,28,243,Millennium Falcon,2017,171,7541,Plate 1 x 2
2,19670,75192-1,3021,71,105,Millennium Falcon,2017,171,7541,Plate 2 x 3
3,19670,75192-1,15573,72,75,Millennium Falcon,2017,171,7541,Plate Special 1 x 2 with 1 Stud with Groove an...
4,19670,75192-1,2412b,0,102,Millennium Falcon,2017,171,7541,Tile Special 1 x 2 Grille with Bottom Groove
5,19670,75192-1,3710,72,72,Millennium Falcon,2017,171,7541,Plate 1 x 4
6,19670,75192-1,2780,0,269,Millennium Falcon,2017,171,7541,Technic Pin with Friction Ridges Lengthwise an...
7,19670,75192-1,32054,71,75,Millennium Falcon,2017,171,7541,Technic Pin Long with Friction Ridges Lengthwi...
8,19670,75192-1,6558,1,139,Millennium Falcon,2017,171,7541,Technic Pin Long with Friction Ridges Lengthwi...
9,19670,75192-1,15712,72,139,Millennium Falcon,2017,171,7541,Tile Special 1 x 1 with Clip with Rounded Edges


In [10]:
# Save the top-ten parts table as a .csv on the local drive; the file feeds
# the Tableau visualization. The DataFrame index is written as the first column.
file_path = dc.csv_path('top_ten_parts.csv')
top_ten_df.to_csv(path_or_buf=file_path)

In [11]:
# Data manipulation is now complete. The remaining cells prepare custom part
# images for use in Tableau and are not part of the data manipulation.
#
# This cell collects the inputs for the image lookup: the part number and the
# colour id (as a string) of every row in top_ten_df, in row order.
part_num_list = list(top_ten_df['part_num'])
part_color_list = [str(color) for color in top_ten_df['color_id']]

# Materialise the (part_num, color_id) pairs as a list. A bare zip() is a
# one-shot iterator: after the API cell has looped over it once it is empty,
# so re-running that cell would silently produce an empty url_list.
num_color_zip = list(zip(part_num_list, part_color_list))


    

In [12]:
# Call the Rebrickable API once per part/colour combination and collect the
# image URL ('part_img_url') of each response, in the same order as
# num_color_zip.
url_list = []
for num, color in num_color_zip:
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(
        f'https://rebrickable.com/api/v3/lego/parts/{num}/colors/{color}?key={KEY_ONE}',
        timeout=30,
    )
    # Fail with a clear message on an HTTP error instead of a confusing
    # KeyError/JSON error further down. The message deliberately leaves out
    # response.url because the URL contains the API key.
    if not response.ok:
        raise RuntimeError(
            f'Image lookup failed for part {num}, color {color}: '
            f'HTTP {response.status_code}'
        )
    data = response.json()
    url_list.append(str(data['part_img_url']))

In [13]:
# Create a DataFrame of the image URLs and add them to top_ten_df as a 'URL'
# column in case it's needed for future features.
#
# url_list was built by walking top_ten_df row by row, so entry i belongs to
# row i. The URLs are therefore attached by position. Merging on part_num
# alone is wrong here: the same part can appear in several colours, so every
# such row would be matched against every URL of that part, producing extra
# rows with URLs of the wrong colour.
if len(url_list) != len(top_ten_df):
    raise ValueError(
        f'Expected one URL per top-ten row, got {len(url_list)} URLs '
        f'for {len(top_ten_df)} rows; re-run the cells that build the '
        f'part lists and fetch the URLs.'
    )

# Lookup table kept for reference; color_id makes each row unambiguous.
url_df = pd.DataFrame({'part_num' : part_num_list,
                       'color_id' : part_color_list,
                       'URL' : url_list})
url_df = url_df.astype({'part_num': str, 'URL': str})

# assign() overwrites an existing 'URL' column, so re-running this cell does
# not create URL_x / URL_y duplicates.
top_ten_df = top_ten_df.assign(URL=url_df['URL'].to_numpy())

In [14]:
# Save top_ten_df (which by now carries the image URL column) to
# 'url_list.csv'. The DataFrame index is written as the first column.
file_path = dc.csv_path('url_list.csv')
top_ten_df.to_csv(path_or_buf=file_path)

In [15]:
# Download each image file and save to JPG folder
# NOTE(review): the helper behaviour below is inferred from names and the
# comments in this notebook -- confirm in PythonScripts.JPGFun.
# jpg_path() presumably returns the folder that holds the downloaded images.
path = jf.jpg_path()
# Fetch every URL in url_list and store the image under `path`.
jf.write_image(url_list, path)
# Resize the saved files (for Tableau, per the comment in the next cell).
jf.resize_files(path)

In [None]:
# Resize files to Tableau max size
# NOTE(review): this repeats the resize_files(path) call that already ends the
# previous cell, and this cell has no execution count. Confirm whether running
# the resize twice is intended (and harmless) before keeping both calls.
jf.resize_files(path)