# Setting up

In [1]:
# Prepare dependencies
import numpy as np
import scipy.stats as stats
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed. Prefer the new name when this install
# provides it and fall back to the legacy name on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Preparing datasets

In [2]:
# Read 'Data/lego_data.csv' into legoData and preview its first three rows
legoData = pd.read_csv(filepath_or_buffer='Data/lego_data.csv')
legoData.head(n=3)

Unnamed: 0,SetID,Number,Variant,Theme,Subtheme,Year,Name,Pieces,Price,ImageURL,OwnedBy,WantedBy,Rating
0,22812,30224,1,City,General,2013,Ride-On Lawn Mower,42.0,,https://images.brickset.com/sets/images/30224-...,1950,1746,4.0
1,22830,21104,1,Ideas,NASA,2014,NASA Mars Science Laboratory Curiosity Rover,295.0,29.99,https://images.brickset.com/sets/images/21104-...,1983,2912,4.0
2,22888,71002,12,Collectable Minifigures,Series 11,2013,Saxophone Player,6.0,2.99,https://images.brickset.com/sets/images/71002-...,5426,2201,4.0


In [3]:
# Read 'Data/lego_sets.csv' into legoData2 and preview its first three rows
legoData2 = pd.read_csv(filepath_or_buffer='Data/lego_sets.csv')
legoData2.head(n=3)

Unnamed: 0,num_reviews,piece_count,play_star_rating,prod_desc,prod_id,review_difficulty,set_name,star_rating,theme_name,val_star_rating,country
0,2.0,277,4.0,Catapult into action and take back the eggs fr...,75823,Average,Bird Island Egg Heist,4.5,Angry Birds™,4.0,US
1,2.0,168,4.0,Launch a flying attack and rescue the eggs fro...,75822,Easy,Piggy Plane Attack,5.0,Angry Birds™,4.0,US
2,11.0,74,4.3,Chase the piggy with lightning-fast Chuck and ...,75821,Easy,Piggy Car Escape,4.3,Angry Birds™,4.1,US


# Cleaning datasets

## 1. LegoData

In [4]:
# Normalise every column label to lowercase
legoData.columns = list(map(str.lower, legoData.columns))

# Map the run-together labels onto snake_case names
# (labels that are not present in the frame are ignored by rename)
column_renames = {
    'imageurl': 'image_url',
    'usprice': 'price',
    'ownedby': 'owned_by',
    'wantedby': 'wanted_by',
}
legoData = legoData.rename(columns=column_renames)

In [5]:
# Check variables' types: show the dtype of every column
# (the next cell casts 'setid', 'variant' and 'year' to str)
legoData.dtypes

setid          int64
number        object
variant        int64
theme         object
subtheme      object
year           int64
name          object
pieces       float64
price        float64
image_url     object
owned_by       int64
wanted_by      int64
rating       float64
dtype: object

In [6]:
# Cast 'setid', 'variant' and 'year' from integers to strings in one pass
legoData = legoData.astype({'setid': str, 'variant': str, 'year': str})

In [7]:
# Check NaNs: count the missing values in each column
# (the counts drive the fill/drop decisions in the next cell)
legoData.isnull().sum()

setid           0
number          0
variant         0
theme           0
subtheme     3864
year            0
name            0
pieces       1076
price        4916
image_url       0
owned_by        0
wanted_by       0
rating       5731
dtype: int64

In [8]:
# Rough guess for the price of a single brick, based on the Lego website;
# used to estimate a set's price when it is missing.
PRICE_PER_PIECE = 0.08

# 1. Remove rows with NaNs in 'pieces' or 'rating'.
# 2. Replace NaNs in 'subtheme' with 'Not Available'.
# 3. Replace NaNs in 'price' with 'pieces' * PRICE_PER_PIECE.
#
# The price imputation is a vectorized fillna instead of a row-wise
# apply(axis=1). The whole step is one chained expression, so no column is
# assigned into a frame that dropna just returned, which avoids
# SettingWithCopyWarning.
legoData = (
    legoData
    .dropna(subset=['pieces', 'rating'])
    .assign(
        subtheme=lambda df: df['subtheme'].fillna('Not Available'),
        price=lambda df: df['price'].fillna(df['pieces'] * PRICE_PER_PIECE),
    )
)

In [9]:
# Build the full set number as '<number>-<variant>'
# (both columns hold strings at this point)
legoData['number'] = legoData['number'].str.cat(legoData['variant'], sep='-')

In [10]:
# 'variant' is now folded into 'number', so drop the standalone column
legoData = legoData.drop(columns='variant')

## 2. LegoData2

In [11]:
# Check unique info in 'country': list the distinct values in the column
# (the next cell keeps only the 'US' rows)
legoData2['country'].unique()

array(['US', 'AU', 'AT', 'BE', 'CA', 'CH', 'CZ', 'DE', 'DN', 'ES', 'FI',
       'FR', 'GB', 'IE', 'IT', 'LU', 'NO', 'NL', 'NZ', 'PL', 'PT'],
      dtype=object)

In [12]:
# Keep US data only.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells modify it directly rather than a selection
# taken from the raw data.
us_rows = legoData2['country'] == 'US'
legoData2 = legoData2.loc[us_rows].copy()

In [13]:
# Check variables' types: show the dtype of every column in legoData2
legoData2.dtypes

num_reviews          float64
piece_count            int64
play_star_rating     float64
prod_desc             object
prod_id                int64
review_difficulty     object
set_name              object
star_rating          float64
theme_name            object
val_star_rating      float64
country               object
dtype: object

In [14]:
# Check NaNs: count the missing values in each column of legoData2
# (the counts drive the drop/fill decisions in the next cell)
legoData2.isnull().sum()

num_reviews           98
piece_count            0
play_star_rating     107
prod_desc             20
prod_id                0
review_difficulty    124
set_name               0
star_rating           98
theme_name             0
val_star_rating      108
country                0
dtype: int64

In [15]:
# Remove rows with NaN in 'num_reviews', then replace NaN in
# 'review_difficulty' and 'prod_desc' with 'Not Available'.
#
# Written as one chained expression: fillna with a per-column mapping returns
# a new frame, so no column is assigned into a filtered/dropna result, which
# is what triggers SettingWithCopyWarning.
legoData2 = (
    legoData2
    .dropna(subset=['num_reviews'])
    .fillna({'review_difficulty': 'Not Available',
             'prod_desc': 'Not Available'})
)

In [18]:
# Drop the columns not carried into the cleaned export:
# 'play_star_rating', 'val_star_rating' and 'country'
unused_columns = ['play_star_rating', 'val_star_rating', 'country']
legoData2 = legoData2.drop(columns=unused_columns)

## 3. Merged datasets (don't execute)

In [None]:
# dfLego = pd.merge(legoData, 
#                   legoData2[['num_reviews', 'prod_desc', 'review_difficulty']],
#                   left_on='..',
#                   right_on='..',
#                   how='left')

In [None]:
# # Remove duplicated columns ('piece_count', 'prod_id', 'theme_name')
# dfLego = dfLego.drop(['piece_count', 'prod_id', 'theme_name'], axis=1)

# # Remove 'country' column
# dfLego = dfLego.drop(['country'], axis=1)

In [None]:
# # Remove NaNs
# dfLego = dfLego.dropna()

In [None]:
# # Find duplicates
# dfLego.duplicated('setid')

In [None]:
# # Remove duplicates
# dfLego = dfLego.drop_duplicates(['setid'])

In [None]:
# # Change type of 'variant'
# dfLego['variant'] = dfLego['variant'].astype(str)

In [None]:
# # Revise 'number' column
# dfLego['number'] = dfLego['number'] + '-' + dfLego['variant']

# Exporting dataframe to a csv

In [None]:
# Write the cleaned legoData frame to CSV with a header row and without the
# row index. `index` is documented as a bool, so pass False explicitly rather
# than relying on None being falsy.
legoData.to_csv('Data/lego_clean_data_v1.csv', index=False, header=True)

In [20]:
# Write the cleaned legoData2 frame to CSV with a header row and without the
# row index. `index` is documented as a bool, so pass False explicitly rather
# than relying on None being falsy.
legoData2.to_csv('Data/lego_clean_data_v2.csv', index=False, header=True)