# Setting up

In [1]:
# Prepare dependencies
import numpy as np
import scipy.stats as stats
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# # ML dependencies
# from sklearn.datasets import make_regression
# from sklearn.datasets import make_s_curve
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the cleaned LEGO set catalogue from CSV and preview its first three rows
dfLego = pd.read_csv(filepath_or_buffer='lego_cleaned_set.csv')
dfLego.head(n=3)

Unnamed: 0,setid,number,variant,theme,subtheme,year,name,pieces,price,image_url,owned_by,wanted_by,num_reviews,prod_desc,review_difficulty,set_name,star_rating
0,27499,75209,1,Star Wars,Solo,2018,Han Solo's Landspeeder,345.0,29.99,https://images.brickset.com/sets/images/75209-...,7511,1385,4.0,Escape to safety with Han Solo’s Landspeeder!,Easy,Han Solo's Landspeeder™,4.8
1,27437,76104,1,Marvel Super Heroes,Avengers: Infinity War,2018,The Hulkbuster Smash-Up,375.0,29.99,https://images.brickset.com/sets/images/76104-...,6899,1097,13.0,Bash the ball-shooting gun turret with the Hul...,Easy,The Hulkbuster Smash-Up,4.8
2,27832,75181,1,Star Wars,Ultimate Collector Series,2018,Y-wing Starfighter,1967.0,199.99,https://images.brickset.com/sets/images/75181-...,5317,2748,1.0,Collect the ultimate long-range Rebel starfigh...,Very Challenging,Y-Wing Starfighter™,5.0


# Peeking at the dataset

In [3]:
# Descriptive statistics of the numeric columns, one row per column
dfLego.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
setid,518.0,27001.453668,609.82776,24893.0,26578.25,27011.5,27489.25,28033.0
number,518.0,55092.370656,21659.76733,10255.0,41310.25,70357.5,71021.0,76108.0
variant,518.0,2.335907,3.886084,1.0,1.0,1.0,1.0,21.0
year,518.0,2017.364865,0.481858,2017.0,2017.0,2017.0,2018.0,2018.0
pieces,518.0,376.366795,653.309904,1.0,85.0,191.0,401.5,7541.0
price,518.0,39.063958,57.181132,3.99,9.99,19.99,39.99,799.99
owned_by,518.0,3371.123552,2273.498685,139.0,1612.25,3083.0,4566.75,15558.0
wanted_by,518.0,881.6139,699.815423,31.0,446.75,675.0,1146.5,5897.0
num_reviews,518.0,11.745174,22.043318,1.0,2.0,6.0,14.0,367.0
star_rating,518.0,4.512934,0.484585,1.8,4.3,4.6,4.9,5.0


In [4]:
# For every set name, count the rows that carry a non-null star rating and
# expose the result as a two-column frame: set_name, total_rating_count
setNameRatingCount = (
    dfLego
    .groupby('set_name')['star_rating']
    .count()
    .rename('total_rating_count')
    .reset_index()
)
setNameRatingCount

Unnamed: 0,set_name,total_rating_count
0,1968 Ford Mustang Fastback,1
1,2016 Ford GT & 1966 Ford GT40,1
2,4 x 4 Response Unit,1
3,6x6 All Terrain Tow Truck,1
4,A-Wing™ vs. TIE Silencer™ Microfighters,1
...,...,...
437,Y-Wing Starfighter™,2
438,Y-Wing™ Microfighter,1
439,Yoda's Hut,1
440,Yoda's Jedi Starfighter™,1


In [5]:
# Attach each set name's rating count to 'dfLego' (left join keeps every row).
# Any 'total_rating_count' column left over from a previous run of this cell is
# dropped first; otherwise re-running would produce '_x'/'_y' suffixed duplicates
# and later lookups of 'total_rating_count' would fail.
dfLego = pd.merge(
    dfLego.drop(columns='total_rating_count', errors='ignore'),
    setNameRatingCount,
    how='left',
    on='set_name',
)
dfLego.head()

Unnamed: 0,setid,number,variant,theme,subtheme,year,name,pieces,price,image_url,owned_by,wanted_by,num_reviews,prod_desc,review_difficulty,set_name,star_rating,total_rating_count
0,27499,75209,1,Star Wars,Solo,2018,Han Solo's Landspeeder,345.0,29.99,https://images.brickset.com/sets/images/75209-...,7511,1385,4.0,Escape to safety with Han Solo’s Landspeeder!,Easy,Han Solo's Landspeeder™,4.8,1
1,27437,76104,1,Marvel Super Heroes,Avengers: Infinity War,2018,The Hulkbuster Smash-Up,375.0,29.99,https://images.brickset.com/sets/images/76104-...,6899,1097,13.0,Bash the ball-shooting gun turret with the Hul...,Easy,The Hulkbuster Smash-Up,4.8,1
2,27832,75181,1,Star Wars,Ultimate Collector Series,2018,Y-wing Starfighter,1967.0,199.99,https://images.brickset.com/sets/images/75181-...,5317,2748,1.0,Collect the ultimate long-range Rebel starfigh...,Very Challenging,Y-Wing Starfighter™,5.0,2
3,27732,75884,1,Speed Champions,Ford,2018,1968 Ford Mustang Fastback,183.0,14.99,https://images.brickset.com/sets/images/75884-...,7310,900,9.0,"Build, display and race the 1968 Ford Mustang ...",Easy,1968 Ford Mustang Fastback,4.9,1
4,27965,71021,7,Collectable Minifigures,Series 18,2018,Dragon Suit Guy,6.0,3.99,https://images.brickset.com/sets/images/71021-...,5376,820,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17


In [16]:
# Summarise 'total_rating_count': overall statistics, a separator line, then the
# 0.90-0.99 quantiles in 0.01 steps. Floats are displayed with three decimals
# (note: this display option is global and affects later cells too).
pd.set_option('display.float_format', lambda value: '%.3f' % value)

ratingCountColumn = dfLego['total_rating_count']
upperQuantiles = np.arange(.9, 1, .01)
print(
    ratingCountColumn.describe(),
    '=' * 50,
    ratingCountColumn.quantile(upperQuantiles),
    sep='\n',
)

count   518.000
mean      3.687
std       6.489
min       1.000
25%       1.000
50%       1.000
75%       1.000
max      21.000
Name: total_rating_count, dtype: float64
0.900   17.000
0.910   17.000
0.920   19.560
0.930   21.000
0.940   21.000
0.950   21.000
0.960   21.000
0.970   21.000
0.980   21.000
0.990   21.000
Name: total_rating_count, dtype: float64


In [13]:
# Keep only rows whose set name reaches the popularity threshold.
# 17 is the 0.90 quantile of 'total_rating_count' printed by the summary cell.
popThreshold = 17
isPopular = dfLego['total_rating_count'] >= popThreshold
dfPopularLego = dfLego[isPopular]
dfPopularLego.head()

Unnamed: 0,setid,number,variant,theme,subtheme,year,name,pieces,price,image_url,owned_by,wanted_by,num_reviews,prod_desc,review_difficulty,set_name,star_rating,total_rating_count
4,27965,71021,7,Collectable Minifigures,Series 18,2018,Dragon Suit Guy,6.0,3.99,https://images.brickset.com/sets/images/71021-...,5376,820,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17
18,27974,71021,16,Collectable Minifigures,Series 18,2018,Birthday Party Boy,9.0,3.99,https://images.brickset.com/sets/images/71021-...,4628,541,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17
42,27966,71021,8,Collectable Minifigures,Series 18,2018,Classic Police Officer,6.0,3.99,https://images.brickset.com/sets/images/71021-...,3799,1201,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17
43,27960,71021,2,Collectable Minifigures,Series 18,2018,Brick Suit Guy,5.0,3.99,https://images.brickset.com/sets/images/71021-...,5836,617,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17
50,27963,71021,5,Collectable Minifigures,Series 18,2018,Firework Guy,4.0,3.99,https://images.brickset.com/sets/images/71021-...,5171,478,14.0,Get the party started with LEGO® Minifigures!,Very Easy,Series 18: Party,4.6,17


# Building the recommendation engine

## 1. k-Nearest Neighbors (kNN)

In [18]:
# Prepare dependencies
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [27]:
# Build a set_name x setid table of star ratings; missing combinations become 0.
# NOTE(review): in the displayed output every setid column is non-zero for a
# single set_name only, so the rows do not overlap and the cosine distances
# reported further down are all 1.0 - confirm this layout is what is intended.
legoRatingPivot = (
    dfPopularLego
    .pivot(index='set_name', columns='setid', values='star_rating')
    .fillna(value=0)
)
legoRatingPivot

setid,27133,27139,27140,27141,27142,27143,27144,27145,27146,27147,...,27966,27967,27968,27969,27970,27971,27972,27973,27974,27975
set_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Series 17,4.1,4.1,4.1,4.1,4.1,4.1,4.1,4.1,4.1,4.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Series 18: Party,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6
THE LEGO® BATMAN MOVIE Series 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
THE LEGO® NINJAGO® MOVIE™,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Convert the pivot table's values into a SciPy CSR sparse matrix for the kNN model
legoRatingMatrix = csr_matrix(legoRatingPivot.to_numpy())

In [28]:
# Brute-force nearest-neighbour search under cosine distance, fitted on the
# sparse rating matrix; fit() returns the estimator itself, which is displayed.
modelKNN = NearestNeighbors(algorithm='brute', metric='cosine').fit(legoRatingMatrix)
modelKNN

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [55]:
# Test the model: pick a random set name and list its nearest neighbours.
# The choice is unseeded, so the queried set differs between runs.
queryIndex = np.random.choice(legoRatingPivot.shape[0])
queryVector = legoRatingPivot.iloc[queryIndex, :].values.reshape(1, -1)
distances, indices = modelKNN.kneighbors(queryVector, n_neighbors=4)

# Fixed: the header previously referenced the undefined name `query_index`,
# which raised NameError on a fresh kernel.
print('Recommendations for {0}:\n'.format(legoRatingPivot.index[queryIndex]))

# Position 0 of the result is treated as the query row itself and skipped;
# the remaining neighbours are numbered from 1.
neighbourPairs = zip(indices.flatten()[1:], distances.flatten()[1:])
for rank, (neighbourIndex, distance) in enumerate(neighbourPairs, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, legoRatingPivot.index[neighbourIndex], distance))
    

Recommendations for THE LEGO® BATMAN MOVIE Series 2:

1: Series 17, with distance of 1.0:
2: Series 18: Party, with distance of 1.0:
3: THE LEGO® NINJAGO® MOVIE™, with distance of 1.0:


## 2. TF-IDF Vectorizer

In [44]:
# Prepare dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [63]:
# Turn the product descriptions into TF-IDF features over word unigrams and
# bigrams, with English stop words removed.
# min_df=1 keeps every term that occurs at all (the same vocabulary the former
# min_df=0 produced) while being accepted by recent scikit-learn releases, which
# reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidfMatrix = tf.fit_transform(dfLego['prod_desc'])

In [64]:
# Pairwise dot products between all description vectors; with Y omitted,
# linear_kernel compares the matrix against itself. TfidfVectorizer L2-normalises
# rows by default, so these dot products are cosine similarities.
cosineSimilarity = linear_kernel(tfidfMatrix)

In [68]:
# Container for per-set recommendations; it stays empty for now because the
# draft loop below is commented out.
results = {}

# NOTE(review): the draft below builds `items` with `for i in cosineSimilarity`,
# which iterates over whole similarity rows rather than the top-ranked positions
# computed into `indices` - presumably `for i in indices` was intended; confirm
# before re-enabling.
# for idx, row in dfLego.iterrows():
#     indices = cosineSimilarity[idx].argsort()[:-10:-1]
#     items = [(cosineSimilarity[idx][i], dfLego['number'][i]) for i in cosineSimilarity]

#     results[row['number']] = items[1:]