# Setting up

In [1]:
# Prepare dependencies
import numpy as np
import scipy.stats as stats
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the bare 'seaborn' name, so use whichever
# name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Import datasets
# Read the first cleaned LEGO CSV into `dfLego` and preview its first three rows.
dfLego = pd.read_csv('Data/lego_clean_data_v1.csv')
dfLego.head(3)

Unnamed: 0,setid,number,theme,subtheme,year,name,pieces,price,image_url,owned_by,wanted_by,rating
0,22812,30224-1,City,General,2013,Ride-On Lawn Mower,42.0,3.36,https://images.brickset.com/sets/images/30224-...,1950,1746,4.0
1,22830,21104-1,Ideas,NASA,2014,NASA Mars Science Laboratory Curiosity Rover,295.0,29.99,https://images.brickset.com/sets/images/21104-...,1983,2912,4.0
2,22888,71002-12,Collectable Minifigures,Series 11,2013,Saxophone Player,6.0,2.99,https://images.brickset.com/sets/images/71002-...,5426,2201,4.0


In [3]:
# Read the second cleaned LEGO CSV into `dfLego2` and preview its first rows.
dfLego2 = pd.read_csv('Data/lego_clean_data_v2.csv')
dfLego2.head()

Unnamed: 0,num_reviews,piece_count,prod_desc,prod_id,review_difficulty,set_name,star_rating,theme_name
0,2.0,277,Catapult into action and take back the eggs fr...,75823,Average,Bird Island Egg Heist,4.5,Angry Birds™
1,2.0,168,Launch a flying attack and rescue the eggs fro...,75822,Easy,Piggy Plane Attack,5.0,Angry Birds™
2,11.0,74,Chase the piggy with lightning-fast Chuck and ...,75821,Easy,Piggy Car Escape,4.3,Angry Birds™
3,23.0,1032,Explore the architecture of the United States ...,21030,Average,United States Capitol Building,4.6,Architecture
4,14.0,744,Recreate the Solomon R. Guggenheim Museum® wit...,21035,Challenging,Solomon R. Guggenheim Museum®,4.6,Architecture


# Peeking at the dataset

In [4]:
# Check variables' types
# Print the column dtypes of both frames, separated by a ruler of '=' characters.
print(dfLego.dtypes)
print('='*30)
print(dfLego2.dtypes)

setid          int64
number        object
theme         object
subtheme      object
year           int64
name          object
pieces       float64
price        float64
image_url     object
owned_by       int64
wanted_by      int64
rating       float64
dtype: object
num_reviews          float64
piece_count            int64
prod_desc             object
prod_id                int64
review_difficulty     object
set_name              object
star_rating          float64
theme_name            object
dtype: object


In [5]:
# Cast 'setid' and 'year' to strings and the whole-number count columns
# ('pieces', 'num_reviews') to integers, one astype mapping per frame.
dfLego = dfLego.astype({'setid': str, 'year': str, 'pieces': int})

dfLego2 = dfLego2.astype({'num_reviews': int})

In [6]:
# Statistics summary
# Transposed so each numeric column of dfLego is a row and each statistic a column.
dfLego.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pieces,7022.0,245.596696,422.958131,0.0,33.0,97.0,292.0,7541.0
price,7022.0,26.874006,40.30988,0.0,4.0,12.99,30.0,799.99
owned_by,7022.0,2608.271148,2548.412041,5.0,747.25,1776.5,3655.0,22256.0
wanted_by,7022.0,893.48761,909.318002,7.0,321.25,597.0,1188.75,11089.0
rating,7022.0,4.194375,0.696726,0.9,4.0,4.3,4.7,5.0


In [7]:
# Same transposed statistics summary for the numeric columns of dfLego2.
dfLego2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_reviews,719.0,14.564673,31.127752,1.0,2.0,6.0,12.0,367.0
piece_count,719.0,459.844228,761.064515,1.0,101.0,208.0,494.0,7541.0
prod_id,719.0,61881.18637,149994.028968,630.0,41094.5,45517.0,71223.5,2000431.0
star_rating,719.0,4.50612,0.51297,1.8,4.3,4.6,4.95,5.0


In [8]:
# Function for calculating total counts of A based on B
def total_count(df, col, col2):
    """Count the non-null values of ``col2`` within each group of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    col : label
        Column to group by.
    col2 : label
        Column whose non-null entries are counted per group.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping column followed by the count
        (the count column keeps the name ``col2``).
    """
    return df.groupby(col)[col2].count().reset_index()

In [9]:
# Count the non-null ratings in each theme, then give the count column an
# explicit name so it does not clash with the per-set 'rating' column later.
themeRatingCount = (
    total_count(dfLego, 'theme', 'rating')
    .rename(columns={'rating': 'total_count_rating'})
)

In [10]:
# Function for merging 'total_rating_count_x' or 'total_price_count_y' into 'dfLego'
def merge_df(df, df2, col):
    """Left-join ``df2`` onto ``df`` using the shared key ``col``.

    Every row of ``df`` is kept; rows without a match in ``df2`` get missing
    values in the columns that come from ``df2``.
    """
    return df.merge(df2, on=col, how='left')

In [11]:
# Merge dataframes
# Attach each theme's rating count to every set row (left join on 'theme').
dfLegoRating = merge_df(dfLego, themeRatingCount, 'theme')

In [12]:
# Function for summary statistics for 'total_rating_count_x' or 'total_price_count_y'
def summary_stats(df, col):
    """Print descriptive statistics and the 50%-90% quantiles of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to summarise.
    col : label
        Name of the column to summarise.

    Returns
    -------
    None
        The summary is printed: ``describe()`` output, a ruler of 48 '='
        characters, then the quantiles at 0.5, 0.6, 0.7, 0.8 and 0.9.

    Notes
    -----
    Floats are shown with three decimals. The format is applied through
    ``pd.option_context`` so it is restored on exit and the function does not
    change pandas' global display settings for the rest of the notebook.
    """
    with pd.option_context('display.float_format', '{:.3f}'.format):
        print(df[col].describe())
        print('='*48)
        print(df[col].quantile(np.arange(0.5, 1, 0.1)))

In [13]:
# Summary statistics
# Distribution of the per-theme rating count across all set rows; the median
# reported here is the cut-off used for the "top 50%" filter.
summary_stats(dfLegoRating, 'total_count_rating')

count   7022.000
mean     250.625
std      197.441
min        1.000
25%       66.000
50%      253.000
75%      446.000
max      568.000
Name: total_count_rating, dtype: float64
0.500   253.000
0.600   290.000
0.700   358.000
0.800   466.000
0.900   556.000
Name: total_count_rating, dtype: float64


In [14]:
# Keep top 50% data: the sets whose theme has at least the median rating count.
# The cut-off is computed from the data rather than copied by hand from the
# printed summary, so it stays correct if the input file changes.
threshold = dfLegoRating['total_count_rating'].median()
dfLegoRatingTop50 = dfLegoRating.query('total_count_rating >= @threshold')

# Building the recommendation engine

## 1. k-Nearest Neighbors (kNN)

In [15]:
# Prepare dependencies
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### (A) Theme

In [16]:
# Build a theme-by-set-number table of ratings; cells with no rating become 0.
# NOTE(review): each set number presumably belongs to a single theme, in which
# case every column has one non-zero entry and no two theme rows overlap; that
# would make every pairwise cosine distance exactly 1.0. Confirm against the data.
pivotRatingTheme = (
    dfLegoRatingTop50
    .pivot(index='theme', columns='number', values='rating')
    .fillna(0)
)

# Transform the pivot dataframe into a SciPy CSR sparse matrix for the kNN model
matrixRatingTheme = csr_matrix(pivotRatingTheme.values)


In [17]:
# Build and fit the kNN model
# Brute-force neighbour search using cosine distance; each row of
# matrixRatingTheme (one per theme) is a sample.
themeKNN = NearestNeighbors(metric='cosine', algorithm='brute')
themeKNN.fit(matrixRatingTheme)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [18]:
# Pick a random theme row and ask the model for its 6 nearest rows
# (the queried theme itself plus five recommendations).
queryIndexName = np.random.choice(pivotRatingTheme.shape[0])
queryVector = pivotRatingTheme.iloc[queryIndexName, :].values.reshape(1, -1)
distance, index = themeKNN.kneighbors(queryVector, n_neighbors=6)

# The first neighbour is treated as the query itself, so it only supplies the header.
print('Recommendations for {0}:\n'.format(pivotRatingTheme.index[queryIndexName]))

neighbours = zip(index.flatten()[1:], distance.flatten()[1:])
for rank, (neighbourPos, neighbourDist) in enumerate(neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, pivotRatingTheme.index[neighbourPos], neighbourDist))
    

Recommendations for Star Wars:

1: Space, with distance of 1.0:
2: Creator, with distance of 1.0:
3: Technic, with distance of 1.0:
4: Bionicle, with distance of 1.0:
5: Collectable Minifigures, with distance of 1.0:


### kNN Question: I don't like the result above (every recommended theme is at a cosine distance of exactly 1.0), but I have no idea where I can improve this

## 2. TF-IDF Vectorizer

In [19]:
# Prepare dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [20]:
# Calculate TF-IDF weights for each lego description over unigrams and bigrams,
# ignoring English stop words.
# The resulting matrix has one row per description and one column per term.
# min_df=1 keeps every term that occurs in at least one description, which is
# the same vocabulary as min_df=0, but newer scikit-learn releases reject an
# integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidfMatrix = tf.fit_transform(dfLego2['prod_desc'])

In [21]:
# Calculate cosine similarity
# linear_kernel is a plain dot product; it equals cosine similarity here as long
# as the TF-IDF rows are L2-normalised (TfidfVectorizer's default `norm`).
# Entry [i, j] is the similarity between descriptions i and j.
cosineSimilarity = linear_kernel(tfidfMatrix, tfidfMatrix)

In [23]:
# Build, for every product, the list of its most similar products as
# (similarity score, prod_id) pairs ordered from most to least similar.
results = {}

# Rows of cosineSimilarity line up with row *positions* in dfLego2, not with
# its index labels, so do all lookups positionally on a plain array.
prodIds = dfLego2['prod_id'].to_numpy()

for pos, prodId in enumerate(prodIds):
    scores = cosineSimilarity[pos]

    # Rank every product from most to least similar to this one.
    ranked = scores.argsort()[::-1]

    # Exclude the product itself explicitly. Dropping "the first entry" is only
    # correct when the product is guaranteed to rank first, which fails when
    # another description is identical or when this TF-IDF row is all zeros.
    # Keep 98 neighbours, the same number the previous slicing produced.
    ranked = ranked[ranked != pos][:98]

    results[prodId] = [(scores[i], prodIds[i]) for i in ranked]

In [26]:
# Test the model and make recommendations
def item(id):
    """Return the display text for the product whose ``prod_id`` equals ``id``.

    The text is the product description of the first matching row in
    ``dfLego2``, cut at the first ' - ' separator if one is present.
    Raises ``IndexError`` when no row matches.
    """
    matches = dfLego2.loc[dfLego2['prod_id'] == id, 'prod_desc']
    description = matches.tolist()[0]
    return description.split(' - ')[0]


def recommend(item_id, num):
    """Print the ``num`` best precomputed recommendations for ``item_id``.

    Looks up ``item_id`` in the module-level ``results`` mapping, whose values
    are (score, prod_id) pairs, and prints one line per recommendation after a
    short header.
    """
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    for score, prod_id in results[item_id][:num]:
        print("Recommended: " + item(prod_id) + " (score:" + str(score) + ")")

In [33]:
# Show the five closest products to prod_id 75821.
recommend(item_id=75821, num=5)


Recommending 5 products similar to Chase the piggy with lightning-fast Chuck and rescue the eggs!...
-------
Recommended: Launch a flying attack and rescue the eggs from the Piggy Plane! (score:0.2800555018888261)
Recommended: Catapult into action and take back the eggs from the Piggy Trike! (score:0.17621493998679572)
Recommended: Scream around the circuit in the lightning-fast LEGO® Technic 24 Hours Race Car! (score:0.15732853487116075)
Recommended: Chase the Pteranodon and rescue it from danger! (score:0.1354159564460105)
Recommended: Speed to the rescue with Jay’s Lightning Jet! (score:0.1127751529169958)


### TF-IDF Question: I'm not sure this is the right track. The result seems odd to me...