In [2]:
# Import Pandas
import pandas as pd
from pandas import Series
from numpy.random import randn

# Load Movies Metadata
metadata = pd.read_csv('lego_sets.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,num_reviews,piece_count,play_star_rating,prod_desc,prod_id,review_difficulty,set_name,star_rating,theme_name,val_star_rating,country
0,2.0,277,4.0,Catapult into action and take back the eggs fr...,75823,Average,Bird Island Egg Heist,4.5,Angry Birds™,4.0,US
1,2.0,168,4.0,Launch a flying attack and rescue the eggs fro...,75822,Easy,Piggy Plane Attack,5.0,Angry Birds™,4.0,US
2,11.0,74,4.3,Chase the piggy with lightning-fast Chuck and ...,75821,Easy,Piggy Car Escape,4.3,Angry Birds™,4.1,US


In [3]:
# Calculate C
C = metadata['star_rating'].mean()
print(C)

4.514134009961459


In [4]:
m = metadata['num_reviews'].quantile(0.90)
print(m)

38.0


In [5]:
q_sets = metadata.copy().loc[metadata['num_reviews'] >= m]
q_sets.shape

(1105, 11)

In [6]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['num_reviews']
    R = x['star_rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [7]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_sets['score'] = q_sets.apply(weighted_rating, axis=1)

In [8]:
#Print the top 15 movies
q_sets[['set_name', 'num_reviews', 'star_rating', 'score']].head(15)

Unnamed: 0,set_name,num_reviews,star_rating,score
11,The Eiffel Tower,53.0,4.6,4.564144
13,BOOST Creative Toolbox,63.0,3.4,3.819179
50,High-speed Passenger Train,45.0,4.4,4.452254
77,Flexible and Straight Tracks,89.0,2.5,3.102654
87,Switching Tracks,47.0,4.1,4.285142
88,Straight & Crossroad Plates,85.0,3.5,3.81331
89,T-Junction & Curved Road Plates,40.0,3.9,4.199193
128,Green Baseplate,46.0,2.6,3.465918
133,Brick Separator,180.0,4.8,4.75017
165,Taj Mahal,43.0,4.8,4.66589


In [9]:
#Print plot theme of the first 5 sets.
metadata['prod_desc'].head()

0    Catapult into action and take back the eggs fr...
1    Launch a flying attack and rescue the eggs fro...
2    Chase the piggy with lightning-fast Chuck and ...
3    Explore the architecture of the United States ...
4    Recreate the Solomon R. Guggenheim Museum® wit...
Name: prod_desc, dtype: object

In [10]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['prod_desc'] = metadata['prod_desc'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['prod_desc'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(12261, 1635)

In [11]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
#Construct a reverse map of indices and set names
indices = pd.Series(metadata.index, index=metadata['set_name']).drop_duplicates()

In [13]:
# Function that takes in movie title as input and outputs most similar sets
def get_recommendations(set_name, cosine_sim=cosine_sim):
    # Get the index of the set that matches the set_name
    idx = indices[set_name]

    # Get the pairwsie similarity scores of all sets with that sets
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the sets based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar sets
    sim_scores = sim_scores[1:11]

    # Get the set indices
    set_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar sets
    return metadata['set_name'].iloc[set_indices]

In [14]:
get_recommendations('Bird Island Egg Heist')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()