In [1]:
import pandas as pd

metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

print(metadata.head)

<bound method NDFrame.head of        adult                              belongs_to_collection    budget  \
0      False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1      False                                                NaN  65000000   
2      False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3      False                                                NaN  16000000   
4      False  {'id': 96871, 'name': 'Father of the Bride Col...         0   
...      ...                                                ...       ...   
45461  False                                                NaN         0   
45462  False                                                NaN         0   
45463  False                                                NaN         0   
45464  False                                                NaN         0   
45465  False                                                NaN         0   

                                             

In [2]:
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [3]:
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [5]:
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
print(q_movies.shape)

(4555, 24)


In [6]:
print(metadata.shape)

(45466, 24)


In [8]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # IMDB formula ~ recommendation formula
    return (v / (v + m) * R) + (m / (m + v) * C)


q_movies['score'] = q_movies.apply(weighted_rating, axis=1)  # calculate score by weighted rating

q_movies = q_movies.sort_values('score', ascending=False)  # sort movies

# Best 15 movies
print(q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20))

                                 title  vote_count  vote_average     score
314           The Shawshank Redemption      8358.0           8.5  8.445869
834                      The Godfather      6024.0           8.5  8.425439
10309      Dilwale Dulhania Le Jayenge       661.0           9.1  8.421453
12481                  The Dark Knight     12269.0           8.3  8.265477
2843                        Fight Club      9678.0           8.3  8.256385
292                       Pulp Fiction      8670.0           8.3  8.251406
522                   Schindler's List      4436.0           8.3  8.206639
23673                         Whiplash      4376.0           8.3  8.205404
5481                     Spirited Away      3968.0           8.3  8.196055
2211                 Life Is Beautiful      3643.0           8.3  8.187171
1178            The Godfather: Part II      3418.0           8.3  8.180076
1152   One Flew Over the Cuckoo's Nest      3001.0           8.3  8.164256
351                      

In [9]:
"""The movies above are the top 25 movies based on the number of voters and the vote average, this is called simple
recommendation"""

'The movies above are the top 25 movies based on the number of voters and the vote average, this is called simple\nrecommendation'

In [10]:
print(metadata['overview'].head())

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

metadata['overview'] = metadata['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(metadata['overview'])

print(tfidf_matrix.shape)

(45466, 75827)


In [12]:
print(tfidf.get_feature_names()[5000:5010])

['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon', 'avant', 'avanthika', 'avanti', 'avaracious']


In [13]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix equasion
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print(cosine_sim.shape)

(45466, 45466)


In [14]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

print(indices[:10])

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64


In [23]:
def recommendation(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))

    # get the most similar movies based on a similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the most similar movies
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    # return most similar movies
    return metadata['title'].iloc[movie_indices]


recommendation('Kung Fu Panda')

17213                           Kung Fu Panda 2
30939     Kung Fu Panda: Secrets of the Masters
18335                     Kung Fu Panda Holiday
34061    Films of Fury: The Kung Fu Movie Movie
39432            Young Dragons: Kung Fu Kids II
43787                   A Gift with a Character
19894                        The 36 Crazy Fists
7767                        Meet the Applegates
11360                              Jade Warrior
9733                             Kung Fu Hustle
Name: title, dtype: object

In [26]:
recommendation('Batman v Superman: Dawn of Justice')

15511                           Batman: Under the Red Hood
21194    Batman Unmasked: The Psychology of the Dark Kn...
9169             The Batman Superman Movie: World's Finest
40974    LEGO DC Comics Super Heroes: Batman: Be-Leaguered
21400                      Batman: Mystery of the Batwoman
1328                                        Batman Returns
34858                                         Wonder Woman
26403                      JLA Adventures: Trapped in Time
9230                    Batman Beyond: Return of the Joker
19792              Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [24]:
"""This type of recommendation is a Content-Based Recommender which looks for keywords that describe each movie that 
most similar to the movies you watch. This recommendation system uses the cosine similarity score equasion to find
the most similar movies."""

'This type of recommendation is a Content-Based Recommender which looks for keywords that describe each movie that \nmost similar to the movies you watch. This recommendation system uses the cosine similarity score equasion to find\nthe most similar movies.'

In [27]:
__sources__ = 'https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101', 
'https://www.datacamp.com/community/tutorials/recommender-systems-python', 
'https://realpython.com/build-recommendation-engine-collaborative-filtering/'
__creator__ = 'Henry Boisdequin'