Let's import our data and have a look at it

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw IMDb movies export into `df`.
# low_memory=False asks pandas to parse the file in one pass instead of in chunks,
# which avoids mixed-type inference on columns.
df = pd.read_csv('IMDb movies.csv',low_memory=False)

In [3]:
df.shape

(85855, 22)

In [4]:
# Optional filter (disabled): uncomment to keep only rows whose 'country' is exactly 'USA'.
# It has to run before the cell below that drops the 'country' column.
#df = df[df['country'] == 'USA']

In [5]:
df.head(3)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0


Before we get started, we should drop unnecessary fields

In [6]:
# Drop every column the recommender does not use; what remains is original_title,
# genre, director, actors, description, avg_vote and votes.
# The frame is reassigned rather than mutated with inplace=True.
# 'worlwide_gross_income' is spelled exactly as in the dataset header.
df = df.drop(columns=[
    'imdb_title_id', 'year', 'title', 'date_published', 'duration', 'language',
    'production_company', 'budget', 'usa_gross_income', 'worlwide_gross_income',
    'metascore', 'reviews_from_users', 'reviews_from_critics', 'country', 'writer',
])


In [7]:
# The original 'title' column was dropped above, so expose 'original_title' under
# the name 'title' for the rest of the notebook.
df = df.rename(columns={'original_title': 'title'})

In [8]:
df.head(3)

Unnamed: 0,title,genre,director,actors,description,avg_vote,votes
0,Miss Jerry,Romance,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154
1,The Story of the Kelly Gang,"Biography, Crime, Drama",Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589
2,Den sorte drøm,Drama,Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188


Also, to make calculations easier, we will use the 90th percentile of votes as our cutoff `m`. In other words, for a movie to feature in the charts, it must have at least as many votes as 90% of the movies in the list.

In [9]:
# m: minimum number of votes a movie needs to qualify, taken as the
# 90th percentile of 'votes' over the whole dataset.
m = df['votes'].quantile(0.9)
m

9819.600000000006

Now, we can filter out the movies

In [10]:
# Keep only the movies with at least m votes.
# Filter first and copy the (much smaller) result: df1 is still an independent frame,
# but the full dataset is never duplicated in memory.
df1 = df.loc[df['votes'] >= m].copy()
df1.shape

(8586, 7)

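Now we need a metric to score or rate each movie. We could use the average rating as the score, but that would not be fair: a movie with an 8.9 average rating and only 3 votes cannot be considered better than a movie with a 7.8 average rating but 40 votes. So, we'll be using IMDB's weighted rating, $WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$, where $v$ is the number of votes for the movie, $m$ is the vote cutoff computed above, $R$ is the movie's average rating and $C$ is the mean rating across the movies being ranked.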

In [11]:
# Remove every row that has a missing value in any of the remaining columns.
# Note that the cutoff m was computed before this step, on the unfiltered data.
df1 = df1.dropna()

In [12]:
# C: mean 'avg_vote' of the movies that survived the vote cutoff and dropna();
# the weighted rating pulls movies with few votes towards this value.
C = df1['avg_vote'].mean()
C

6.652328159645232

The mean rating of the movies that passed the vote cutoff is approximately 6.65 on a scale of 10.

In [13]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with the overall mean, weighted by its vote count.

    Args:
        x: object indexable by 'votes' and 'avg_vote'.
        m: vote cutoff; defaults to the notebook-level `m` at the time this cell runs.
        C: mean rating; defaults to the notebook-level `C` at the time this cell runs.

    Returns:
        v/(v+m) * R + m/(m+v) * C, where v is x['votes'] and R is x['avg_vote'].
    """
    vote_count, rating = x['votes'], x['avg_vote']
    # Calculation based on the IMDB formula: the more votes a movie has, the more
    # its own rating counts and the less the overall mean does.
    own_part = vote_count / (vote_count + m) * rating
    mean_part = m / (m + vote_count) * C
    return own_part + mean_part

In [14]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# The function only does column arithmetic on 'votes' and 'avg_vote', so passing the
# whole frame scores every movie in one vectorised call instead of once per row.
df1['score'] = weighted_rating(df1)

Finally, let's sort the DataFrame by the score feature so that the highest-scoring movies come first, and renumber the rows to match the new order.

In [15]:
# Sort movies by the score calculated above, best first, and renumber the rows
# 0..n-1 without keeping the old index as a column. After this, index labels equal
# row positions, which the recommendation function relies on when it uses them to
# index the similarity matrix.
df1 = df1.sort_values('score', ascending=False).reset_index(drop=True)

In [16]:
def set_title_lower(x):
    """Return the title string `x` converted to lower case."""
    lowered = x.lower()
    return lowered

# Lower-case every title so that title lookups can be made case-insensitive.
df1['title'] = df1['title'].map(set_title_lower)

## Building the recommender system

In [17]:
def get_actors(x):
    """Turn a comma-separated actor string into at most three compact name tokens.

    Each name is lower-cased and has its spaces removed, so 'Tim Robbins' becomes
    'timrobbins'. Only the first three names are kept.

    Returns None when `x` has no string methods (for example NaN or None).
    """
    try:
        cast = x.lower().replace(' ', '').split(',')
    except AttributeError:
        return None
    return cast[:3]

# Replace the raw actor string of every movie with its list of name tokens.
df1['actors'] = df1['actors'].map(get_actors)

In [18]:
def get_genre(x):
    """Split a comma-separated genre string into a list of lower-case genre names.

    The dataset separates genres with ', ', so whitespace around each name is
    stripped; otherwise every genre after the first would carry a leading space
    (' crime' instead of 'crime').

    Returns None when `x` has no string methods (for example NaN or None).
    """
    try:
        return [name.strip() for name in x.lower().split(',')]
    except AttributeError:
        return None

# Replace the raw genre string of every movie with its list of genres.
df1['genre'] = df1['genre'].map(get_genre)

In [19]:
def get_director(x):
    """Turn a comma-separated director string into a list of compact name tokens.

    Each name is lower-cased and has its spaces removed, so 'Frank Darabont'
    becomes 'frankdarabont'.

    Returns None when `x` has no string methods (for example NaN or None).
    """
    try:
        compact = x.lower().replace(' ', '')
        return compact.split(',')
    except AttributeError:
        return None

# Replace the raw director string of every movie with its list of name tokens.
df1['director'] = df1['director'].map(get_director)

In [20]:
#Extracting keywords from description
from rake_nltk import Rake
# Placeholder only: the column is overwritten with the extracted keywords further down in this cell.
df1['keywords'] = ''

def get_keywords(x):
    """Extract keywords from the description text `x` with RAKE.

    Returns:
        A list of the words RAKE computed a degree for (the keys of
        `get_word_degrees()`), or None if extraction raised an error, for example
        because `x` is not a string.
    """
    r = Rake()
    try:
        r.extract_keywords_from_text(x)
        return list(r.get_word_degrees().keys())
    except Exception:
        # Best effort: a description that cannot be processed yields no keywords.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate, so interrupting the kernel
        # actually stops a long extraction run.
        return None
    
# Derive the keyword list of every description, then discard the raw text column.
df1['keywords'] = df1['description'].map(get_keywords)
del df1['description']


In [21]:
df1.head(3)

Unnamed: 0,title,genre,director,actors,avg_vote,votes,score,keywords
0,the shawshank redemption,[drama],[frankdarabont],"[timrobbins, morganfreeman, bobgunton]",9.3,2278845,9.28864,"[common, decency, number, two, imprisoned, men..."
1,the godfather,"[crime, drama]",[francisfordcoppola],"[marlonbrando, alpacino, jamescaan]",9.2,1572674,9.184191,"[aging, patriarch, organized, crime, dynasty, ..."
2,the dark knight,"[action, crime, drama]",[christophernolan],"[christianbale, heathledger, aaroneckhart]",9.0,2241615,8.989761,"[people, gotham, menace, known, fight, injusti..."


In [22]:
# Build the 'soup' string of every movie row by row.
# NOTE(review): the next code cell resets 'soup' and rebuilds it with create_soup(),
# so the values written here are overwritten; this cell looks like an earlier draft.
df1['soup'] = ''

for index, row in df1.iterrows():
    try:
        words = ' '.join(row['genre']) + ' ' + ' '.join(row['director']) + ' ' + ' '.join(row['actors']) + ' ' + ' '.join(row['keywords'])
    except TypeError:
        # str.join raises TypeError when a field is None (or otherwise not a
        # sequence of strings); such a row gets no soup. Any other error, such as a
        # missing column, is a real problem and is allowed to surface.
        words = None

    df1.at[index, 'soup'] = words
    

We are now in a position to create our "metadata soup", which is a string that contains all the metadata that we want to feed to our vectorizer (namely actors, director, genre and keywords)

In [23]:
# Reset the column; it is fully recomputed by the apply() call at the end of this cell.
df1['soup'] = ''

def create_soup(x):
    """Build the metadata soup of one movie.

    Joins the tokens of x['genre'], x['director'], x['actors'] and x['keywords'],
    in that order, into a single space-separated string.
    """
    fields = ('genre', 'director', 'actors', 'keywords')
    return ' '.join(' '.join(x[field]) for field in fields)
# Build the soup string of every movie (axis=1 passes one row at a time to create_soup).
df1['soup'] = df1.apply(create_soup, axis=1)

In [24]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Turn every soup string into a vector of token counts, ignoring English stop words.
# Row i of count_matrix describes row i of df1.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df1['soup'])

In [25]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# cosine_sim[i][j] is the cosine similarity between the count vectors of movies i and j.
# NOTE(review): this is an n x n matrix over all rows of df1 — confirm its memory
# footprint (and pickled size) is acceptable for the number of movies kept.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [26]:
#Construct a reverse map of indices and movie titles
# (lower-cased title -> index label of that movie in df1).
# Titles are not de-duplicated here, so looking up a title shared by several movies
# returns a Series of labels rather than a single label.
indices = pd.Series(df1.index, index=df1['title'])

We are now in a good position to define our recommendation function. We'll follow these steps:

Get the index of the movie given its title.  
Get the list of cosine similarity scores for that particular movie with all movies. Convert it into a list of tuples where the first element is its position and the second is the similarity score.  
Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.  
Get the top 10 elements of this list. Ignore the first element as it refers to self (the movie most similar to a particular movie is the movie itself).  
Return the titles corresponding to the indices of the top elements.  

In [27]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title):
    """Return the titles of the 10 movies most similar to `title`.

    The lookup is case-insensitive. Similarity is read from the notebook-level
    `cosine_sim` matrix, and titles from `df1`, using the `indices` reverse map.

    Args:
        title: movie title to look up.

    Returns:
        A pandas Series with up to 10 titles, most similar first, or None when the
        title is not in `indices` or is not a string.
    """
    try:
        # Get the index of the movie that matches the title
        idx = indices[title.lower()]
    except (KeyError, AttributeError):
        # Unknown title, or `title` has no .lower() (e.g. None).
        return None

    # Several movies can share a title, in which case the lookup yields a Series.
    # Use the first match: df1 is sorted by score, so that is the best-rated one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all other movies with that movie.
    # The movie itself is excluded explicitly rather than by dropping the top-ranked
    # entry, which would be wrong whenever another movie ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep df1 order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the indices of the 10 most similar movies
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return the top 10 most similar movies
    return df1['title'].iloc[movie_indices]

In [28]:
get_recommendations('batman')

4548                           batman: gotham by gaslight
120                                         batman begins
3018                                       batman returns
6847                                          the getaway
53                                  the dark knight rises
950                                       the dirty dozen
6780    birds of prey: and the fantabulous emancipatio...
2923                                        little caesar
5686                                        the two jakes
3719                                      xin su shi jian
Name: title, dtype: object

Now we can export the movie data and the cosine similarity matrix as pickle files to be used by our main.py file

In [29]:
# Save only the two columns being exported — the lower-cased title and the soup
# string of every movie — to 'data.pickle'.
df2 = df1[['title','soup']]
df2.to_pickle('data.pickle')

In [30]:
import pickle
# Write the similarity matrix to 'cosine.pickle'. The context manager closes the
# file even if pickle.dump raises.
with open('cosine.pickle', 'wb') as file:
    pickle.dump(cosine_sim, file)

# NOTE(review): the string below is a disabled alternative export (gzip-compressed,
# pickletools-optimized). Because it is a bare string expression at the end of the
# cell, Jupyter echoes it as the cell's output; consider deleting it or moving it to
# a markdown cell.
"""
import gzip, pickle, pickletools
with gzip.open('cosine.pkl', "wb") as f:
    pickled = pickle.dumps(cosine_sim)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)
"""

'\nimport gzip, pickle, pickletools\nwith gzip.open(\'cosine.pkl\', "wb") as f:\n    pickled = pickle.dumps(cosine_sim)\n    optimized_pickle = pickletools.optimize(pickled)\n    f.write(optimized_pickle)\n'