## Using KNN for a Movie recommender and score predictor


### **Import the required Python libraries**

In [1]:
import pandas as pd
import json
from scipy import spatial
import operator
import warnings
warnings.filterwarnings('ignore')


### **Import the dataset**

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

**Converting JSON into strings**

In [3]:
# changing the genres column from json to string
movies['genres'] = movies['genres'].apply(json.loads)
for index,i in zip(movies.index,movies['genres']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name'])) # the key 'name' contains the name of the genre
    movies.loc[index,'genres'] = str(list1)

# changing the keywords column from json to string
movies['keywords'] = movies['keywords'].apply(json.loads)
for index,i in zip(movies.index,movies['keywords']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    movies.loc[index,'keywords'] = str(list1)
    
# changing the production_companies column from json to string
movies['production_companies'] = movies['production_companies'].apply(json.loads)
for index,i in zip(movies.index,movies['production_companies']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    movies.loc[index,'production_companies'] = str(list1)

# changing the cast column from json to string
credits['cast'] = credits['cast'].apply(json.loads)
for index,i in zip(credits.index,credits['cast']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    credits.loc[index,'cast'] = str(list1)

# changing the crew column from json to string    
credits['crew'] = credits['crew'].apply(json.loads)
def director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
credits['crew'] = credits['crew'].apply(director)
credits.rename(columns={'crew':'director'},inplace=True)

 **Merging Movies and Credits together**

In [4]:
movies = movies.merge(credits,left_on='id',right_on='movie_id',how='left')
movies = movies[['id','original_title','genres','cast','vote_average','director','keywords']]

**Cleaning the Genres column**

In [5]:
print(movies['genres'])
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'')
print(movies['genres'])
movies['genres'] = movies['genres'].str.split(',')
print(movies['genres'])
for i,j in zip(movies['genres'],movies.index):
    list2=i
    list2.sort()
    movies.loc[j,'genres']=str(list2)
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['genres'] = movies['genres'].str.split(',')


0       ['Action', 'Adventure', 'Fantasy', 'Science Fi...
1                      ['Adventure', 'Fantasy', 'Action']
2                        ['Action', 'Adventure', 'Crime']
3                ['Action', 'Crime', 'Drama', 'Thriller']
4              ['Action', 'Adventure', 'Science Fiction']
                              ...                        
4798                      ['Action', 'Crime', 'Thriller']
4799                                ['Comedy', 'Romance']
4800           ['Comedy', 'Drama', 'Romance', 'TV Movie']
4801                                                   []
4802                                      ['Documentary']
Name: genres, Length: 4803, dtype: object
0       Action,Adventure,Fantasy,ScienceFiction
1                      Adventure,Fantasy,Action
2                        Action,Adventure,Crime
3                   Action,Crime,Drama,Thriller
4               Action,Adventure,ScienceFiction
                         ...                   
4798                      Action

Now lets generate a list 'genreList' with all possible unique genres mentioned in the dataset.



In [6]:
genreList=[]
for index, row in movies.iterrows():
    genres = row["genres"]
    
    for genre in genres:
        if genre not in genreList:
            genreList.append(genre)

**One-Hot Encoding for multiple labels**

In [7]:
def binary(genre_list):
    binaryList = []
    
    for genre in genreList:
        if genre in genre_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [8]:
print(genreList)
movies['genres_bin'] = movies['genres'].apply(lambda x: binary(x))
movies['genres_bin'].head()

['Action', 'Adventure', 'Fantasy', 'ScienceFiction', 'Crime', 'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy', 'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music', 'Documentary', 'Foreign', 'TVMovie', '']


0    [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: genres_bin, dtype: object

## **Working with the Cast Column**
 

In [9]:
movies['cast'] = movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')
movies['cast'] = movies['cast'].str.split(',')

In [10]:
for i,j in zip(movies['cast'],movies.index):
    list2 = i[:6]
    movies.loc[j,'cast'] = str(list2)
movies['cast'] = movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['cast'] = movies['cast'].str.split(',')
for i,j in zip(movies['cast'],movies.index):
    list2 = i
    list2.sort()
    movies.loc[j,'cast'] = str(list2)
movies['cast']=movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'')

In [11]:
castList = []
for index, row in movies.iterrows():
    cast = row["cast"]
    
    for i in cast:
        if i not in castList:
            castList.append(i)

In [12]:
def binary(cast_list):
    binaryList = []
    
    for genre in castList:
        if genre in cast_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [13]:
movies['cast_bin'] = movies['cast'].apply(lambda x: binary(x))
movies['cast_bin'].head()

0    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1    [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...
2    [0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
3    [1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...
4    [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...
Name: cast_bin, dtype: object

## **Working with Director column**

In [14]:
def xstr(s):
    if s is None:
        return ''
    return str(s)
movies['director'] = movies['director'].apply(xstr)

In [15]:
directorList=[]
for i in movies['director']:
    if i not in directorList:
        directorList.append(i)

In [16]:
def binary(director_list):
    binaryList = []  
    for direct in directorList:
        if direct in director_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [17]:
movies['director_bin'] = movies['director'].apply(lambda x: binary(x))
movies['director_bin'].head()

0    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: director_bin, dtype: object

## **Working with the Keywords column**

In [18]:
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')
movies['keywords'] = movies['keywords'].str.split(',')
for i,j in zip(movies['keywords'],movies.index):
    list2 = []
    list2 = i
    movies.loc[j,'keywords'] = str(list2)
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['keywords'] = movies['keywords'].str.split(',')
for i,j in zip(movies['keywords'],movies.index):
    list2 = []
    list2 = i
    list2.sort()
    movies.loc[j,'keywords'] = str(list2)
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['keywords'] = movies['keywords'].str.split(',')

In [19]:
words_list = []
for index, row in movies.iterrows():
    genres = row["keywords"]
    
    for genre in genres:
        if genre not in words_list:
            words_list.append(genre)

In [20]:
def binary(words):
    binaryList = []
    for genre in words_list:
        if genre in words:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [21]:
movies['words_bin'] = movies['keywords'].apply(lambda x: binary(x))
movies = movies[(movies['vote_average']!=0)] #removing the movies with 0 score and without director names 
movies = movies[movies['director']!='']

## Using Cosine Similarity

In [22]:
def Similarity(movieId1, movieId2):
    a = movies.iloc[movieId1]
    b = movies.iloc[movieId2]
    
    genresA = a['genres_bin']
    genresB = b['genres_bin']
    genreDistance = spatial.distance.cosine(genresA, genresB)
    
    scoreA = a['cast_bin']
    scoreB = b['cast_bin']
    scoreDistance = spatial.distance.cosine(scoreA, scoreB)
    
    directA = a['director_bin']
    directB = b['director_bin']
    directDistance = spatial.distance.cosine(directA, directB)
    
    
    wordsA = a['words_bin']
    wordsB = b['words_bin']
    wordsDistance = spatial.distance.cosine(wordsA, wordsB)
    
    return genreDistance + scoreDistance + directDistance*(1.5) + wordsDistance*(1.5)

In [23]:
new_id = list(range(0,movies.shape[0]))
movies['new_id']=new_id
movies=movies[['original_title','genres','vote_average','genres_bin','cast_bin','new_id','director','director_bin','words_bin']]

## **Score Predictor**

In [24]:


def predict_score(name):
    #name = input('Enter a movie title: ')
    new_movie = movies[movies['original_title'].str.contains(name)].iloc[0].to_frame().T
    print('Selected Movie: ',new_movie.original_title.values[0])
    def getNeighbors(baseMovie, K):
        distances = []
    
        for index, movie in movies.iterrows():
            if movie['new_id'] != baseMovie['new_id'].values[0]:
                dist = Similarity(baseMovie['new_id'].values[0], movie['new_id'])
                distances.append((movie['new_id'], dist))
    
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
    
        for x in range(K):
            neighbors.append(distances[x])
        return neighbors

    K = 7
    avgRating = 0
    neighbors = getNeighbors(new_movie, K)
    
    print('\nRecommended Movies: \n')
    for neighbor in neighbors:
        avgRating = avgRating+movies.iloc[neighbor[0]][2]  
        print( movies.iloc[neighbor[0]][0]+" | Genres: "+str(movies.iloc[neighbor[0]][1]).strip('[]').replace(' ','')+" | Rating: "+str(movies.iloc[neighbor[0]][2]))
    
    print('\n')
    avgRating = avgRating/K
    print('The predicted rating for %s is: %f' %(new_movie['original_title'].values[0],avgRating))
    print('The actual rating for %s is %f' %(new_movie['original_title'].values[0],new_movie['vote_average']))

In [25]:
predict_score('Avatar')

Selected Movie:  Avatar

Recommended Movies: 

Aliens | Genres: 'Action','Horror','ScienceFiction','Thriller' | Rating: 7.7
The Abyss | Genres: 'Action','Adventure','ScienceFiction','Thriller' | Rating: 7.1
Terminator 2: Judgment Day | Genres: 'Action','ScienceFiction','Thriller' | Rating: 7.7
The Terminator | Genres: 'Action','ScienceFiction','Thriller' | Rating: 7.3
Jupiter Ascending | Genres: 'Action','Adventure','Fantasy','ScienceFiction' | Rating: 5.2
Star Trek Into Darkness | Genres: 'Action','Adventure','ScienceFiction' | Rating: 7.4
Independence Day | Genres: 'Action','Adventure','ScienceFiction' | Rating: 6.7


The predicted rating for Avatar is: 7.014286
The actual rating for Avatar is 7.200000


In [26]:
predict_score('Interstellar')

Selected Movie:  Interstellar

Recommended Movies: 

Silent Running | Genres: 'Adventure','Drama','ScienceFiction' | Rating: 6.3
Stargate: The Ark of Truth | Genres: 'Adventure','ScienceFiction' | Rating: 6.9
The Martian | Genres: 'Adventure','Drama','ScienceFiction' | Rating: 7.6
The Prestige | Genres: 'Drama','Mystery','Thriller' | Rating: 8.0
Inception | Genres: 'Action','Adventure','Mystery','ScienceFiction','Thriller' | Rating: 8.1
A.I. Artificial Intelligence | Genres: 'Adventure','Drama','ScienceFiction' | Rating: 6.8
The Dark Knight | Genres: 'Action','Crime','Drama','Thriller' | Rating: 8.2


The predicted rating for Interstellar is: 7.414286
The actual rating for Interstellar is 8.100000
