In [23]:
# Build a movie recommendation system
# Following this youtube lesson https://www.youtube.com/watch?v=ueKXSupHz6Q

# Import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.feature_extraction.text import CountVectorizer as countVectorizer
from sklearn.feature_extraction.text import CountVectorizer



In [24]:
# Load the data
from google.colab import files
uploaded = files.upload()


Saving IMDB-Movie-Data.csv to IMDB-Movie-Data (1).csv


In [36]:
# Store the data
df = pd.read_csv('IMDB-Movie-Data.csv')
df.head(10)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0
5,6,The Great Wall,"Action,Adventure,Fantasy",European mercenaries searching for black powde...,Yimou Zhang,"Matt Damon, Tian Jing, Willem Dafoe, Andy Lau",2016,103,6.1,56036,45.13,42.0
6,7,La La Land,"Comedy,Drama,Music",A jazz pianist falls for an aspiring actress i...,Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,258682,151.06,93.0
7,8,Mindhorn,Comedy,A has-been actor best known for playing the ti...,Sean Foley,"Essie Davis, Andrea Riseborough, Julian Barrat...",2016,89,6.4,2490,,71.0
8,9,The Lost City of Z,"Action,Adventure,Biography","A true-life drama, centering on British explor...",James Gray,"Charlie Hunnam, Robert Pattinson, Sienna Mille...",2016,141,7.1,7188,8.01,78.0
9,10,Passengers,"Adventure,Drama,Romance",A spacecraft traveling to a distant colony pla...,Morten Tyldum,"Jennifer Lawrence, Chris Pratt, Michael Sheen,...",2016,116,7.0,192177,100.01,41.0


In [37]:
#  Get a count of the number of rows/movies in the data set
df.shape

(1000, 12)

In [38]:
# Create a list of important columns for recommendation engine
columns = ['Actors', 'Director', 'Genre', 'Title']


In [39]:
# Show the data
df[columns].head(3)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split


In [40]:
# Check for any missing values in the important columns
df[columns].isnull().values.any()

False

In [41]:
# Create a function to combine the values of the important columns into a single string
def get_important_features(data):
  important_features = []
  for i in range(0, data.shape[0]):
    important_features.append(data['Actors'][i] + ' '+ data['Director'][i] + ' ' + data['Genre'][i]+ ' '+data['Title'][i] )
  
  return important_features  


In [43]:
 # Createa column to hold the combined strings
 df['important_features'] = get_important_features(df)

 # Show the data
 df.head(2)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."


In [44]:
# Convert the text to a matrix of token counts
cm = CountVectorizer().fit_transform(df['important_features'])

In [46]:
# Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)

#Print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [47]:
# Get the shape of the cosine similarity matrix
cs.shape


(1000, 1000)

In [None]:
df


In [56]:
# Get the title of the movie that the user likes
title = 'The Amazing Spider-Man'

# Find the movies id
# !!!!!!!!!!! Attention !!!!  by some reason movie_id is not presented in the given dataset, therefore, below, I replaced the unique column 'Movie_id' by also unique column 'Rank' 
movie_id = df[df.Title == title]['Rank'].values[0]

In [59]:
# Create a list of enumerations for the similarity
# we want to get list [(movie_id, similarity score),...,]
scores = list(enumerate(cs[movie_id]))



In [61]:
# Sort
sorted_scores  = sorted(scores, key=lambda x:x[1], reverse =True)
sorted_scores = sorted_scores[1:]

In [62]:
# print
print(sorted_scores)

[(516, 0.25048971643405976), (177, 0.2369395511036369), (619, 0.21535276082326618), (72, 0.21483446221182984), (26, 0.20701966780270625), (89, 0.20701966780270625), (91, 0.20701966780270625), (149, 0.20701966780270625), (373, 0.20701966780270625), (488, 0.20701966780270625), (718, 0.20701966780270625), (856, 0.20701966780270625), (920, 0.20701966780270625), (29, 0.19999999999999996), (58, 0.19999999999999996), (210, 0.19999999999999996), (232, 0.19999999999999996), (301, 0.19999999999999996), (355, 0.19999999999999996), (385, 0.19999999999999996), (693, 0.19999999999999996), (735, 0.19999999999999996), (787, 0.19999999999999996), (880, 0.19999999999999996), (125, 0.19364916731037082), (387, 0.19364916731037082), (436, 0.19364916731037082), (576, 0.19364916731037082), (603, 0.19364916731037082), (854, 0.19364916731037082), (107, 0.1878672873255448), (268, 0.1878672873255448), (491, 0.1878672873255448), (126, 0.18257418583505536), (163, 0.18257418583505536), (234, 0.18257418583505536), (

In [69]:
# Create a loop to print the first 8 similar movies
j=0
print('The 8 most reciommended movies to', title, 'are:\n')
for item in sorted_scores:
  movie_title = df[df.Rank == item[0]]['Title'].values[0]
  print(j+1, movie_title)
  j=j+1
  if j>7:
     break

The 8 most reciommended movies to The Amazing Spider-Man are:

1 Easy A
2 Dawn of the Planet of the Apes
3 Mr. Right
4 Allied
5 Paris pieds nus
6 The Hateful Eight
7 Prisoners
8 Superbad
