# Create dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords



In [3]:
import pandas as pd

#create movieid: cast list dictionary 
fc = pd.read_csv('FilmCast.csv', header = 0) #title 
fd = pd.read_csv('FilmDirector.csv', header = 0) #title 
person = pd.read_csv('Person.csv', header = 0) #title 
fc = pd.merge(person,fc,on='personID')
fd = pd.merge(person,fd,on='personID')
person = fc.append(fd)
movie_person = dict()
for i in range(len(person)):
    if person.iloc[i,2] in movie_person:
        movie_person[person.iloc[i,2]].append(person.iloc[i,1].replace(" ", ""))
    else:
        movie_person[person.iloc[i,2]] = [person.iloc[i,1].replace(" ", "")]

#create movieid: genre list dictionary 
genre = pd.read_csv('GenreOfFilm.csv', header = 0)
genlist = pd.read_csv('Genres.csv', header = 0)
gen = pd.merge(genlist,genre,on='genreID')
movie_genre = dict()
for i in range(len(gen)):
    if gen.iloc[i,2] in movie_genre:
        movie_genre[gen.iloc[i,2]].append(gen.iloc[i,0])
    else:
        movie_genre[gen.iloc[i,2]] = [gen.iloc[i,0]]
        
        
movies = pd.read_csv('Movie.csv', header = 0) #title 
for i in range(len(movies)):
    movies.iloc[i,4] = [' '.join(word for word in movie_genre[movies.iloc[i,0]])]       #genre list for that movieid
    movies.iloc[i,5] = [' '.join(word for word in movie_person[movies.iloc[i,0]])]      #cast list for that movieid
    
# movies = movies.rename(columns = {'total_ratings':'genre', 'rating_count':'cast'})

In [4]:
movies.head()

Unnamed: 0,movieID,title,year,description,total_ratings,rating_count
0,114709.0,Toy Story,1995,A cowboy doll is profoundly threatened and jea...,Animation,TomHanks TimAllen JimVarney DonRickles JohnLas...
1,113497.0,Jumanji,1995,When two kids find and play a magical board ga...,Adventure,RobinWilliams KirstenDunst BonnieHunt Jonathan...
2,113228.0,Grumpier Old Men,1995,John and Max resolve to save their beloved bai...,Comedy,SophiaLoren Ann-Margret JackLemmon WalterMatth...
3,114885.0,Waiting to Exhale,1995,"Based on Terry McMillan's novel, this film fol...",Comedy,AngelaBassett WhitneyHouston LelaRochon Lorett...
4,113041.0,Father of the Bride Part II,1995,George Banks must deal not only with the pregn...,Comedy,SteveMartin DianeKeaton MartinShort KimberlyWi...


# Clean up description

In [5]:
stop_words = set(stopwords.words('english'))
for i in range(len(movies)):
    try:
        desc = movies.iloc[i,3]
        tokens = word_tokenize(desc)
        words = [word for word in tokens if word.isalpha()]              #remove non letters
        words = [word for word in words if not word in stop_words]       #remove stopwords
        words = [word.lower() for word in words]                         #convert to lower case
        movies.iloc[i,3] = ' '.join(word for word in words)              #join as a single string
    except:
        movies.iloc[i,3] = ''                                            #if nan replace with empty string

In [6]:
movies.head()

Unnamed: 0,movieID,title,year,description,total_ratings,rating_count
0,114709.0,Toy Story,1995,a cowboy doll profoundly threatened jealous ne...,Animation,TomHanks TimAllen JimVarney DonRickles JohnLas...
1,113497.0,Jumanji,1995,when two kids find play magical board game rel...,Adventure,RobinWilliams KirstenDunst BonnieHunt Jonathan...
2,113228.0,Grumpier Old Men,1995,john max resolve save beloved bait shop turnin...,Comedy,SophiaLoren Ann-Margret JackLemmon WalterMatth...
3,114885.0,Waiting to Exhale,1995,based terry mcmillan novel film follows four d...,Comedy,AngelaBassett WhitneyHouston LelaRochon Lorett...
4,113041.0,Father of the Bride Part II,1995,george banks must deal pregnancy daughter also...,Comedy,SteveMartin DianeKeaton MartinShort KimberlyWi...


# Bag of words

In [7]:
for i in range(len(movies)):
    movies.iloc[i,1] = [str(movies.iloc[i,1]) + ' ' + str(movies.iloc[i,2]) + ' ' + str(movies.iloc[i,3]) + ' ' + str(movies.iloc[i,4])]
movies.drop(columns = ['year','description','total_ratings','rating_count'], inplace = True)
movies = movies.rename(columns = {'title':'bag_of_words'})
movies.head()

Unnamed: 0,movieID,bag_of_words
0,114709.0,Toy Story 1995 a cowboy doll profoundly threat...
1,113497.0,Jumanji 1995 when two kids find play magical b...
2,113228.0,Grumpier Old Men 1995 john max resolve save be...
3,114885.0,Waiting to Exhale 1995 based terry mcmillan no...
4,113041.0,Father of the Bride Part II 1995 george banks ...


# model 

In [8]:
import time
start = time.time()

count = CountVectorizer()
matrix = count.fit_transform(movies['bag_of_words'])   #generating the count matrix

cosine_sim = cosine_similarity(matrix, matrix)         #cosine similarity matrix
print(time.time() - start)

2.0399422645568848


In [11]:
#save the data as a dataframe 
sim = pd.DataFrame(data=cosine_sim[0:,0:],index = movies.movieID, columns=movies.movieID) #values, index, columns
sim['movieID'] = sim.index                        #add a column for movieID
sim.to_csv('sim.csv',index=False)               #save to file

In [12]:
sim.head()

movieID,114709.0,113497.0,113228.0,114885.0,113041.0,113277.0,114319.0,112302.0,114576.0,113189.0,...,7374948.0,7131622.0,6751668.0,6105098.0,6806448.0,2935510.0,7349950.0,7286456.0,9243946.0,movieID
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
114709.0,1.0,0.048912,0.095673,0.055641,0.051299,0.054074,0.054074,0.051299,0.051299,0.042601,...,0.0,0.0,0.0,0.051299,0.0,0.0,0.0,0.0,0.0,114709.0
113497.0,0.048912,1.0,0.044455,0.051709,0.047673,0.050252,0.050252,0.190693,0.143019,0.03959,...,0.040291,0.0,0.0,0.0,0.0,0.044455,0.047673,0.0,0.0,113497.0
113228.0,0.095673,0.044455,1.0,0.101144,0.09325,0.049147,0.098295,0.046625,0.046625,0.03872,...,0.039406,0.041703,0.050572,0.0,0.0,0.0,0.0,0.0,0.0,113228.0
114885.0,0.055641,0.051709,0.101144,1.0,0.108465,0.057166,0.114332,0.054233,0.054233,0.090075,...,0.045835,0.048507,0.058824,0.0,0.0,0.0,0.0,0.0,0.042875,114885.0
113041.0,0.051299,0.047673,0.09325,0.108465,1.0,0.052705,0.105409,0.05,0.05,0.041523,...,0.042258,0.044721,0.054233,0.1,0.040161,0.046625,0.0,0.0,0.0,113041.0


# Query dataframe (can be done on sql)                      

In [17]:
# sim = pd.read_csv('sim.csv', header = 0)   
inputid = 113041.0                                          #should be the value from the user's input
row_index = sim.index[sim['movieID'] == inputid][0]
arr = sim.iloc[row_index]
top_1000 = list((arr.sort_values(ascending = False)[2:1002]).index) #top 1000 similar movieids as a list

In [18]:
top_1000

['101862.0',
 '42451.0',
 '349205.0',
 '216787.0',
 '102465.0',
 '31885.0',
 '110216.0',
 '880502.0',
 '97390.0',
 '59821.0',
 '331933.0',
 '213790.0',
 '756729.0',
 '4270516.0',
 '361620.0',
 '111309.0',
 '95990.0',
 '95705.0',
 '191043.0',
 '85959.0',
 '292542.0',
 '50658.0',
 '118804.0',
 '107207.0',
 '112572.0',
 '112508.0',
 '300051.0',
 '120533.0',
 '189998.0',
 '1578275.0',
 '120686.0',
 '112579.0',
 '109045.0',
 '82517.0',
 '327162.0',
 '98503.0',
 '93857.0',
 '129280.0',
 '167260.0',
 '325980.0',
 '1058017.0',
 '365885.0',
 '98621.0',
 '1951261.0',
 '90985.0',
 '97108.0',
 '111752.0',
 '118836.0',
 '95497.0',
 '107225.0',
 '114887.0',
 '357110.0',
 '120737.0',
 '65466.0',
 '35211.0',
 '60522.0',
 '945356.0',
 '112499.0',
 '1756750.0',
 '107151.0',
 '91480.0',
 '4225622.0',
 '1450321.0',
 '65777.0',
 '97368.0',
 '97523.0',
 '139239.0',
 '70040.0',
 '458364.0',
 '413893.0',
 '365748.0',
 '4669788.0',
 '86425.0',
 '47898.0',
 '1219342.0',
 '114214.0',
 '100240.0',
 '96874.0',
 '9