# IMDb Movies similarity from key words


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table

In [None]:
# Load the movies table from CSV. No index column is specified here, so the default index is used.
movies = pd.read_csv('../dataset/movie_info.csv')
# Disabled: splitting the pipe-delimited 'genre' and 'key words' columns into lists at load time.
#movies['genre'] = [genre.split("|") for genre in movies['genre']]
#movies['key words'] = [genre.split("|") for genre in movies['key words']]

In [None]:
# Preview the first 10 rows of the movies table.
movies.head(10)

For the time being, let's drop the plot column so that we can focus on calculating similarity that only uses keywords.



In [None]:
# Remove the 'plot' column from the table in place, then preview the first three rows.
movies.drop(columns=['plot'], inplace=True)
movies.head(3)

## Deep dive into keywords
The preprocessing steps for text are as follows:

1. Lowercase the words
2. Take .isalpha() words
3. Remove Stop Words
4. Lemmatize

In our case, we will lowercase the words although it's not really necessary since they look all lowercase. It will be done for certainty.
We will not take only alpha words because most of the keywords are compound words created with dashes ("-") and taking only alpha words would thus result in us discarding most of the words.
We will remove stop words for completeness and safety although these are keywords so none should be stopwords.
We will not lemmatize since doing so changes the meaning of certain keywords. For example, "woods", which indicates the forest, becomes "wood", the material. Or "avengers" becomes "avenger". In both cases, the first word has a meaning that is more than just the plural of the second word. So we will not take this step.

Therefore, in this case, we only lowercase the keywords and remove stop words; we skip the alpha-only filter and lemmatization.

In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary

In [None]:
# Display the raw 'key words' column (pipe-delimited keyword strings, as split on "|" in later cells).
movies['key words']

In [None]:
# Split each movie's pipe-delimited keyword string into a list of keywords.
keywords_list = [keywords.split("|") for keywords in movies['key words']]

# Flatten the per-movie lists into one list of keywords, preserving order.
# A single comprehension replaces the repeated `docs = docs + doc`, which
# copied the whole accumulated list on every iteration.
# NOTE(review): after flattening, each entry of `docs` is a single keyword
# rather than one movie's keywords -- confirm this is the intended "document".
docs = [keyword for keywords in keywords_list for keyword in keywords]
docs

In [None]:
# Create functions for making alpha, removing stop words, and lemmatizing
def make_alpha(doc):
    """Return the tokens of ``doc`` that are purely alphabetic.

    A token is kept only when its ``isalpha()`` method returns True; the
    original order of the tokens is preserved.
    """
    alphabetic_tokens = []
    for token in doc:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens
def remove_stops(doc):
    """Return the tokens of ``doc`` that are not English stop words.

    Args:
        doc: iterable of string tokens.

    Returns:
        A list of the tokens, in their original order, that do not appear
        in ``stopwords.words('english')``.
    """
    # Load the stop-word list once per call and use a set for membership
    # tests, instead of re-reading the list for every token.
    english_stops = set(stopwords.words('english'))
    return [t for t in doc if t not in english_stops]
def lemmatize(doc):
    """Return the WordNet lemma of every token in ``doc``.

    Args:
        doc: iterable of string tokens.

    Returns:
        A list with one lemmatized token per input token, in the same order.
    """
    # Imported locally so the function does not rely on WordNetLemmatizer
    # having been imported elsewhere; without it the call raises NameError.
    from nltk.stem import WordNetLemmatizer

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in doc]
def no_commas(doc):
    """Return the tokens of ``doc`` with every bare ``','`` token removed.

    Only tokens exactly equal to a single comma are dropped; all other
    tokens are kept in their original order.
    """
    kept_tokens = []
    for token in doc:
        if token == ',':
            continue
        kept_tokens.append(token)
    return kept_tokens

The code below creates the processed_docs list, which is what we use to find similarities.

In [None]:
import csv

# Lowercase every entry of `docs` and split it into tokens.
lowercase_docs = [word_tokenize(text.lower()) for text in docs]

# Save the token lists to disk, one CSV row per entry.
# NOTE(review): the files are opened without newline=''; the csv module
# documentation recommends newline='' to avoid platform-dependent blank rows.
with open('lowercase.csv', 'w') as csvFile:
    csv.writer(csvFile).writerows(lowercase_docs)

# Load the saved rows back, replacing the in-memory list with the file's contents.
with open('lowercase.csv', 'r') as f:
    lowercase_docs = list(csv.reader(f))

In [None]:
# Strip English stop words from every tokenized entry.
lowercase_and_no_stop_docs = list(map(remove_stops, lowercase_docs))

# Save the filtered token lists, one CSV row per entry.
with open('lowercase_and_no_stops.csv', 'w') as csvFile:
    csv.writer(csvFile).writerows(lowercase_and_no_stop_docs)

In [None]:
# Drop bare comma tokens; the result is the final preprocessed form, so both
# names are bound to the same list.
processed_docs = lowercase_nostops_nocommas_docs = [
    no_commas(tokens) for tokens in lowercase_and_no_stop_docs
]

# Save the fully processed token lists, one CSV row per entry.
with open('processed_docs.csv', 'w') as csvFile:
    csv.writer(csvFile).writerows(processed_docs)

The code below is all the relevant code for the model in one cell.

In [None]:
# Imports are repeated here so this cell can run on its own.
import csv

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from gensim.similarities import Similarity

# Load the preprocessed token lists saved by the preprocessing cells.
with open('processed_docs.csv', 'r') as f:
    # Keep only non-empty rows instead of taking every second row. Blank rows
    # are present only when the file was written with doubled line endings,
    # so slicing with [0::2] would throw away real documents whenever the
    # blank rows are absent or do not strictly alternate.
    processed_docs = [row for row in csv.reader(f) if row]

# Map every distinct token to an integer id.
dictionary = Dictionary(processed_docs)

# Bag-of-words representation: one list of (token id, count) pairs per document.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Fit a TF-IDF weighting on the bag-of-words corpus.
tfidf = TfidfModel(corpus)

# Build the similarity index over the TF-IDF vectors of every document in
# processed_docs; this is the structure that is queried for similarities.
sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

In [None]:
# Rebuild the token dictionary from the processed documents.
dictionary = Dictionary(processed_docs)

# Show the first ten (id, token) entries, or fewer if the dictionary is smaller.
for i in range(min(10, len(dictionary))):
    print(i, dictionary[i])

In [None]:
# Convert every processed document into its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
from collections import defaultdict
import itertools

# Sum, for every word ID, the number of times it occurs across the whole corpus.
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] = total_word_count[word_id] + word_count

# Order the (word ID, count) pairs from most to least frequent.
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Report the 20 most frequent words together with their counts.
for word_id, word_count in sorted_word_count[:20]:
    print(dictionary.get(word_id), word_count)

Clearly, there are a lot of Christmas-themed movies and relationships in these keywords.

##### 1st Model: Jaccard Similarity Based on Word Counts
Jaccard similarity: the size of the intersection of two sets divided by the size of their union.

The idea of this model:

\# of common keywords between two movies / # of unique keywords in the union of two movies’ keywords

Then we rank the movies by their similarities and the user can query the top K results for each movie.

In [None]:
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of two pipe-delimited keyword strings.

    Each string is split on ``'|'`` and treated as a set of keywords; the
    result is the number of shared keywords divided by the number of
    distinct keywords across both strings, as a float.
    """
    first_keywords = set(str1.split('|'))
    second_keywords = set(str2.split('|'))
    shared_count = len(first_keywords & second_keywords)
    distinct_count = len(first_keywords | second_keywords)
    return float(shared_count) / distinct_count

def keyword_string(movie):
    """Return the 'key words' value of the first row of ``movies`` titled ``movie``.

    Reads the module-level ``movies`` table. Indexing the first match with
    ``iloc[0]`` raises IndexError when no row has that title.
    """
    title_matches = movies.title == movie
    return movies.loc[title_matches, 'key words'].iloc[0]

def get_jaccard_sim2(movie1, movie2):
    """Return the Jaccard similarity between the keywords of two movie titles.

    Each title is resolved to its keyword string with ``keyword_string`` and
    the pair is scored with ``get_jaccard_sim``.
    """
    return get_jaccard_sim(keyword_string(movie1), keyword_string(movie2))

In [None]:
def jaccard_recommender(movie_title, K=5):
    """Print the K movies whose keywords are most Jaccard-similar to a title.

    Args:
        movie_title: title to look up in the module-level ``movies`` table;
            the first row with this title is used as the query.
        K: number of recommendations to print (default 5).

    Returns:
        None. Each match is printed as ``title score``; if the title is not
        in ``movies`` an apology message is printed instead.
    """
    title_mask = movies['title'] == movie_title
    if not title_mask.any():
        print("Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!")
        return

    # Work with row positions throughout so the lookup does not depend on
    # `movies` having a default 0..n-1 index.
    query_pos = int(title_mask.values.argmax())
    all_keywords = movies['key words']
    query_keywords = all_keywords.iloc[query_pos]

    jaccards = pd.Series(
        [get_jaccard_sim(query_keywords, other) for other in all_keywords]
    )

    # Remove the query movie explicitly rather than assuming it ranks first:
    # another movie with identical keywords also scores 1.0 and could come
    # ahead of it, which would make the query appear in its own results.
    top_matches = jaccards.drop(query_pos).nlargest(K)

    titles = movies['title']
    for pos, score in top_matches.items():
        print(titles.iloc[pos], score)

##### 2nd Model: Cosine Similarity Based on Word Counts
1. Use CountVectorizer to compute word counts for every movie's keywords (word vectors).

2. Use the scikit-learn library to compute the cosine similarity between any two word vectors.

Like the 1st model, then we rank the movies by their similarities and the user can query the top K results for each movie.

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    The strings are turned into count vectors by ``get_vectors1`` and the
    vectors are compared with ``cosine_similarity``.
    """
    count_vectors = list(get_vectors1(*strs))
    return cosine_similarity(count_vectors)

def get_vectors1(*strs):
    """Return the word-count vectors of the given strings as a dense array.

    Args:
        *strs: the text strings to vectorize.

    Returns:
        A 2-D array with one row per input string and one column per term
        of the vocabulary learned from those strings.
    """
    text = list(strs)
    # The texts are passed to fit_transform only. The first constructor
    # parameter of CountVectorizer is `input`, not the corpus, so the
    # original `CountVectorizer(text)` set an invalid option.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

def get_vectors2(text):
    """Return the word-count vectors of a list of strings as a dense array.

    Args:
        text: iterable of text strings to vectorize.

    Returns:
        A 2-D array with one row per input string and one column per term
        of the vocabulary learned from those strings.
    """
    # The texts are passed to fit_transform only. The first constructor
    # parameter of CountVectorizer is `input`, not the corpus, so the
    # original `CountVectorizer(text)` set an invalid option.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    return X.toarray()

In [1]:
# Build the keyword count vectors for every movie; cosine_recommender reads this module-level `vectors`.
vectors = get_vectors2(movies['key words'].tolist())

NameError: name 'get_vectors2' is not defined

In [None]:
def cosine_recommender(movie_title, K=5):
    """Print the K movies whose keyword count vectors are most cosine-similar to a title.

    Args:
        movie_title: title to look up in the module-level ``movies`` table;
            the first row with this title is used as the query.
        K: number of recommendations to print (default 5).

    Returns:
        None. Each match is printed as ``title score``; if the title is not
        in ``movies`` an apology message is printed instead.
    """
    title_mask = movies['title'] == movie_title
    if not title_mask.any():
        print("Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!")
        return

    # Rows of the module-level `vectors` are addressed by position, so use
    # the row position of the match rather than its index label.
    query_pos = int(title_mask.values.argmax())

    # Compare the query row against every row in a single call instead of
    # one cosine_similarity call per movie.
    query_vector = vectors[query_pos:query_pos + 1]
    cosines = pd.Series(cosine_similarity(query_vector, vectors)[0])

    # Remove the query movie explicitly rather than assuming it ranks first;
    # ties (or an all-zero query vector) can put another movie ahead of it.
    top_matches = cosines.drop(query_pos).nlargest(K)

    titles = movies['title']
    for pos, score in top_matches.items():
        print(titles.iloc[pos], score)

Quick comparison for a movie based on different models

Let's use 'Mean Girls 2' as an example:

In [None]:
# Ask for a title and a result count, then print the Jaccard-based recommendations.
movie_title = input("which movie you want to search? ")
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

In [None]:
# Ask for a title and a result count, then print the cosine-based recommendations.
# The preceding cell already queries the Jaccard model; this one queries the
# second model so the two can actually be compared.
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

cosine_recommender(movie_title, K)

In [None]:
Looks good. They all recommend similar appropriate movies for the same movie, with slight differences in recommendation.

What if we input some movie that doesn't exist in the dataset? say input "I am not a movie"



In [None]:
# Ask for a title and a result count, then print the Jaccard-based recommendations.
movie_title = input("which movie you want to search? ")
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

In [None]:
# Ask for a title and a result count, then query the cosine-based model.
# The preceding cell already exercises the Jaccard model; this one checks how
# the second model responds to the same kind of input.
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

cosine_recommender(movie_title, K)