### Movie Recommender based on Plot summary & Word2Vec

Download "ready-to-use" Word2Vec from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
Unzip the file and put the "bin" file in the project directory

In [11]:
from gensim.models import KeyedVectors

# Path to the downloaded model
model_path = 'GoogleNews-vectors-negative300.bin'

# Load the model
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

# Example usage
vector = word2vec['example']
print(vector)

[ 2.05078125e-01  7.85827637e-04  3.54003906e-02  1.00585938e-01
 -5.44433594e-02  1.53320312e-01  2.55859375e-01 -2.18750000e-01
 -3.31115723e-03  2.09960938e-01 -2.07031250e-01  1.77001953e-02
  4.29687500e-02 -2.01171875e-01 -1.57226562e-01  1.88476562e-01
 -3.73535156e-02  2.36816406e-02 -2.63671875e-01 -1.33789062e-01
  2.23632812e-01  2.05078125e-01 -5.83496094e-02 -3.11279297e-02
  4.92095947e-04  2.36328125e-01  1.16699219e-01  4.24804688e-02
 -1.33789062e-01  1.84570312e-01  5.02929688e-02 -6.00585938e-02
 -6.22558594e-02  7.61718750e-02  1.48437500e-01  6.10351562e-02
  6.39648438e-02 -2.73437500e-01  1.48437500e-01  8.15429688e-02
  1.57226562e-01 -2.63671875e-02 -1.10839844e-01  3.24707031e-02
 -6.93359375e-02 -3.29589844e-02 -1.34765625e-01  4.32128906e-02
 -1.42578125e-01 -2.50000000e-01  9.86328125e-02 -1.10839844e-01
 -6.98242188e-02 -2.46093750e-01  1.65039062e-01 -9.81445312e-02
 -1.71875000e-01 -1.20117188e-01  1.21582031e-01  1.50390625e-01
  4.15039062e-02  2.16064

In [14]:
import pandas as pd

df = pd.read_csv("data/tmdb_5000_movies.csv")

This script creates a movie recommendation system using pre-trained Word2Vec embeddings and cosine similarity. It preprocesses movie descriptions by removing non-alphabetic characters, tokenizing, removing stopwords, and applying stemming. Word embeddings for descriptions are averaged to generate sentence embeddings. The system computes similarities between movies based on their embeddings to recommend movies similar to a given title.

In [15]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk


nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

model_path = "GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)


def preprocess(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower()) # only keep words
    tokens = word_tokenize(text) # tokenize text (convert to words) using nltk tokenizer
    stop_words = set(stopwords.words('english')) #remove stopwords using nltk
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word in word2vec] #Lemmatization: finding root of the words
    return tokens

# Generate Embeddings for Movie Descriptions
def get_sentence_embedding(tokens):
    if tokens: #average the word embedding for each words in the description of each movie
        return np.mean([word2vec[word] for word in tokens if word in word2vec], axis=0)
    else:
        return np.zeros(word2vec.vector_size)

# Add embeddings to the DataFrame
df.dropna(subset=["overview"], inplace=True)
df["tokens"] = df["overview"].apply(preprocess)
df["embedding"] = df["tokens"].apply(get_sentence_embedding)

# Recommendation System using cosine similarity
def recommend_movies(movie_title, df, top_n=5):
    movie_row = df[df["title"].str.contains(movie_title, case=False, na=False)]
    if movie_row.empty:
        return f"Movie '{movie_title}' not found in dataset."
    
    movie_embedding = movie_row.iloc[0]["embedding"]

    similarities = []
    for index, row in df.iterrows():
        if row["title"] != movie_title:
            sim = cosine_similarity([movie_embedding], [row["embedding"]])[0][0]
            similarities.append((row["title"], sim))

    recommendations = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]
    return [rec[0] for rec in recommendations]

movie_title = "The Godfather"
recommended_movies = recommend_movies(movie_title, df)
print(f"Movies similar to '{movie_title}': {recommended_movies}")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Movies similar to 'The Godfather': ['The Godfather: Part III', 'Captain America: The Winter Soldier', 'The Boondock Saints', 'Ghost', 'The Godfather: Part II']


Cool? The only problem is that "Averaging word embeddings" loses word order and context, which might reduce the quality of recommendations