# 02 - NLP Modeling

## 1. Import Packages <a name="import"></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import nltk
# nltk.download()

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import textstat

In [3]:
import pickle
import json, csv
import os

import re
import string

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

## Table of Contents <a name="table"></a>
1. [Import Packages](#import)
2. [Load Data](#load)
3. [NLP Modeling](#model)
4. [Text Metrics](#metrics)
5. [Save Data](#save)

## 2. Load Data <a name="load"></a>

In [6]:
movie_details = pd.read_pickle('./data/movie_details.pkl.gz', compression = 'gzip')

movie_details.head()

Unnamed: 0,movie_id,plot_summary,genre,plot_synopsis,title
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...","[Action, Thriller]","Jack Ryan (Ford) is on a ""working vacation"" in...",Patriot Games
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",[Comedy],Four boys around the age of 10 are friends in ...,Last Vegas
2,tt0126886,Tracy Flick is running unopposed for this year...,"[Comedy, Drama, Romance]",Jim McAllister (Matthew Broderick) is a much-a...,Election
3,tt0286716,"Bruce Banner, a brilliant scientist with a clo...","[Action, Sci-Fi]",Bruce Banner (Eric Bana) is a research scienti...,Hulk
4,tt0090605,57 years after Ellen Ripley had a close encoun...,"[Action, Adventure, Sci-Fi]","After the opening credits, we see a spacecraft...",Aliens


In [7]:
movie_reviews = pd.read_pickle('./data/movie_reviews.pkl.gz', compression = 'gzip')

movie_reviews.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


Return to [Table of Contents](#table)

## 3. NLP Modeling <a name="model"></a>

We perform topic modeling to help us identify similarities a movie's reviews and plot synopsis. <br>
We can regard the plot synopsis as the ultimate spoiler. <br>
We first defined some functions and helper functions to aid us in the modeling. 

In [8]:
#initialize lemmatizer here to avoid re-initialization later
lemmatizer = WordNetLemmatizer()

def text_preprocessing(text):
    """
        Helper function for preprocessing text for vectorization
    
        :param text: raw text as string
        :returns text: preprocessed text as a string
    """
    #match only letters
    text = re.sub('\w*\d\w*', ' ', text) 
    
    #lowercase all text and remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text.lower()) 
    
    #lemmatize text and recombined into one string
    word_list = nltk.word_tokenize(text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_list]) 
    
    return text

In [9]:
def generate_paragraphs(synopsis):
    """
        The movie synopses come as one string when we need paragraphs.
        This is a helper function to turn a plot synopsis into a paragraph.
        
        :param synopsis: plot synopsis as one string
        
        :returns p_list: a list of paragraph strings from the synopsis
    """

    #turn synopsis into a list of sentence tokens
    synopsis = synopsis.split('.')

    p_list = []

    #specify the number of sentences per paragraph
    p_length = 5

    #lower and upper indices for slicing the list of sentence tokens
    lower = 0
    upper = p_length

    while lower <= len(synopsis):
        #slice the list of sentence tokens from 
        #lower inclusive to upper exclusive
        sentences = synopsis[lower:upper]

        #combine sentences into a single string
        paragraph = '. '.join(sentences)

        p_list.append(paragraph)

        #increment lower and upper indices for next slice 
        lower += p_length
        upper += p_length
    
    return p_list

In [22]:
def vectorize_movie_items(movie_details, movie_reviews, movie_id):
    """
        Function for producing the necessary vector representations
        of the reviews and the plot synopsis for a given movie_id
        
        :param movie_id: string representing which movie to use
        :param movie_details: dataframe containing the plot synopsis of the movie
        :param movie_reviews: dataframe containing the reviews for the movie
        
        :returns vectorizer: vectorizer used to vectorize the text
        :returns synopsis_p_vector: a word matrix representation of self-designated 
                                  paragraphs in the synopsis
        :returns synopsis_vector: a word vector representation of the synopsis
        :returns review_word_matrix: a word matrix representation of the reviews
    """
    
    #preprocess the synopsis
    synopsis = movie_details.loc[movie_details['movie_id'] == movie_id, 'plot_synopsis'].values[0]

    #turn the synopsis from one string into a list of paragraph strings
    synopsis_p = generate_paragraphs(synopsis)

    synopsis_p = [text_preprocessing(paragraph) for paragraph in synopsis_p]

    #preprocess the reviews
    reviews = movie_reviews.loc[movie_reviews['movie_id'] == movie_id, 'review_text'].values
    reviews = [text_preprocessing(review) for review in reviews]
    
    
    #import stopwords and add custom stop words like the title
    #'spoiler' was included as a stopword to make the task of 
    #spoiler detection fairer and more challenging

    stop_words = nltk.corpus.stopwords.words('english')

    new_stop_words = ['actor', 'actress', 'director', 'film', 
                      'movie', 'rating', 'spoiler', 'book']
    
    title = movie_details.loc[movie_details['movie_id'] == movie_id, 'title'].values[0]

    title_words = text_preprocessing(title).split(' ')

    stop_words.extend(new_stop_words)

    
    #vectorize the text
    #CountVectorizer performed reasonably well,
    #perhaps due to most plot points appearing with some frequency
    vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words = stop_words)

    
    #we fit the vectorizer on the synopsis paragraphs
    #however, we also transform the original synopsis 
    #we want to compare each review to the synopsis
    #a review that is very similar to a synopsis is most likely a spoiler
    synopsis_p_vector = vectorizer.fit_transform(synopsis_p)

    synopsis_vector = vectorizer.transform([text_preprocessing(synopsis)])

    review_word_matrix = vectorizer.transform(reviews)
    
    return vectorizer, synopsis_p_vector, synopsis_vector, review_word_matrix

In [11]:
def display_topics(model, ngrams, num_words, topic_names=None):
    """
        Helper function for displaying what words are associated with each topic
        
        :param model: the model used for topic modeling
        :param ngrams: the ngrams produced by the vectorizer used for topic modeling
        :param num_words: number of words for each topic
        :param topic names: a list of topic names as strings, optional
    """
    
    topics = model.components_
    
    for idx, topic in enumerate(topics):
        #if there are no topic names, print numerics for topic names
        if not topic_names:
            print("\nTopic ", idx)
        else:
            print("\nTopic: '",topic_names[idx],"'")
        
        #find the indices of the words that define the topic the best
        #the indices are already sorted in terms of contribution
        sorted_top_indices = topic.argsort()[:-num_words - 1:-1]
        
        print(", ".join([ngrams[i] for i in sorted_top_indices]))

In [12]:
def topic_modeling(synopsis_p_vector, synopsis_vector, review_word_matrix, 
                   vectorizer, model_name, num_components = 15):
    """
        Function for performing topic modeling and dimensionality reducing 
        
        :param synopsis_p_vector: a word matrix representation of self-designated 
                                paragraphs in the synopsis
        :param synopsis_vector: a word vector representation of the synopsis
        :param review_word_matrix: a word matrix representation of the reviews
        :param vectorizer: the vectorizer used before topic modeling
        :param model_name: string name of the model, options limited to 'lda', 'lsa', 'nmf'
        :param num_components: number of components to use in modeling

        :returns synopsis_vector: a topic representation of synopis_vector
        :returns review_word_matrix: a topic representation of review_word_matrix
    
    """

    #however, we cannot expect a review with spoiler
    #to contains most of the words and phrases from the synopsis
    #we need to perform some dimensionality reduction via topic modeling
    #this will transform our words into topics
    if model_name == 'lda':        
        model = LatentDirichletAllocation(n_components=num_components, 
                                          random_state=42, learning_method = 'batch')        
    elif model_name == 'lsa':
        model = TruncatedSVD(num_components, random_state = 42)
    elif model_name == 'nmf':
        model = NMF(num_components)

    #we fit our model on synopsis paragraphs
    #this allows to find common topics among the paragraphs
    #the assumption is topics will be related to plot points
    model.fit(synopsis_p_vector)

    #however, we only transform synopsis vector because 
    #we want to compare the whole synopsis to each review, 
    #not each paragraph of the synopsis to each review
    synopsis_vector = model.transform(synopsis_vector)

    #we apply the same tranformation from words to topics on the reviews
    review_topic_matrix = model.transform(review_word_matrix)

    #uncomment the line below if you want to see what topics the model produced
    # display_topics(model, vectorizer.get_feature_names(), 10)
    
    return synopsis_vector, review_topic_matrix

The function `spoiler_similarity` works like a naive implementation of a recommendation engine. <br>
Recommendation engines built on cosine similarity have high bias and low variance. <br>
This is because they recommend things solely based on feature similarity, placing all importance on features. <br>
Recommendation engines built on clustering algorithms have low bias and high variance. <br>
This is because clustering is usually not-deterministic. 

In [13]:
def spoiler_similarity(synopsis_vector, review_topic_matrix):
    """
        Helper function for finding which reviews are most similar to the synopsis
        
        :param synopsis_vector: a topic vector representation of the synopsis as a list
        :param review_topic_matrix: a review topic matrix produced from topic modeling
        
        :returns similarities: a sorted list of the reviews and 
                               how similar they are to the synopsis 
    """
    
    #reshape synopsis_vector for cosine similarity
    synopsis_vector = synopsis_vector.reshape(1, -1)
    
    similarities = []
    
    for idx, review_topic_vector in enumerate(review_topic_matrix):
        
        #reshape review_topic_vector for cosine similarity
        review_topic_vector = review_topic_vector.reshape(1, -1)
        
        #find how similar the two vectors are
        similarity = cosine_similarity(synopsis_vector, review_topic_vector)[0][0]
        
        #append similarity score to similarities
        similarities.append(similarity)
        
    return similarities

We loop through each movie ID and find how similar each review is to the plot synopsis. <br>
The assumption is the more similar a review is to the plot synopsis, the more likely it is to be a spoiler. <br>
We use cosine similarity to determine whether a review is similar to the plot synopsis. <br>
A cosine similarity score of 0 indicates the review is not similar to the synopsis. <br>
Meanwhile, a cosine similarity score of 1 indicates it is identical to the synopsis. <br>
We generate cosine simimilarity scores via three different topic modeling approaches - LSA, LDA, and NMF. <br>

In [23]:
#find the similarity scores for each review using LSA and LDA
#tqdm is to keep track of modeling progress

model_names = ['lsa', 'lda', 'nmf']

for movie_id in movie_details['movie_id'].values:
    
    movie_id_mask = movie_reviews['movie_id'] == movie_id
    
    #produce the necessary word vector representations of the reviews and synopsis
    vectorizer, synopsis_p_vector, synopsis_vector, review_word_matrix = \
        vectorize_movie_items(movie_details, movie_reviews, movie_id)

    for model_name in model_names:        
        try:
            #produce the necessary topic representations of the reviews and synopsis
            synopsis_topic_vector, review_topic_matrix = topic_modeling(synopsis_p_vector, synopsis_vector, 
                                                                        review_word_matrix, vectorizer, model_name) 

            #assign the similarity scores from the model to movie_reviews for the current movie_id
            movie_reviews.loc[movie_id_mask, 'similarities_' + model_name] = spoiler_similarity(synopsis_topic_vector, 
                                                                                                review_topic_matrix)

        except ValueError:
            movie_reviews.loc[movie_id_mask, 'similarities_' + model_name] = np.nan

In [24]:
movie_reviews.drop(columns = ['review_date', 'user_id', 'is_spoiler'])

Unnamed: 0,movie_id,review_text,rating,review_summary,similarities_lsa,similarities_lda,similarities_nmf
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,0.961052,0.875862,0.899648
1,tt0111161,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,0.966842,0.806110,0.925397
2,tt0111161,I believe that this film is the best story eve...,8,The best story ever told on film,0.924182,0.783039,0.789735
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,0.909945,0.881686,0.784359
4,tt0111161,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",0.977627,0.915197,0.942112
...,...,...,...,...,...,...,...
573908,tt0139239,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties,0.641747,0.414961,0.584577
573909,tt0139239,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie,0.684094,0.734287,0.639079
573910,tt0139239,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen,0.839586,0.747773,0.769812
573911,tt0139239,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?,0.739808,0.775189,0.704071


Return to [Table of Contents](#table)

## 4. Text Metrics <a name="metrics"></a>

To further supplement our data, we will include metrics about the text like how difficult the words are and the sentiment. 

In [25]:
analyzer = SentimentIntensityAnalyzer()

movie_reviews['flesch_reading_ease_score'] = \
    movie_reviews['review_text'].apply(lambda review: round(textstat.flesch_reading_ease(review)))

#how difficult the words used in the review are
movie_reviews['difficult_words_score'] = \
    movie_reviews['review_text'].apply(lambda review: round(textstat.difficult_words(review)))

#how difficult the words used in the review are based on syllables
movie_reviews['linsear_write_score'] = \
    movie_reviews['review_text'].apply(lambda review: round(textstat.linsear_write_formula(review)))

#find the compound score, which is a measurement of how polar the sentiment in
movie_reviews['compound_score'] = \
    movie_reviews['review_summary'].apply(lambda summary: analyzer.polarity_scores(summary)['compound'])

In [26]:
movie_reviews = movie_reviews.drop(columns = ['review_date', 'user_id', 'is_spoiler'])

movie_reviews.head()

Unnamed: 0,movie_id,review_text,rating,review_summary,similarities_lsa,similarities_lda,similarities_nmf,flesch_reading_ease_score,difficult_words_score,linsear_write_score,compound_score
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,0.961052,0.875862,0.899648,70,120,16,0.0
1,tt0111161,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,0.966842,0.80611,0.925397,69,35,18,0.8402
2,tt0111161,I believe that this film is the best story eve...,8,The best story ever told on film,0.924182,0.783039,0.789735,70,45,14,0.6369
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,0.909945,0.881686,0.784359,-6,76,15,0.0
4,tt0111161,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",0.977627,0.915197,0.942112,63,133,15,0.6249


Return to [Table of Contents](#table)

## 5. Save Data <a name="save"></a>

We save the data for later. <br>
It takes about an hour to get to this point.

In [27]:
file_dir = os.path.abspath('.')
data_folder = 'data'
path = os.path.join(file_dir, data_folder, 'processed_movie_reviews.pkl.gz')

movie_reviews.to_pickle(path, compression = 'gzip')

Return to [Table of Contents](#table)