In [1]:
import re
import os
import string
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
dataset_path = "./moviereviews/crime/tt0114369_reviews.csv"

reviews = pd.read_csv(dataset_path,encoding = 'utf-8')

reviews

Unnamed: 0,Reviews
0,"The movie, ""Se7en"", starring Brad Pitt, Morgan..."
1,It is a rarity for a film to be completely uns...
2,Seldom does a film elucidate the culpability o...
3,Seven's quality puts it so far beyond most of ...
4,"Gothic, shocking, suspenseful, disturbing and ..."
5,Rarely there has been a movie with such a good...
6,This movie is from start to finish a well prod...
7,Se7En Se7en is just one of those movies that b...
8,A stand out film dealing with a particularly u...
9,"David Fincher's bleak, relentless, and ultimat..."


In [3]:
from nltk.corpus import stopwords
from pywsd.utils import lemmatize
stop_words = set(stopwords.words('english'))
stop_words.add('didnt')
stop_words.add('movie')
stop_words.add('film')
stop_words.add('could')
stop_words.add('be')
stop_words.add('even')
stop_words.add('would')

def preprocess(text):
    new_text = re.sub('<.*?>', '', text)   # remove HTML tags
    new_text = re.sub(r'[^\w\s]', ' ', new_text) # remove punc.
    new_text = re.sub(r'\d+','',new_text) # remove numbers
    new_text = new_text.lower() # lower case, .upper() for upper
    return new_text

def tokenization_w(texts):
    tokenized_texts = []
    for text in texts:
        w_token = word_tokenize(text)
        filtered_sentence = [w for w in w_token]
        tokenized_texts.append(filtered_sentence)
    return tokenized_texts

snowball = SnowballStemmer(language = 'english')
def stemming(token_array):
    stems = []
    for tokens in token_array:
        stem_words = [snowball.stem(x) for x in tokens]
        stems.append(stem_words)
    return stems

def lemmatization(stem_array):
    lemmatized = []
    for stems in stem_array:
        lemmas = [lemmatize(x) for x in stems if not x in stop_words]
        lemmatized.append(lemmas)
    return lemmatized

Warming up PyWSD (takes ~10 secs)... took 7.897058010101318 secs.


In [4]:
reviews['Reviews'] = reviews['Reviews'].apply(preprocess)

tokens = tokenization_w(reviews['Reviews'])

#stemmed = stemming(tokens)

lemmatized_data = lemmatization(tokens)

processed_data = [' '.join(list) for list in lemmatized_data]

processed_data

['see star brad pitt morgan freeman gwyneth paltrow far one inventive well write cerebral film recent history blending well put together combination dark visual style intense plot development polished act remains tight focus throughout begin end never stray outwards unimportant issue resort typical hollywood clichés see uniquely suspense drama fuel need audience drawn entertain event unfold remain uncompromising shock thus satisfy initial vision director david fincher story surround hunt serial killer inspire dante alighieri seven deadly sin divine comedy set preach man impurity target victim torture pit underline sin see seemingly start typical cat mouse detective story however quickly develops sort modern myth good evil take centre stage story original count thrill level important aspect see however keep audience numerous step behind story oppose thriller become predictable bland end keep audience dark remains fresh original progress see dramatically turn tide one point audience fina

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(processed_data))

In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(bigram_mod[data_words[0]])

['see', 'star', 'brad_pitt', 'morgan_freeman', 'gwyneth_paltrow', 'far', 'one', 'inventive', 'well', 'write', 'cerebral', 'film', 'recent', 'history', 'blending', 'well', 'put', 'together', 'combination', 'dark', 'visual', 'style', 'intense', 'plot', 'development', 'polished', 'act', 'remains', 'tight', 'focus', 'throughout', 'begin', 'end', 'never', 'stray', 'outwards', 'unimportant', 'issue', 'resort', 'typical', 'hollywood', 'cliches', 'see', 'uniquely', 'suspense', 'drama', 'fuel', 'need', 'audience', 'drawn', 'entertain', 'event', 'unfold', 'remain', 'uncompromising', 'shock', 'thus', 'satisfy', 'initial', 'vision', 'director', 'david', 'fincher', 'story', 'surround', 'hunt', 'serial', 'killer', 'inspire', 'dante', 'alighieri', 'seven_deadly', 'sin', 'divine', 'comedy', 'set', 'preach', 'man', 'impurity', 'target', 'victim', 'torture', 'pit', 'underline', 'sin', 'see', 'seemingly', 'start', 'typical', 'cat', 'mouse', 'detective', 'story', 'however', 'quickly', 'develops', 'sort', 

In [7]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Form Bigrams
data_words_bigrams = make_trigrams(data_words)

data_words_bigrams

[['see',
  'star',
  'brad_pitt',
  'morgan_freeman',
  'gwyneth_paltrow',
  'far',
  'one',
  'inventive',
  'well',
  'write',
  'cerebral',
  'film',
  'recent',
  'history',
  'blending',
  'well',
  'put',
  'together',
  'combination',
  'dark',
  'visual',
  'style',
  'intense',
  'plot',
  'development',
  'polished',
  'act',
  'remains',
  'tight',
  'focus',
  'throughout',
  'begin',
  'end',
  'never',
  'stray',
  'outwards',
  'unimportant',
  'issue',
  'resort',
  'typical',
  'hollywood',
  'cliches',
  'see',
  'uniquely',
  'suspense',
  'drama',
  'fuel',
  'need',
  'audience',
  'drawn',
  'entertain',
  'event',
  'unfold',
  'remain',
  'uncompromising',
  'shock',
  'thus',
  'satisfy',
  'initial',
  'vision',
  'director',
  'david',
  'fincher',
  'story',
  'surround',
  'hunt',
  'serial',
  'killer',
  'inspire',
  'dante',
  'alighieri',
  'seven_deadly_sin',
  'divine',
  'comedy',
  'set',
  'preach',
  'man',
  'impurity',
  'target',
  'victim',
  

In [8]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=False)

In [9]:
pprint(lda_model.show_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"see" + 0.011*"make" + 0.010*"one" + 0.007*"seven" + 0.007*"killer" + '
  '0.007*"end" + 0.007*"film" + 0.007*"fincher" + 0.007*"really" + '
  '0.006*"mill"'),
 (1,
  '0.018*"see" + 0.016*"somerset" + 0.011*"one" + 0.009*"mill" + '
  '0.008*"fincher" + 0.008*"killer" + 0.007*"detective" + 0.007*"world" + '
  '0.007*"story" + 0.006*"take"'),
 (2,
  '0.013*"see" + 0.011*"murder" + 0.010*"character" + 0.010*"scene" + '
  '0.009*"make" + 0.009*"mill" + 0.008*"killer" + 0.008*"dark" + 0.007*"two" + '
  '0.007*"doe"'),
 (3,
  '0.014*"see" + 0.011*"one" + 0.008*"killer" + 0.008*"david" + 0.008*"movie" '
  '+ 0.006*"seven" + 0.005*"somerset" + 0.005*"mill" + 0.005*"end" + '
  '0.005*"say"'),
 (4,
  '0.019*"see" + 0.013*"character" + 0.011*"somerset" + 0.009*"one" + '
  '0.009*"mill" + 0.008*"doe" + 0.008*"make" + 0.008*"get" + 0.008*"pitt" + '
  '0.007*"say"'),
 (5,
  '0.016*"see" + 0.012*"john_doe" + 0.011*"mill" + 0.008*"somerset" + '
  '0.007*"one" + 0.007*"detective" + 0.006*

In [13]:
x=lda_model.show_topics(num_topics=8, num_words=10,formatted=False)
topics_words = [[wd[0] for wd in tp[1]] for tp in x]

topics = []
for words in topics_words:
    topics.append(words)
    
topics

[['see',
  'make',
  'one',
  'seven',
  'killer',
  'end',
  'film',
  'fincher',
  'really',
  'mill'],
 ['see',
  'somerset',
  'one',
  'mill',
  'fincher',
  'killer',
  'detective',
  'world',
  'story',
  'take'],
 ['see',
  'murder',
  'character',
  'scene',
  'make',
  'mill',
  'killer',
  'dark',
  'two',
  'doe'],
 ['see',
  'one',
  'killer',
  'david',
  'movie',
  'seven',
  'somerset',
  'mill',
  'end',
  'say'],
 ['see',
  'character',
  'somerset',
  'one',
  'mill',
  'doe',
  'make',
  'get',
  'pitt',
  'say'],
 ['see',
  'john_doe',
  'mill',
  'somerset',
  'one',
  'detective',
  'david',
  'fincher',
  'killer',
  'performance'],
 ['killer',
  'character',
  'one',
  'see',
  'make',
  'get',
  'end',
  'way',
  'like',
  'brad_pitt'],
 ['think',
  'dark',
  'like',
  'watch',
  'see',
  'end',
  'killer',
  'one',
  'make',
  'get']]

In [11]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.556154548666046


NameError: name 'data_lemmatized' is not defined

In [None]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip thi

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis