## Packages

In [95]:
import numpy as np
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk import stem
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
import re
import os
import codecs
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from __future__ import print_function
from time import time
%matplotlib inline

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Define Parameters

In [96]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [97]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

## Importaing Data

In [98]:
#https://www.empireonline.com/movies/features/best-movies/
empire100 = pd.read_excel('C:/Users/krist/OneDrive/Documents/Python/film/empirefilms.xlsx')
empire100.head()



Unnamed: 0,Rank,Name,Year,Description
0,1,The Godfather,1972,"Well, if Stanley Kubrick described it as ""poss..."
1,2,Star Wars: Episode V — The Empire Strikes Back,1980,"The original “this one’s darker” sequel, and b..."
2,3,The Dark Knight,2008,Easily as influential on the genre as that oth...
3,4,The Shawshank Redemption,1994,"The warm, leathery embrace of Morgan Freeman’s..."
4,5,Pulp Fiction,1994,If Reservoir Dogs was a blood-spattered callin...


In [99]:
synopses = empire100['Description'].tolist()

In [100]:
#source for Processing Raw Text: http://www.nltk.org/book/ch03.html
lemma = nltk.wordnet.WordNetLemmatizer()
stemmer = stem.snowball.EnglishStemmer()
stopwords = nltk.corpus.stopwords.words('english')
standard_words = [w for w in nltk.corpus.words.words('en') if w.islower()]

def tokenize_and_stem(text):
    final_list = []
    for i in range(len(text)):
        filtered_tokens = []
        item=word_tokenize(text[i])
        for word in item:
             if re.search('^[a-zA-Z]+$', word):
                filtered_tokens.append(word.lower())
        final_list.append([stemmer.stem(word) for word in filtered_tokens if word not in stopwords and word in standard_words])
    return final_list

def tokenize_and_lem(text):
    final_list = []
    for i in range(len(text)):
        filtered_tokens = []
        item=word_tokenize(text[i])
        for word in item:
             if re.search('^[a-zA-Z]+$', word):
                filtered_tokens.append(word.lower())
        final_list.append([lemma.lemmatize(word) for word in filtered_tokens if word not in stopwords and word in standard_words])
    return final_list

        

In [101]:
synopses_lemmed=tokenize_and_lem(synopses)
synopses_lemma_string=[]
for doc in synopses_lemmed:
    synopses_lemma_string.append(" ".join(str(x) for x in doc))

print(type(synopses_lemma_string))
print(len(synopses_lemma_string))

<class 'list'>
100


In [102]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(synopses_lemma_string)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.010s.


In [103]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(synopses_lemma_string)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.006s.



In [104]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=100...
done in 0.046s.


In [105]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: right feel film time like make truly good great course performance story little cinema new human man adventure hand ultimate
Topic #1: big way action yes new great came make like white bad feature instead thanks movie inside half narrative kind release
Topic #2: got drama better hard dark pretty original hand look director violence fantastic great perfectly course like people way world inside
Topic #3: world war movie shoot epic got king old scene set make adventure best adaptation half henry sense western bad narrative
Topic #4: movie turned best release horror good marvel pretty sense guy jack director despite form story performance better time adaptation love
Topic #5: scene time work thriller thanks crime long twist tale remains white case way seven far guy best western action instead
Topic #6: battle power cinema great future peter writer set action seven narrative fantastic day twist man case director got alien adventure



In [106]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=100...
done in 0.050s.


In [107]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: movie good film right make story like time sense adventure bad adaptation old feel cinema man ultimate deeply perfectly day
Topic #1: way big action new like great yes win human truly feel horror came course kind inside instead power director make
Topic #2: drama got great fantastic little battle violence peter day cinema like le film writer man henry look inside set twist
Topic #3: world movie release really set sense turned star feature narrative making war make half magic kelly king know western shoot
Topic #4: movie director pretty better turned dark despite marvel performance novel best love feature jack guy came look perfectly form violence
Topic #5: remains thriller thanks tale original crime work better truly different white scene modern epic long twist best course alien form
Topic #6: time war future movie great power scene case seven narrative battle best kind shoot cinema horror writer thriller crime h

In [108]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting LDA models with tf features, n_samples=2000 and n_features=100...


In [109]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [110]:
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 0.160s.


In [111]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: like crime ultimate adventure really drama day power better best course perfectly love movie great action director world look marvel
Topic #1: movie make great director release far film big case sense turned new white man despite story feature seven love writer
Topic #2: film western man set little star like right peter making form epic world feature crime human came work thriller really
Topic #3: movie time world war action epic good make cinema big shoot future like best novel course human adaptation old battle
Topic #4: way hard twist cinema thriller half action better deeply horror director case alien movie performance turned remains big pretty drama
Topic #5: film drama got right world story movie le central adaptation better war great truly best guy know original people violence
Topic #6: thanks narrative win work alien truly scene tale world henry way day long feel power remains time modern crime turned



In [112]:
print('Perplexity values reported during training:')
lda.fit(tf)

print('Perplexity using normalized doc-topic matrix returned by lda.transform():')
print(lda.perplexity(tf))  # calls lda.transform(tf), which normalizes doc-topic-matrix

print('Perplexity using unnormalized doc-topic matrix returned by lda._e_step(), as used in lda.fit():')
print(lda.perplexity(tf, lda._e_step(tf,False,False)[0]))

Perplexity values reported during training:
Perplexity using normalized doc-topic matrix returned by lda.transform():
405.66269449059587
Perplexity using unnormalized doc-topic matrix returned by lda._e_step(), as used in lda.fit():
405.66269449059587


  if doc_topic_distr != 'deprecated':
