## Movie titles from IMDB and plot summary from Wiki
https://www.imdb.com/list/ls055592025/

## Load Text

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import warnings

warnings.filterwarnings('ignore')

In [2]:
titles = open('title_list.txt').read().split('\n')
titles = titles[:100]

summary_wiki = open('summary_list_wiki.txt').read().split('\n BREAKS HERE')
summary_wiki = summary_wiki[:100]

summary_imdb = open('summary_list_imdb.txt').read().split('\n BREAKS HERE')
summary_imdb = summary_imdb[:100]

#Combine imdb and wiki to get full synoposes for the top 100 movies. 
summary = []
for i in range(len(summary_wiki)):
    item = summary_wiki[i] + summary_imdb[i]
    summary.append(item)
    
ranks = range(len(titles))

## Tokenizing and Stemming

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenization_and_stemming(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word not in stopwords]

    filtered_tokens = []
    
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenization(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word not in stopwords]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

Use our defined functions to analyze (i.e. tokenize, stem) our synoposes.

In [5]:
docs_stemmed = []
docs_tokenized = []
for i in summary:
    tokenized_and_stemmed_results = tokenization_and_stemming(i)
    docs_stemmed.extend(tokenized_and_stemmed_results)
    
    tokenized_results = tokenization(i)
    docs_tokenized.extend(tokenized_results)

In [6]:
# Create a mapping from stemmed words to original tokenized words for result interpretation.
vocab_frame_dict = {docs_stemmed[x]:docs_tokenized[x] for x in range(len(docs_stemmed))}
print (vocab_frame_dict['angel'])

angeles


Create a mapping from stemmed words to original tokenized words for result interpretation.

## TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_model = TfidfVectorizer(max_df=0.8, max_features=2000,
                                 min_df=0, stop_words='english',
                                 use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,3))

tfidf_matrix = tfidf_model.fit_transform(summary)

In [8]:
tf_selected_words = tfidf_model.get_feature_names()

## Document Similarity

In [9]:
# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
cos_matrix = cosine_similarity(tfidf_matrix)
print (cos_matrix)

[[1.         0.01597512 0.0195209  ... 0.02250815 0.02263374 0.04419743]
 [0.01597512 1.         0.03149342 ... 0.01242353 0.01330407 0.01937118]
 [0.0195209  0.03149342 1.         ... 0.01612811 0.0124787  0.04164716]
 ...
 [0.02250815 0.01242353 0.01612811 ... 1.         0.03166906 0.04620866]
 [0.02263374 0.01330407 0.0124787  ... 0.03166906 1.         0.01976279]
 [0.04419743 0.01937118 0.04164716 ... 0.04620866 0.01976279 1.        ]]


## K-means clustering

In [10]:
from sklearn.cluster import KMeans

num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [11]:
films = { 'title': titles, 'rank': ranks, 'summary': summary, 'cluster': clusters}
frame = pd.DataFrame(films, index = [clusters] , columns = ['rank', 'title', 'cluster'])
frame.head(10)

Unnamed: 0,rank,title,cluster
4,0,The Godfather,4
1,1,The Shawshank Redemption,1
1,2,Schindler's List,1
2,3,Raging Bull,2
0,4,Casablanca,0
1,5,One Flew Over the Cuckoo's Nest,1
1,6,Gone with the Wind,1
2,7,Citizen Kane,2
4,8,The Wizard of Oz,4
2,9,Titanic,2


In [12]:
print ("Number of films included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of films included in each cluster:


Unnamed: 0,cluster
1,51
4,25
2,18
3,5
0,1


In [13]:
print ("<Document Clustering Result by K-means>")

order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

Cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    Cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        Cluster_keywords_summary[i].append(vocab_frame_dict[tf_selected_words[ind]])
        print (vocab_frame_dict[tf_selected_words[ind]] + ",", end='')
    print ()
    
    cluster_movies = frame.ix[i]['title'].values.tolist()
    print ("Cluster " + str(i) + " titles (" + str(len(cluster_movies)) + " movies): ")
    print (", ".join(cluster_movies))
    print ()

<Document Clustering Result by K-means>
Cluster 0 words:rick,laszlo,renault,ilsa,letters,german,
Cluster 0 titles (1 movies): 
Casablanca

Cluster 1 words:paul,killing,butch,german,soldiers,alex,
Cluster 1 titles (51 movies): 
The Shawshank Redemption, Schindler's List, One Flew Over the Cuckoo's Nest, Gone with the Wind, Lawrence of Arabia, Psycho, On the Waterfront, Star Wars, E.T. the Extra-Terrestrial, 2001: A Space Odyssey, The Silence of the Lambs, The Bridge on the River Kwai, Apocalypse Now, The Lord of the Rings: The Return of the King, Gladiator, From Here to Eternity, Saving Private Ryan, Unforgiven, Raiders of the Lost Ark, Rocky, A Streetcar Named Desire, Ben-Hur, Doctor Zhivago, Patton, Jaws, Braveheart, Butch Cassidy and the Sundance Kid, Platoon, Dances with Wolves, The Pianist, Goodfellas, The Deer Hunter, All Quiet on the Western Front, The French Connection, The King's Speech, Mr. Smith Goes to Washington, Fargo, The Grapes of Wrath, Shane, The Green Mile, Close Enco

## Topic Modeling - Latent Dirichlet Allocation

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5, learning_method = 'online')

In [15]:
tfidf_matrix_lda = (tfidf_matrix * 100)
tfidf_matrix_lda = tfidf_matrix_lda.astype(int)

In [16]:
lda.fit(tfidf_matrix_lda)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=5, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [17]:
topic_word = lda.components_
print(topic_word.shape)

(5, 2000)


In [18]:
n_top_words = 7
topic_keywords_list = []
for i, topic_dist in enumerate(topic_word):
    lda_topic_words = np.array(tf_selected_words)[np.argsort(topic_dist)][:-n_top_words:-1] 
    for j in range(len(lda_topic_words)):
        lda_topic_words[j] = vocab_frame_dict[lda_topic_words[j]]
    topic_keywords_list.append(lda_topic_words.tolist())

In [19]:
doc_topic = lda.transform(tfidf_matrix_lda)
print (doc_topic.shape)

(100, 5)


In [20]:
topic_doc_dict = {}
print ("<Document Clustering Result by Latent Dirichlet Allocation>")
for i in range(len(doc_topic)):
    topicID = doc_topic[i].argmax()
    if topicID not in topic_doc_dict:
        topic_doc_dict[topicID] = [titles[i]]
    else:
        topic_doc_dict[topicID].append(titles[i])
for i in topic_doc_dict:
    print ("Cluster " + str(i) + " words: " + ", ".join(topic_keywords_list[i]))
    print ("Cluster " + str(i) + " titles (" + str(len(topic_doc_dict[i])) + " movies): ")
    print (', '.join(topic_doc_dict[i]))
    print ()

<Document Clustering Result by Latent Dirichlet Allocation>
Cluster 2 words: terry, michael, karen, henry, lawrence, vito
Cluster 2 titles (20 movies): 
The Godfather, Lawrence of Arabia, The Godfather: Part II, Vertigo, On the Waterfront, The Silence of the Lambs, Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb, Apocalypse Now, The Lord of the Rings: The Return of the King, An American in Paris, The Treasure of the Sierra Madre, High Noon, Goodfellas, Mr. Smith Goes to Washington, Annie Hall, Out of Africa, Terms of Endearment, Shane, American Graffiti, Mutiny on the Bounty

Cluster 4 words: dorothy, maria, juror, andy, munny, kane
Cluster 4 titles (18 movies): 
The Shawshank Redemption, Citizen Kane, The Wizard of Oz, The Sound of Music, West Side Story, E.T. the Extra-Terrestrial, 12 Angry Men, From Here to Eternity, Unforgiven, To Kill a Mockingbird, City Lights, It Happened One Night, A Place in the Sun, Tootsie, Fargo, Pulp Fiction, Stagecoach, Wuthering Heig