In [None]:
import re
import string
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import spacy
from spacy import displacy
from spacy.pipeline import merge_entities

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances

from corextopic import corextopic as ct
from corextopic import vis_topic as vt


In [None]:
# Read the summaries and keep an untouched copy of every summary in
# `original_summary`, so the raw text is still available after `summary`
# has been cleaned.
df = pd.read_csv('movie_summaries.csv', index_col=0).assign(
    original_summary=lambda frame: frame['summary']
)

In [None]:
# Replace every run of digits in the summaries with a single space.
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which Python reports as a DeprecationWarning (a SyntaxWarning
# from Python 3.12 onwards).
df['summary'] = df['summary'].str.replace(r'\d+', ' ', regex=True)


# Baseline Topic Models

In [None]:
def make_topics(docs, vectorizer, topic_modeler, n_words=15):
    """Vectorize ``docs``, fit ``topic_modeler`` on the result and print every topic.

    Parameters
    ----------
    docs : iterable
        Documents passed to ``vectorizer.fit_transform``.
    vectorizer : object
        Must provide ``fit_transform`` and either ``get_feature_names_out``
        (scikit-learn >= 1.0) or the legacy ``get_feature_names``.
    topic_modeler : object
        Must provide ``fit_transform`` and, once fitted, ``components_``
        (iterated as one array of per-word weights per topic).
    n_words : int, default 15
        Number of highest-weighted words printed for each topic.

    Returns
    -------
    tuple
        ``(doc_word_vectors, doc_topic_vectors)``: the results of the
        vectorizer's and the topic model's ``fit_transform`` calls.
    """
    # Vectorize documents into a document-word matrix.
    doc_word_vectors = vectorizer.fit_transform(docs)

    # Fit the topic model.
    doc_topic_vectors = topic_modeler.fit_transform(doc_word_vectors)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    # Print the topics.
    for idx, topic in enumerate(topic_modeler.components_):
        # argsort is ascending, so walk it backwards to get the n_words
        # largest weights first.
        top_words = [vocab[i].upper() for i in topic.argsort()[:-n_words - 1:-1]]
        print(f'Topic {idx}:\n', ', '.join(top_words), '\n')

    return doc_word_vectors, doc_topic_vectors


In [None]:
def token_formatter(token, doc_entities):
    """Return the normalised string used as the vocabulary entry for ``token``.

    * Any token whose text contains "academy award" (case-insensitive)
      becomes the literal ``'academy award'``.
    * Tokens whose text is listed in ``doc_entities`` or whose ``pos_`` is
      ``'PROPN'`` use the lower-cased, stripped lemma with every ``'star '``
      substring removed, punctuation runs replaced by a space and whitespace
      collapsed to single spaces.
    * Every other token is returned as its lower-cased, stripped lemma.
    """
    if 'academy award' in token.text.lower():
        return 'academy award'

    lemma = token.lemma_.lower().strip()

    # Ordinary words: nothing beyond lower-casing and stripping the lemma.
    if token.text not in doc_entities and token.pos_ != 'PROPN':
        return lemma

    # Entities and proper nouns get the extra clean-up.
    cleaned = lemma.replace('star ', '')
    cleaned = re.sub(f'[{string.punctuation}]+', ' ', cleaned).strip().lower()
    return re.sub(r'\s+', ' ', cleaned)
            

In [None]:
def spacy_tokenizer(spacy_doc):
    """Turn a parsed document into a list of formatted token strings.

    A token is dropped when any of these holds:

    * it is flagged ``is_stop`` or its lemma is in ``nlp.Defaults.stop_words``
      (``nlp`` is read from the notebook's global namespace at call time);
    * its ``pos_`` is SPACE, PUNCT or SYM;
    * its text contains a punctuation character other than ``'+'``;
    * its text equals the text of a DATE, CARDINAL, ORDINAL or MONEY entity;
    * ``len(token)`` is 2 or less.

    Surviving tokens are passed to ``token_formatter`` together with the
    texts of the remaining entities.
    """
    skipped_labels = ('DATE', 'CARDINAL', 'ORDINAL', 'MONEY')
    skipped_pos = ('SPACE', 'PUNCT', 'SYM')
    # string.punctuation with '+' swapped for '_', so '+' alone never
    # disqualifies a token.
    banned_chars = string.punctuation.replace('+', '_')

    # Split entity texts into the ones kept for formatting and the ones ignored.
    kept_entities, skipped_entities = [], []
    for ent in spacy_doc.ents:
        bucket = skipped_entities if ent.label_ in skipped_labels else kept_entities
        bucket.append(ent.text)

    formatted = []
    for token in spacy_doc:
        if token.is_stop or token.lemma_ in nlp.Defaults.stop_words:
            continue
        if token.pos_ in skipped_pos:
            continue
        if any(ch in token.text for ch in banned_chars):
            continue
        if token.text in skipped_entities:
            continue
        if len(token) <= 2:
            continue
        formatted.append(token_formatter(token, kept_entities))

    return formatted


In [None]:
def _style_topic_axis(ax, labels, weights, topic_number):
    """Draw one topic as a horizontal bar chart on ``ax`` and apply the shared styling."""
    ax.barh(labels, weights, height=0.7)
    ax.set_title(f'Topic {topic_number}',
                 fontdict={'fontsize': 30})
    # Largest weight at the top of the panel.
    ax.invert_yaxis()
    ax.tick_params(axis='both', which='major', labelsize=20)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)


def plot_top_words(model, feature_names, n_words, title, rows=2, cols=5):
    """Plot the top ``n_words`` words of every topic in a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    model : object
        Either a ``corextopic`` ``Corex`` instance (topics are read from
        ``model.get_topics(n_words=n_words)``) or any fitted model exposing
        ``components_`` (one array of per-word weights per topic).
    feature_names : sequence
        Vocabulary indexed by column of ``components_``. Not used for Corex
        models, which return the words themselves.
    n_words : int
        Number of words shown per topic.
    title : str
        Figure title.
    rows, cols : int
        Grid shape; ``rows * cols`` must be at least the number of topics.
    """
    # squeeze=False always returns a 2-D array of Axes, so flatten() also
    # works for a 1 x 1 grid.
    fig, axes = plt.subplots(rows, cols, figsize=(30, 15), sharex=True, squeeze=False)
    axes = axes.flatten()

    # isinstance replaces a comparison against str(type(model)), which
    # silently stopped matching for subclasses.
    if isinstance(model, ct.Corex):
        for topic_idx, topic_words in enumerate(model.get_topics(n_words=n_words)):
            # Each entry starts with (word, weight); later fields are ignored.
            top_features = [word[0] for word in topic_words]
            weights = [word[1] for word in topic_words]
            _style_topic_axis(axes[topic_idx], top_features, weights, topic_idx + 1)
    else:
        for topic_idx, topic in enumerate(model.components_):
            # argsort is ascending; the reversed slice yields the n_words
            # largest weights in descending order.
            top_features_ind = topic.argsort()[:-n_words - 1:-1]
            top_features = [feature_names[i] for i in top_features_ind]
            weights = topic[top_features_ind]
            _style_topic_axis(axes[topic_idx], top_features, weights, topic_idx + 1)

    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
    

## CountVectorizer, NMF, 10 topic model

In [None]:
docs = df['summary']

# Load the spaCy model and append `merge_entities`, the pipeline step that
# merges named entities into a single token.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('merge_entities')

# Extra stop words added to spaCy's defaults.
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'location',
    'include', 'direct', 'set', 'widescreen', 'studio',
}

# Raw term counts factorised by NMF into 10 topics.
vectorizer = CountVectorizer(preprocessor=nlp, tokenizer=spacy_tokenizer)
topic_modeler = NMF(n_components=10, max_iter=1000, random_state=42)

doc_word_vectors, doc_topic_vectors = make_topics(docs, vectorizer, topic_modeler)


In [None]:
# The inline backend is already enabled in the imports cell.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Count, NMF, 10 topic model')

In [None]:
# Document-topic weights rounded to 5 decimals, indexed by movie title.
doc_topic_df = pd.DataFrame(
    data=np.round(doc_topic_vectors, 5),
    index=df['title'],
)
doc_topic_df

## TfidfVectorizer, NMF, 10 topic model

In [None]:
docs = df['summary']

# Reload the spaCy model and append `merge_entities`, which merges named
# entities into a single token.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('merge_entities')

# Extra stop words added to spaCy's defaults.
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'location',
    'include', 'direct', 'set', 'widescreen', 'studio',
}

# TF-IDF weights factorised by NMF into 10 topics.
vectorizer = TfidfVectorizer(preprocessor=nlp, tokenizer=spacy_tokenizer)
topic_modeler = NMF(n_components=10, max_iter=1000, random_state=42)

doc_word_vectors, doc_topic_vectors = make_topics(docs, vectorizer, topic_modeler)


In [None]:
# The inline backend is already enabled in the imports cell.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Tfidf, NMF, 10 topic model')

In [None]:
# Document-topic weights rounded to 5 decimals, indexed by movie title.
rounded_topic_weights = np.round(doc_topic_vectors, 5)
doc_topic_df = pd.DataFrame(rounded_topic_weights, index=df['title'])
doc_topic_df

## CountVectorizer, CorEx, 10 topic model


In [None]:
docs = df['summary']

nlp = spacy.load('en_core_web_lg')
# add step to pipeline that merges named entities into a single token
nlp.add_pipe('merge_entities')
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'location',
    'include', 'direct', 'set', 'widescreen', 'studio',
}

# binary=True records presence/absence of a word rather than its count.
vectorizer = CountVectorizer(preprocessor=nlp, tokenizer=spacy_tokenizer, binary=True)

doc_word_vectors = vectorizer.fit_transform(docs)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
words = list(vectorizer.get_feature_names_out())

topic_modeler = ct.Corex(n_hidden=10, words=words, seed=42)
topic_modeler.fit(doc_word_vectors, words=words, docs=docs)

topics = topic_modeler.get_topics()
for n, topic in enumerate(topics):
    # Keep only the word from each entry; unlike unpacking zip(*topic), this
    # also copes with a topic that has no words.
    topic_words = [entry[0] for entry in topic]
    print('Topic {}:'.format(n))
    print(', '.join(topic_words), '\n')
    

In [None]:
# The inline backend is already enabled in the imports cell.
# plot_top_words ignores the feature names for Corex models, but the argument
# is still evaluated, so the removed get_feature_names() would raise on
# scikit-learn >= 1.2.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Count, Corex, 10 topic model')

In [None]:
# One bar per topic, showing the entries of the fitted model's `tcs` array.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(range(topic_modeler.tcs.shape[0]), topic_modeler.tcs, width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16)
plt.show()


In [None]:
# The model's `p_y_given_x` array as a frame indexed by movie title
# (presumably per-document topic probabilities; confirm against corextopic).
doc_topic_df = pd.DataFrame(
    data=topic_modeler.p_y_given_x,
    index=df['title'],
)
doc_topic_df

## TfidfVectorizer, TruncatedSVD, 10 topic model

In [None]:
docs = df['summary']

# Load the spaCy model and append `merge_entities`, the pipeline step that
# merges named entities into a single token.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('merge_entities')

# Extra stop words added to spaCy's defaults.
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'location',
    'include', 'direct', 'set', 'widescreen', 'studio',
}

# TF-IDF weights reduced to 10 components with truncated SVD.
vectorizer = TfidfVectorizer(
    preprocessor=nlp,
    tokenizer=spacy_tokenizer,
)
topic_modeler = TruncatedSVD(
    n_components=10,
    random_state=42,
)

doc_word_vectors, doc_topic_vectors = make_topics(docs, vectorizer, topic_modeler)


In [None]:
# The inline backend is already enabled in the imports cell.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Tfidf, SVD, 10 topic model')

In [None]:
# Document-component weights rounded to 5 decimals, indexed by movie title.
doc_topic_df = pd.DataFrame(np.round(doc_topic_vectors, decimals=5),
                            index=df['title'])
doc_topic_df

## CountVectorizer, CorEx, 10 topic model, tuning min_df


In [None]:
docs = df['summary']

nlp = spacy.load('en_core_web_lg')
# add step to pipeline that merges named entities into a single token
nlp.add_pipe('merge_entities')
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'location',
    'include', 'direct', 'set', 'widescreen', 'studio',
}

# min_df=0.0025 drops terms that appear in fewer than 0.25% of the documents;
# binary=True records presence/absence rather than counts.
vectorizer = CountVectorizer(preprocessor=nlp, tokenizer=spacy_tokenizer,
                             min_df=0.0025, binary=True)

doc_word_vectors = vectorizer.fit_transform(docs)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
words = list(vectorizer.get_feature_names_out())

topic_modeler = ct.Corex(n_hidden=10, words=words, seed=42)
topic_modeler.fit(doc_word_vectors, words=words, docs=docs)

topics = topic_modeler.get_topics()
for n, topic in enumerate(topics):
    # Keep only the word from each entry; unlike unpacking zip(*topic), this
    # also copes with a topic that has no words.
    topic_words = [entry[0] for entry in topic]
    print('Topic {}:'.format(n))
    print(', '.join(topic_words), '\n')
    

In [None]:
# The inline backend is already enabled in the imports cell.
# plot_top_words ignores the feature names for Corex models, but the argument
# is still evaluated, so the removed get_feature_names() would raise on
# scikit-learn >= 1.2.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Count, Corex, 10 topic model')

In [None]:
# One bar per topic, showing the entries of the fitted model's `tcs` array.
fig, ax = plt.subplots(figsize=(10, 5))
topic_positions = range(topic_modeler.tcs.shape[0])
ax.bar(topic_positions, topic_modeler.tcs, width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16)
plt.show()


In [None]:
# The model's `p_y_given_x` array as a frame indexed by movie title
# (presumably per-document topic probabilities; confirm against corextopic).
doc_topic_df = pd.DataFrame(topic_modeler.p_y_given_x, index=df['title'])
doc_topic_df

## CountVectorizer, CorEx, tuning topics and anchor topics


In [None]:
docs = df['summary']

nlp = spacy.load('en_core_web_lg')
# add step to pipeline that merges named entities into a single token
nlp.add_pipe('merge_entities')
nlp.Defaults.stop_words |= {
    'min', 'release', 'film', 'video', 'movie', 'location', 'include',
    'direct', 'set', 'widescreen', 'studio', 'cartoon', 'touchstone',
    'cinemascope',
    # currently disabled: 'touchstone pictures', 'caravan pictures',
    # 'hollywood pictures', 'hollywood picture', 'educational'
}

# min_df=0.0025 drops terms that appear in fewer than 0.25% of the documents;
# binary=True records presence/absence rather than counts.
vectorizer = CountVectorizer(preprocessor=nlp, tokenizer=spacy_tokenizer,
                             min_df=0.0025, binary=True)

doc_word_vectors = vectorizer.fit_transform(docs)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
words = list(vectorizer.get_feature_names_out())

# Each inner list is one group of anchor words passed to Corex.
topic_anchors = [
                 ['mickey', 'mickey mouse', 'pluto'],
                 ['donald', 'donald duck', 'nephew'],
                 ['live', 'action'],
                 ['academy', 'award'],
                 ['pooh', 'tigger', 'piglet'],
                 #['book', 'novel'],
                 ['educational']
                ]

topic_modeler = ct.Corex(n_hidden=27, words=words, seed=42,
                         anchors=topic_anchors, anchor_strength=3)
topic_modeler.fit(doc_word_vectors, words=words, docs=docs)

topics = topic_modeler.get_topics()
for n, topic in enumerate(topics):
    # Keep only the word from each entry; unlike unpacking zip(*topic), this
    # also copes with a topic that has no words.
    topic_words = [entry[0] for entry in topic]
    print('Topic {}:'.format(n+1))
    print(', '.join(topic_words), '\n')
    

In [None]:
# One bar per topic, showing the entries of the fitted model's `tcs` array.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    range(topic_modeler.tcs.shape[0]),
    topic_modeler.tcs,
    width=0.5,
)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16)
plt.show()


In [None]:
# The inline backend is already enabled in the imports cell.
# plot_top_words ignores the feature names for Corex models, but the argument
# is still evaluated, so the removed get_feature_names() would raise on
# scikit-learn >= 1.2.
# A 3 x 9 grid gives 27 panels, one per topic of the 27-topic model.
plot_top_words(topic_modeler, vectorizer.get_feature_names_out(), 10,
               'Topics in Count, Corex model', rows=3, cols=9)

In [None]:
# The model's `labels` array, one row per movie title.
# The column count comes from the array itself rather than a hard-coded 27,
# so the names stay valid when the model's n_hidden is changed.
topic_labels = topic_modeler.labels
doc_topic_df = pd.DataFrame(topic_labels,
                            index=df['title'],
                            columns=[f'topic_{n}' for n in range(1, topic_labels.shape[1] + 1)])
doc_topic_df


In [None]:
# The model's `p_y_given_x` array with the default integer index; titles are
# deliberately not used as the index here.
# The column count comes from the array itself rather than a hard-coded 27,
# so the names stay valid when the model's n_hidden is changed.
topic_probabilities = topic_modeler.p_y_given_x
doc_topic_df = pd.DataFrame(topic_probabilities,
                            #index = df['title'],
                            columns=[f'topic_{n}' for n in range(1, topic_probabilities.shape[1] + 1)])
doc_topic_df


## Test recommendation metrics

In [None]:
def test_distance_metrics(movie_idx, n_recs=3):
    """Print the ``n_recs`` nearest movies to one movie under several distance metrics.

    Reads the notebook globals ``df`` (titles and original summaries) and
    ``doc_topic_df`` (one topic vector per movie); row ``i`` of both is taken
    to describe the same movie.

    Parameters
    ----------
    movie_idx : int
        Row position of the query movie.
    n_recs : int, default 3
        Number of recommendations printed per metric; fewer are printed when
        the collection is too small.
    """
    topic_matrix = doc_topic_df.to_numpy()
    # Resolve to a plain non-negative position (raises IndexError when out of range).
    query_pos = range(len(topic_matrix))[movie_idx]

    # .iloc keeps these lookups positional, matching how rows of doc_topic_df
    # are addressed; label-based df[...][i] only agrees with that when the
    # index happens to be 0..n-1.
    print(df['title'].iloc[query_pos])
    print(df['original_summary'].iloc[query_pos][:100] + '...')
    print('\n')

    # 'l2' is scikit-learn's alias for 'euclidean', so those two lists match.
    distance_metrics = ['cosine', 'euclidean', 'l1', 'l2']

    for metric in distance_metrics:
        distances = pairwise_distances(topic_matrix[[query_pos]], topic_matrix, metric=metric)[0]
        ranked = distances.argsort(kind='stable')
        # Drop the query movie explicitly: it is not guaranteed to sort first
        # when another movie has an identical topic vector.
        recs = ranked[ranked != query_pos][:n_recs]

        print(f'Recommendations using {metric} distance metric:')
        for rec in recs:
            print('\t', df['title'].iloc[rec])
        print('\n')
        

In [None]:
test_distance_metrics(movie_idx=211)

In [None]:
test_distance_metrics(movie_idx=1095)

In [None]:
test_distance_metrics(movie_idx=1762)

In [None]:
test_distance_metrics(movie_idx=2095)

In [None]:
test_distance_metrics(movie_idx=260)

## Test single recommendations

In [None]:
def recommendation_single(movie_idx, n_recs=5):
    """Print the ``n_recs`` movies closest to one movie by cosine distance.

    Reads the notebook globals ``df`` (titles and original summaries) and
    ``doc_topic_df`` (one topic vector per movie); row ``i`` of both is taken
    to describe the same movie.

    Parameters
    ----------
    movie_idx : int
        Row position of the query movie.
    n_recs : int, default 5
        Number of recommendations printed; fewer are printed when the
        collection is too small.
    """
    topic_matrix = doc_topic_df.to_numpy()
    # Resolve to a plain non-negative position (raises IndexError when out of range).
    query_pos = range(len(topic_matrix))[movie_idx]

    # .iloc keeps these lookups positional, matching how rows of doc_topic_df
    # are addressed; label-based df[...][i] only agrees with that when the
    # index happens to be 0..n-1.
    print(df['title'].iloc[query_pos])
    print(df['original_summary'].iloc[query_pos][:100] + '...')
    print('\n')

    distances = pairwise_distances(topic_matrix[[query_pos]], topic_matrix, metric='cosine')[0]
    ranked = distances.argsort(kind='stable')
    # Drop the query movie explicitly: it is not guaranteed to sort first
    # when another movie has an identical topic vector.
    recs = ranked[ranked != query_pos][:n_recs]

    for rec in recs:
        print('\t', df['title'].iloc[rec])
        print('\t', df['original_summary'].iloc[rec][:100] + '...')
        print('\n')
    

In [None]:
recommendation_single(movie_idx=210, n_recs=3)

In [None]:
recommendation_single(movie_idx=1762, n_recs=3)

## Test pair recommendations

In [None]:
def recommendation_pair(movie_idx_1, movie_idx_2, n_recs=5):
    """Print the ``n_recs`` movies that rank closest to two selected movies combined.

    Every movie is ranked by cosine distance to each selected movie; the two
    ranks are summed and the movies with the smallest sums are printed in
    ascending order, leaving out the selected movies themselves.

    Reads the notebook globals ``df`` (titles and original summaries) and
    ``doc_topic_df`` (one topic vector per movie); row ``i`` of both is taken
    to describe the same movie.

    Parameters
    ----------
    movie_idx_1, movie_idx_2 : int
        Row positions of the two selected movies (they may be equal).
    n_recs : int, default 5
        Number of recommendations printed; fewer are printed when the
        collection is too small.
    """
    topic_matrix = doc_topic_df.to_numpy()
    n_movies = len(topic_matrix)
    # Resolve to plain non-negative positions (raises IndexError when out of range).
    pos_1 = range(n_movies)[movie_idx_1]
    pos_2 = range(n_movies)[movie_idx_2]

    # .iloc keeps these lookups positional, matching how rows of doc_topic_df
    # are addressed.
    print('First selected movie:')
    print(df['title'].iloc[pos_1])
    print(df['original_summary'].iloc[pos_1][:100] + '...')
    print('\n')

    print('Second selected movie:')
    print(df['title'].iloc[pos_2])
    print(df['original_summary'].iloc[pos_2][:100] + '...')
    print('\n')

    print('Getting recommendations...')

    combined_rank = np.zeros(n_movies, dtype=int)
    for pos in (pos_1, pos_2):
        distances = pairwise_distances(topic_matrix[[pos]], topic_matrix, metric='cosine')[0]
        order = distances.argsort(kind='stable')
        # Invert the permutation: rank[i] is the place of movie i in `order`.
        # This replaces a list.index() lookup per movie, which was quadratic.
        rank = np.empty(n_movies, dtype=int)
        rank[order] = np.arange(n_movies)
        combined_rank += rank

    # A full stable sort gives a true best-first ordering. np.argpartition
    # only guarantees which elements land before the pivot, not their order,
    # and says nothing about the slots after it.
    ranked = combined_rank.argsort(kind='stable')
    recs = ranked[~np.isin(ranked, [pos_1, pos_2])][:n_recs]

    print('\n')
    for rec in recs:
        print('\t', df['title'].iloc[rec])
        print('\t', df['original_summary'].iloc[rec][:100] + '...')
        print('\n')

In [None]:
recommendation_pair(movie_idx_1=210, movie_idx_2=1762, n_recs=3)

In [None]:
recommendation_pair(movie_idx_1=210, movie_idx_2=1522, n_recs=3)

In [None]:
# Walt Disney's favorite movies: Bambi, Dumbo
recommendation_pair(movie_idx_1=184, movie_idx_2=584, n_recs=3)

In [None]:
recommendation_pair(movie_idx_1=1896, movie_idx_2=1981, n_recs=3)

In [None]:
# Persist the final document-topic frame and the movie frame as pickles.
pd.to_pickle(doc_topic_df, 'doc_topic_df.pkl')
pd.to_pickle(df, 'movie_summaries.pkl')
