In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from gensim.summarization.bm25 import BM25
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re



In [2]:
# Load the movie-plot table; later cells read its "Plot", "Title" and
# "Release Year" columns.
PLOTS_CSV = "../data/wiki_movie_plots_deduped.csv"
df = pd.read_csv(PLOTS_CSV)

In [3]:
# Lowercase, Stem, and Tokenize Plot
def stem_tokenize(x, stemmer = PorterStemmer, word_tokenizer=TreebankWordTokenizer):
    """Lowercase a text, strip everything but letters, then tokenize and stem it.

    Parameters
    ----------
    x : str
        Raw text (a movie plot or a search query).
    stemmer : callable
        Zero-argument factory returning an object with a ``stem(word)`` method.
    word_tokenizer : callable
        Zero-argument factory returning an object with a ``tokenize(text)``
        method.

    Returns
    -------
    list
        One stemmed token per token produced by the tokenizer, in order.
    """
    x = x.lower()
    # Use a-z, not A-z: the range A-z also spans the ASCII punctuation that sits
    # between "Z" and "a" ([ \ ] ^ _ `), which would let citation markers such
    # as "[1]" through as bracket tokens. After lower() only a-z is needed.
    x = re.sub('[^a-z]', ' ', x)
    tokens = word_tokenizer().tokenize(x)
    # Instantiate the stemmer once per call rather than once per token.
    stem = stemmer().stem
    return [stem(w) for w in tokens]

In [4]:
# Show the first plot before and after preprocessing.
x = df["Plot"].iloc[0]

# sep="\n" puts each item on its own line; the bare "\n" item supplies the
# blank lines that separate the two sections.
print("Raw Text", x, "\n", sep="\n")
print("Tokenized and Stemmed", stem_tokenize(x), sep="\n")

Raw Text
A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]


Tokenized and Stemmed
['a', 'bartend', 'is', 'work', 'at', 'a', 'saloon', 'serv', 'drink', 'to', 'custom', 'after', 'he', 'fill', 'a', 'stereotyp', 'irish', 'man', 's', 'bucket', 'with', 'beer', 'carri', 'nation', 'and', 'her', 'follow', 'burst', 'insid', 'they', 'assault', 'the', 'irish', 'man', 'pull', 'hi', 'hat', 'over', 'hi', 'eye', 'and', 'then', 'dump', 'the', 'beer', 'over', 'hi', 'head', 'the', 'group', 'then', 'begin', 'wreck', 'the', 'bar', 'smash', 'the', 'fixtur', 'mirro

In [None]:
# Run the preprocessing over every plot in the table; the result holds one
# token list per row of df.
plots = df["Plot"]
corpus = plots.apply(stem_tokenize)

In [None]:
# Output as pickle object
# The with-block closes (and flushes) the file even if pickling fails part-way.
with open("plot_tokens.pkl", "wb") as token_file:
    pickle.dump(corpus, token_file)

In [None]:
class BM25_rank(BM25):
    """BM25 index with a helper for ranking documents against a free-text query.

    Parameters
    ----------
    corpus
        Tokenized documents, forwarded unchanged to ``BM25.__init__``.
    tokenizer : callable
        Turns a raw query string into the token list passed to ``get_scores``.
    titles : optional
        Display label per document, in the same order as ``corpus``. When
        omitted, queries return positional document indices instead.
    """

    def __init__(self, corpus, tokenizer=stem_tokenize, titles=None):
        super().__init__(corpus)
        self.tokenizer = tokenizer
        # Mean IDF over every term in the index; passed to get_scores() on
        # each query.
        idf_values = [float(self.idf[k]) for k in self.idf.keys()]
        self.average_idf = sum(idf_values) / len(idf_values)
        self.titles = titles

    def movie_query(self, query, top_n=10):
        """Return the ``top_n`` highest-scoring documents for ``query``.

        Parameters
        ----------
        query : str
            Free-text description; it is run through ``self.tokenizer``.
        top_n : int
            Maximum number of results to return.

        Returns
        -------
        list of tuple
            ``(document, score)`` pairs, best score first. ``document`` is the
            positional index into the corpus when ``titles`` is None, otherwise
            the title stored at that position.
        """
        query_tokens = self.tokenizer(query)
        # NOTE(review): this two-argument get_scores() call presumably matches
        # an older gensim BM25 API -- confirm against the installed version.
        rank_scores = self.get_scores(query_tokens, self.average_idf)

        rank_scores = list(zip(np.arange(self.corpus_size), rank_scores))

        # Ascending sort followed by a reversal (rather than reverse=True) so
        # that documents with equal scores keep their existing relative order.
        ranks_sorted = sorted(rank_scores, key=lambda x: x[1])[::-1]
        top = ranks_sorted[:top_n]

        if self.titles is None:
            return top

        # Document ids are positions in the corpus, so titles must be fetched by
        # position. Plain [] on a pandas Series is label-based and would return
        # the wrong row (or raise) whenever its index is not 0..n-1; lists and
        # other plain sequences have no .iloc and are indexed directly.
        lookup = getattr(self.titles, "iloc", self.titles)
        return [(lookup[i], score) for i, score in top]

In [None]:
# Build a "Title (Release Year)" label for every row. Columns are addressed by
# name: integer keys on a label-indexed row rely on a positional fallback that
# pandas has deprecated.
movie_titles = df[["Title", "Release Year"]].apply(
    lambda row: "%s (%s)" % (row["Title"], row["Release Year"]), axis=1
)
bm25 = BM25_rank(corpus, titles=movie_titles)

In [None]:
# Persist the fitted index; the with-block guarantees the file is closed.
with open("bm25_obj.pkl", "wb") as index_file:
    pickle.dump(bm25, index_file)

In [None]:
# Shape of Water -- the query describes the plot without using the title.
bm25.movie_query(
    query="woman in a research facility falls in love with a fish creature"
)

In [None]:
# The Goonies -- the query describes the plot without using the title.
bm25.movie_query(
    query="kids go on a quest to find a pirate's treasure in order to save their town"
)

In [None]:
# NOTE(review): no target film is annotated for this query.
bm25.movie_query(
    query="the griswolds go on a vacation"
)

In [None]:
# Shawshank Redemption -- the query describes the plot without using the title.
bm25.movie_query(
    query="man is wrongfully convicted of murder.  He goes to prison. Escapes in a tunnel"
)

In [None]:
# The Hunger Games -- the query describes the plot without using the title.
bm25.movie_query(
    query="children from each district must fight to the death"
)