In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from gensim.summarization.bm25 import BM25
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re



In [2]:
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")

In [3]:
# Lowercase, Stem, and Tokenize Plot
def stem_tokenize(x, stemmer = PorterStemmer, word_tokenizer=TreebankWordTokenizer):   
    x = x.lower()
    x = re.sub('[^a-zA-z]', ' ', x)
    tokens = word_tokenizer().tokenize(x)
    return [stemmer().stem(w) for w in tokens]

In [4]:
x = df["Plot"].iloc[0]

print("Raw Text")
print(x)
print("\n")
print("Tokenized and Stemmed")
print(stem_tokenize(x))

Raw Text
A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]


Tokenized and Stemmed
['a', 'bartend', 'is', 'work', 'at', 'a', 'saloon', 'serv', 'drink', 'to', 'custom', 'after', 'he', 'fill', 'a', 'stereotyp', 'irish', 'man', 's', 'bucket', 'with', 'beer', 'carri', 'nation', 'and', 'her', 'follow', 'burst', 'insid', 'they', 'assault', 'the', 'irish', 'man', 'pull', 'hi', 'hat', 'over', 'hi', 'eye', 'and', 'then', 'dump', 'the', 'beer', 'over', 'hi', 'head', 'the', 'group', 'then', 'begin', 'wreck', 'the', 'bar', 'smash', 'the', 'fixtur', 'mirro

In [5]:
# Tokenize entire corpus
corpus = df["Plot"].apply(stem_tokenize)

In [6]:
# Output as pickle object
pickle.dump(corpus, open("plot_tokens.pkl", "wb"))

In [7]:
class BM25_rank(BM25):
    def __init__(self, corpus, tokenizer=stem_tokenize, titles=None):
        super().__init__(corpus)
        self.tokenizer = tokenizer
        self.average_idf = sum(map(lambda k: float(self.idf[k]), self.idf.keys())) / len(self.idf.keys())
        self.titles = titles
        
    def movie_query(self, query, top_n=10):
        query_tokens = self.tokenizer(query)
        rank_scores =  self.get_scores(query_tokens, self.average_idf)
        
        rank_scores = list(zip(np.arange(self.corpus_size), rank_scores))
        
        ranks_sorted = sorted(rank_scores, key=lambda x: x[1])[::-1]
        
        if self.titles is None:
            return ranks_sorted[:top_n]
        else:
            return [(self.titles[x], y) for x, y in ranks_sorted[:top_n]]   

In [8]:
bm25 = BM25_rank(corpus, titles = df[["Title", "Release Year"]].apply(lambda x: "%s (%s)" %(x[0], x[1]), axis=1))

In [9]:
pickle.dump(bm25, open("bm25_obj.pkl", "wb"))

In [10]:
# Shape of Water
bm25.movie_query("woman in a research facility falls in love with a fish creature")

[('Doom (2005)', 41.471848025579355),
 ('Trog (1970)', 40.32975068181052),
 ('Storage 24 (2012)', 38.49302206015112),
 ('Deep Blue Sea (1999)', 38.1974182899188),
 ('Godzilla (1998)', 37.85995706040838),
 ('The Shape of Water (2017)', 37.78273190160721),
 ('Local Hero (1983)', 37.75685055258853),
 ('Zeus and Roxanne (1997)', 36.48649776514964),
 ('Happy Feet (2006)', 36.18962800887014),
 ('Happy Feet (2006)', 36.18962800887014)]

In [11]:
# The Goonies
bm25.movie_query("kids go on a quest to find a pirate's treasure in order to save their town")

[('Goonies, The (1985)', 56.803992327928626),
 ('Animal Treasure Island (1971)', 56.6740520416034),
 ("Doraemon: Nobita's Great Adventure in the South Seas (1998)",
  56.647181678541784),
 ('Detective Conan: Jolly Roger in the Deep Azure (2007)', 56.333854189167525),
 ('One Piece: The Movie (2000)', 56.29234553751853),
 ('Treasure Planet (2002)', 56.09902804385832),
 ('The Pirate Movie (1982)', 56.067366307676046),
 ('Muppet Treasure Island (1996)', 55.457005033825396),
 ('Cutthroat Island (1995)', 53.96883843697358),
 ('Double Crossbones (1951)', 53.86277318641123)]

In [12]:
bm25.movie_query("the griswolds go on a vacation")

[('Vegas Vacation (1997)', 32.134485023689095),
 ("National Lampoon's Vacation (1983)", 31.708393827184203),
 ('The Court Jester (1956)', 29.88037447955937),
 ('Goodbye, My Fancy (1951)', 28.901828666468482),
 ('The Killer Shrews (1959)', 28.46725676123539),
 ('Success at Any Price (1934)', 24.30014821506127),
 ("National Lampoon's Christmas Vacation (1989)", 24.270342566383533),
 ('The Secret Life of Walter Mitty (1947)', 22.156401873304667),
 ('Now or Never (1921)', 21.998062183000982),
 ('Now or Never (1920)', 21.998062183000982)]

In [13]:
# Shawshank Redemption
bm25.movie_query("man is wrongfully convicted of murder.  He goes to prison. Escapes in a tunnel")

[('The Shawshank Redemption (1994)', 46.69316416148913),
 ('Behind the News (1940)', 46.37903808420655),
 ('They Live by Night (1949)', 45.37402674282349),
 ('They Live by Night (1948)', 45.37402674282349),
 ('Disaster Zone: Volcano in New York (2006)', 44.77080771121156),
 ('Danger Within (1959)', 44.76878173020662),
 ('The Great Escape (1963)', 44.47327483907986),
 ('Convicted (1950)', 44.32509711018184),
 ('Murder in the First (1995)', 44.17752971238405),
 ('Muktodhara (2012)', 44.04501153878567)]

In [14]:
# The Hunger Games
bm25.movie_query("children from each district must fight to the death")

[('The Hunger Games (2012)', 27.467279904347578),
 ('Catching Fire, The Hunger Games:The Hunger Games: Catching Fire (2013)',
  26.08755519847012),
 ('Where Are My Children? (1916)', 26.004233583375527),
 ('Subramaniapuram (2008)', 25.45166982886299),
 ("Tantei Opera Milky Holmes the Movie: Milky Holmes' Counterattack (2016)",
  25.13291845688281),
 ('The Story of Qiu Ju (1992)', 24.752382910885768),
 ('Perumazhakkalam (2004)', 24.528254818851092),
 ('Ilavattam (2006)', 24.443887420861273),
 ('28 Weeks Later (2007)', 24.34382582673216),
 ('The Dark Tower (2017)', 24.265703004075235)]