# Spoilers

In [1]:
# Notebook authorship and course/term metadata.
__author__ = "Kristine Guo and Caroline Ho"
__version__ = "CS224u, Stanford, Spring 2018 term"

## Contents

0. [Overview](#Overview)
0. [Set-up](#Set-up)
0. [Baseline](#Baseline)
  0. [Features](#Features)
  0. [Experiment](#Experiment)
0. [Sentiment](#Sentiment)
0. [Dependency Parsing](#Dependency-Parsing)

## Overview



## Set-up

* Make sure your environment meets all the requirements for [the cs224u repository](https://github.com/cgpotts/cs224u/). For help getting set-up, see [setup.ipynb](setup.ipynb).

* Make sure you've downloaded [the data distribution for this unit](http://web.stanford.edu/class/cs224u/data/vsmdata.zip), unpacked it, and placed it in the current directory (or wherever you point `data_home` to below).

In [1]:
from collections import Counter
import copy
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordDependencyParser
import numpy as np
import os
import pandas as pd
import PorterStemmer
import scipy.stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
import string

In [2]:
# Directory containing the `*.balanced.csv` splits read by the following cells.
data_home = 'tvtropes'

In [3]:
# First development split, read from `data_home`.
dev1 = pd.read_csv(os.path.join(data_home, 'dev1.balanced.csv'))

In [4]:
# Second development split, read from `data_home`.
dev2 = pd.read_csv(os.path.join(data_home, 'dev2.balanced.csv'))

In [5]:
# Held-out test split, read from `data_home`.
test = pd.read_csv(os.path.join(data_home, 'test.balanced.csv'))

In [6]:
# Training split, read from `data_home`.
train = pd.read_csv(os.path.join(data_home, 'train.balanced.csv'))

In [7]:
# Show the `trope` value of the test row labelled 0.
print(test.loc[0, 'trope'])

WorkCom


## Baseline

In [7]:
# Shared text-processing resources used by the tokenizer and feature functions.
# Stemmer instance used wherever tokens are stemmed.
ps = PorterStemmer.PorterStemmer()
# Translation table that maps every character in string.punctuation to a space.
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
# NLTK English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def parse_sentence(sentence):
    """Tokenize a raw sentence into lower-cased, stop-word-free tokens.

    Punctuation is first replaced by spaces (via the module-level
    `translator`), the result is split on whitespace, and each piece is
    stripped of punctuation and lower-cased. Empty pieces and tokens found
    in the module-level `stop_words` set are dropped.

    Returns a list of token strings in their original order.
    """
    cleaned = (
        piece.strip(string.punctuation).lower()
        for piece in sentence.translate(translator).split()
    )
    return [token for token in cleaned if token and token not in stop_words]

### Features

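Three bag-of-words feature functions, each mapping a list of tokens to a `Counter` of feature counts: raw unigrams, Porter-stemmed unigrams, and bigrams padded with `<S>` / `</S>` sentence-boundary markers.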

In [8]:
def unigrams_phi(s):
    """Return a Counter mapping each token in `s` to its number of occurrences."""
    counts = Counter()
    counts.update(s)
    return counts

In [9]:
def stemmed_phi(s):
    """Return a Counter over the stems (via the module-level `ps`) of the tokens in `s`."""
    stems = map(ps.stem, s)
    return Counter(stems)

In [10]:
def bigrams_phi(s):
    """Count adjacent token pairs in `s`, including sentence-boundary pairs.

    The token sequence is padded with '<S>' at the start and '</S>' at the
    end, so an empty input still yields the single pair ('<S>', '</S>').

    Parameters
    ----------
    s : iterable of str
        Tokens of one sentence. The input is never modified.

    Returns
    -------
    collections.Counter
        Maps each (left, right) token tuple to its number of occurrences.
    """
    # Build a fresh padded list rather than deep-copying and mutating the
    # input; this also lets tuples and other iterables be passed in.
    padded = ['<S>', *s, '</S>']
    return Counter(zip(padded, padded[1:]))

### Experiment

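Each experiment featurizes the training split with a feature function `phi`, fits a `LinearSVC` on the normalized feature matrix, and then prints accuracy and a per-class classification report for the evaluation split. The macro-averaged F1 score is returned.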

In [11]:
def trunc_sent(sentence, max_len=200):
    """Cap `sentence` at `max_len` characters, logging when truncation happens.

    Parameters
    ----------
    sentence : str
        Text to be capped.
    max_len : int, optional
        Maximum number of characters to keep (default 200, the value that
        was previously hard-coded).

    Returns
    -------
    str
        `sentence` unchanged when it is at most `max_len` characters long,
        otherwise its first `max_len` characters. The cut is made on a
        character boundary, so the last word may be split.
    """
    if len(sentence) <= max_len:
        return sentence
    truncated = sentence[:max_len]
    print("Truncating ", sentence, " to ", truncated)
    return truncated

def vectorize_X_dependency(X, phi, vectorizer=None, should_parse=True):
    """Featurize examples with `phi` plus dependency features, then vectorize.

    Parameters
    ----------
    X : sequence
        Raw sentence strings when `should_parse` is true; otherwise items
        that `phi` accepts directly.
    phi : callable
        Feature function. When `should_parse` is true its result is combined
        with a Counter of dependency features using `+`.
    vectorizer : optional
        A previously fitted vectorizer whose `transform` is applied. When
        None, a dense `DictVectorizer` is created and fitted on these
        features.
    should_parse : bool, optional
        When true, each sentence is tokenized with `parse_sentence` before
        `phi`, and dependency features are added. When false, only
        `phi(item)` is used.

    Returns
    -------
    tuple
        `(feature_matrix, vectorizer)`.
    """
    if should_parse:
        # NOTE(review): the training set is recognised only by its size
        # (11970 examples), so that the cached counters in the module-level
        # `train_parsed_lowered` are reused instead of re-running the parser.
        # Any other input of exactly this length would be paired with the
        # cached training features -- confirm this is acceptable.
        if len(X) == 11970:
            print("In train!")
            feat_dicts = [
                phi(parse_sentence(sentence)) + train_parsed_lowered[i]
                for i, sentence in enumerate(X)
            ]
        else:
            print("Not in train!")
            feat_dicts = [
                phi(parse_sentence(sentence)) + dep_phi(trunc_sent(sentence))
                for sentence in X
            ]
    else:
        feat_dicts = [phi(sentence) for sentence in X]

    # Identity test: `== None` would defer to the vectorizer's own __eq__.
    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        print("Gets to vectorizer")
        return (vectorizer.fit_transform(feat_dicts), vectorizer)
    return (vectorizer.transform(feat_dicts), vectorizer)

In [12]:
def vectorize_X(X, phi, vectorizer=None, should_parse=True):
    """Featurize examples with `phi` and turn them into a feature matrix.

    Parameters
    ----------
    X : sequence
        Raw sentence strings when `should_parse` is true; otherwise items
        that `phi` accepts directly.
    phi : callable
        Feature function returning a mapping of feature name to value.
    vectorizer : optional
        A previously fitted vectorizer whose `transform` is applied. When
        None, a dense `DictVectorizer` is created and fitted on these
        features.
    should_parse : bool, optional
        When true, each sentence is tokenized with `parse_sentence` before
        being passed to `phi`.

    Returns
    -------
    tuple
        `(feature_matrix, vectorizer)`.
    """
    if should_parse:
        feat_dicts = [phi(parse_sentence(sentence)) for sentence in X]
    else:
        feat_dicts = [phi(sentence) for sentence in X]

    # Identity test: `== None` would defer to the vectorizer's own __eq__.
    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        return (vectorizer.fit_transform(feat_dicts), vectorizer)
    return (vectorizer.transform(feat_dicts), vectorizer)

In [13]:
def build_dataset(data, phi, vectorizer=None, should_parse=True):
    """Turn a DataFrame split into a normalized feature matrix and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'sentence' column (inputs) and a 'spoiler' column
        (labels).
    phi : callable
        Feature function forwarded to `vectorize_X_dependency`.
    vectorizer : optional
        Previously fitted vectorizer to reuse; None fits a new one.
    should_parse : bool, optional
        Forwarded to `vectorize_X_dependency`.

    Returns
    -------
    tuple
        `(normalized_feature_matrix, labels, vectorizer)`.
    """
    # Take the columns in row order. The earlier `.loc[i]` lookup over
    # range(len(data)) only works when the index is exactly 0..n-1 and
    # raises KeyError on a filtered or re-indexed frame.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X_dependency(
        X, phi, vectorizer=vectorizer, should_parse=should_parse)
    print("Finishes vectorize_X_dependency")
    print(feat_matrix.shape)
    return (normalize(feat_matrix), y, vectorizer)

In [14]:
def fit_svc(X, y):
    """Fit a `LinearSVC` with default settings on (X, y) and return the fitted model."""
    # `fit` returns the estimator itself, so the call can be returned directly.
    return LinearSVC().fit(X, y)

In [15]:
def experiment(phi, train_func, train_data, test_data, should_parse=True):
    """Train on one split, evaluate on another, and report the scores.

    Features for `train_data` are built with `phi` (fitting a new
    vectorizer), `train_func(X, y)` produces the model, and `test_data` is
    featurized with the same vectorizer. The number of features, accuracy
    and a classification report are printed.

    Returns the macro-averaged F1 score on `test_data`.
    """
    X_train, y_train, fitted_vec = build_dataset(
        train_data, phi, should_parse=should_parse)
    print(len(X_train[0]))
    model = train_func(X_train, y_train)

    X_eval, y_eval, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vec, should_parse=should_parse)
    y_pred = model.predict(X_eval)

    print('Accuracy: %0.03f' % accuracy_score(y_eval, y_pred))
    print(classification_report(y_eval, y_pred, digits=3))
    return f1_score(y_eval, y_pred, average='macro', pos_label=None)

In [17]:
# Baseline runs on dev1, one per feature function. Only the last call's
# return value (its macro F1) is echoed as the cell result; the other scores
# appear solely in the printed reports.
experiment(unigrams_phi, fit_svc, train, dev1)
experiment(stemmed_phi, fit_svc, train, dev1)
experiment(bigrams_phi, fit_svc, train, dev1)

18968
Accuracy: 0.614
             precision    recall  f1-score   support

      False      0.578     0.633     0.604       496
       True      0.652     0.598     0.624       570

avg / total      0.618     0.614     0.615      1066

13520
Accuracy: 0.618
             precision    recall  f1-score   support

      False      0.584     0.623     0.603       496
       True      0.652     0.614     0.632       570

avg / total      0.620     0.618     0.619      1066

110710
Accuracy: 0.598
             precision    recall  f1-score   support

      False      0.568     0.567     0.567       496
       True      0.623     0.625     0.624       570

avg / total      0.598     0.598     0.598      1066



0.5955589791028989

## Dependency Parsing

In [16]:
# Relative paths to the Stanford parser jar and the English models jar.
path_to_jar = 'stanford-parser-full-2018-02-27/stanford-parser.jar'
path_to_models_jar = 'stanford-english-corenlp-2018-02-27-models.jar'

# Shared dependency parser instance used by `dep_phi`.
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [17]:
def dep_phi(sentence):
    """Count normalized dependent-head word pairs in the parse of `sentence`.

    Every dependency triple produced by the module-level `dep_parser`
    becomes a key '<dependent word>-<head word>', which is then normalized
    with `process`.

    Parameters
    ----------
    sentence : str
        Raw sentence text handed to the parser.

    Returns
    -------
    collections.Counter
        Maps each normalized pair to its number of occurrences.
    """
    counts = Counter()
    for parse in dep_parser.raw_parse(sentence):
        for head, _relation, dependent in parse.triples():
            # Accumulate on the normalized key. Distinct raw keys such as
            # 'The-fox' and 'the-fox' normalize to the same feature, and
            # their counts must add up rather than overwrite each other,
            # matching how the cached training counters are merged.
            counts[process(dependent[0] + '-' + head[0])] += 1
    return counts

In [18]:
import pickle

# Load every object stored in the cache file. Objects were appended one
# after another, so keep calling pickle.load until end of file.
# NOTE: unpickling can execute arbitrary code -- only load a cache file
# that you produced yourself.
train_parsed = []
with open("train_parsed_full.txt", "rb") as f:
    # The context manager closes the file even if unpickling fails midway.
    while True:
        try:
            train_parsed.append(pickle.load(f))
        except EOFError:
            break

In [19]:
def process(k):
    """Normalize a hyphen-joined feature key.

    The key is split on '-', each piece is lower-cased and stemmed with the
    module-level `ps`, and the pieces are joined with '-' again.
    """
    return '-'.join(ps.stem(piece.lower()) for piece in k.split('-'))

In [20]:
# Normalize the keys of every cached training counter with `process`,
# adding together the counts of keys that normalize to the same string.
train_parsed_lowered = []
for parsed in train_parsed:
    merged = Counter()
    for raw_key, count in parsed.items():
        merged[process(raw_key)] += count
    train_parsed_lowered.append(merged)

In [21]:
# Number of cached training counters. `vectorize_X_dependency` indexes this
# list by example position whenever its input has exactly 11970 items.
print(len(train_parsed_lowered))

11970


In [22]:
# Stemmed unigrams plus dependency features: train on `train`, evaluate on `test`.
experiment(stemmed_phi, fit_svc, train, test, should_parse=True)

In train!
Gets to vectorizer
Finishes vectorize_X_dependency
(11970, 160751)
160751
Not in train!
Truncating  In Season 6, Kinetic King spent two or three days setting up a Chain-Reaction Gadget ( NOT  a  Rube Goldberg Device ) for his first Hollywood performance, but when the time came to set it off it failed to do much of anything for no discernible reason.  to  In Season 6, Kinetic King spent two or three days setting up a Chain-Reaction Gadget ( NOT  a  Rube Goldberg Device ) for his first Hollywood performance, but when the time came to set it off it faile
Truncating  One of the four possible identities assigned by the company to its workers when decorating their cubicles to express their individuality, along with "Green Bay Packers Fan", "Space", and "Classic Cars".  to  One of the four possible identities assigned by the company to its workers when decorating their cubicles to express their individuality, along with "Green Bay Packers Fan", "Space", and "Classic Cars
Truncating 

Truncating  Adelle  used Victor for several "romantic engagements" and ultimately would have allowed Sierra to be permanently given to the man who had put her in the Dollhouse as his personal revenge, but eventually takes a stand against Rossum .  to  Adelle  used Victor for several "romantic engagements" and ultimately would have allowed Sierra to be permanently given to the man who had put her in the Dollhouse as his personal revenge, but eventua
Truncating  Also a  Fridge Brilliance  demonstration of why composite events are exceedingly dangerous when it happens to someone like Alpha   being able to instantly access what could potentially amount to many lifetimes of information, skills, tactics, abilities and experience and use it precisely and exactly as need and applied.  to  Also a  Fridge Brilliance  demonstration of why composite events are exceedingly dangerous when it happens to someone like Alpha   being able to instantly access what could potentially amount to many 
Truncat

Truncating  In "Getting Closer" (2x11), we discover that the memory of Caroline that Bennett showed to Echo in "The Left Hand" (2x06) took significant latitude in its interpretation Caroline did, indeed, say, "Sorry, sister, if I stay we both get nabbed.  to  In "Getting Closer" (2x11), we discover that the memory of Caroline that Bennett showed to Echo in "The Left Hand" (2x06) took significant latitude in its interpretation Caroline did, indeed, say, "So
Truncating  While not a literal example, the post-apocalyptic scenario depicted in the Epitaph episodes invokes tropes commonly associated with this you have people who have lost their minds and identities running around and attacking everything on sight, with a handful of unaffected individuals trying to survive in the violent new world order and find a cure, etc.  to  While not a literal example, the post-apocalyptic scenario depicted in the Epitaph episodes invokes tropes commonly associated with this you have people who have lost

Truncating  Miles asks him why on earth he would need three million dollars from him when there are "a couple of jabronis named Nikki and Paulo (whom Miles knows about because of his ability to "talk to the dead") buried alive right over there with eight million dollars worth of diamonds sitting right on top of them".  to  Miles asks him why on earth he would need three million dollars from him when there are "a couple of jabronis named Nikki and Paulo (whom Miles knows about because of his ability to "talk to the dead"
Truncating  One example is the pan from the season 1 finale of Jack and Locke peering into the hatch being used again in the  Series Finale  when Jack and   the Man in Black as  Locke looked into   the Heart of the Island .  to  One example is the pan from the season 1 finale of Jack and Locke peering into the hatch being used again in the  Series Finale  when Jack and   the Man in Black as  Locke looked into   the Heart of t
Truncating  There's the 815 fuselage survivo

Truncating  A  massive  handling by the remaining A-Team and Boaties, who spent the last few episodes of season 5 formulating and executing a plan that hinged on the small probability that  setting off a nuke would prevent the mysterious "Incident" that happened to the Swan station .  to  A  massive  handling by the remaining A-Team and Boaties, who spent the last few episodes of season 5 formulating and executing a plan that hinged on the small probability that  setting off a nuke wou
Truncating  In "Live Together, Die Alone," Penny is upset that Desmond never wrote to her when he was in prison, when in reality he did; Charles Widmore had been intercepting all of his letters to make Penny think that Desmond had given up on her.  to  In "Live Together, Die Alone," Penny is upset that Desmond never wrote to her when he was in prison, when in reality he did; Charles Widmore had been intercepting all of his letters to make Penny thin
Truncating  Kate (a fugitive murderer,  she had a good 

Truncating  : Rest in peace, Sun and Jin Kwon  Completly averted with the bones in the cave because  they belong to Jacob's brother and a woman he killed, who pretended to be their mother  Ultimately,  everyone  at the very end, when they all meet up at the church .  to  : Rest in peace, Sun and Jin Kwon  Completly averted with the bones in the cave because  they belong to Jacob's brother and a woman he killed, who pretended to be their mother  Ultimately,  everyone  
Truncating  By the end of the last episode,  Faraday had been killed by his own mother ,  Sayid was shot by Roger Linus; though he got better, sort of ,  Jacob had been knifed to death by Ben ,  Juliet fell down a pit on the island with everybody else and repeatedly hit an armed H-bomb with a rock,  and  Locke was  revealed  to have been dead the whole time .  to  By the end of the last episode,  Faraday had been killed by his own mother ,  Sayid was shot by Roger Linus; though he got better, sort of ,  Jacob had been kni

Truncating  In the Season 3 finale,  Charles finally gets to be a stellar King Lear, Sophie and Paul get together, and Geoffrey and Ellen get married, but Charles dies, Geoffrey resigns from the festival, Richard relapses into a soulless corporate executive, Darren gets appointed artistic director, Anna is fired, and Geoffrey tells Oliver he loves him only after Oliver has disappeared for good .  to  In the Season 3 finale,  Charles finally gets to be a stellar King Lear, Sophie and Paul get together, and Geoffrey and Ellen get married, but Charles dies, Geoffrey resigns from the festival, Richard
Truncating  Yvette, Floyd, and Marcus try to get TJ dates for the upcoming dance, but because they didn't tell the other (and because TJ fears saying "no" to their choices would hurt his family), he ended up getting three dates, and Mo's suggestion to hide it makes it hilariously worse.  to  Yvette, Floyd, and Marcus try to get TJ dates for the upcoming dance, but because they didn't tell the

Truncating  Most importantly, Jimmy owes Greg everything, without him Jimmy would be a homeless, unemployed utter failure as a human being, the quote "I've done everything short of breathing air into your lungs" comes to mind as it is entirely accurate.  to  Most importantly, Jimmy owes Greg everything, without him Jimmy would be a homeless, unemployed utter failure as a human being, the quote "I've done everything short of breathing air into your lungs" 
Finishes vectorize_X_dependency
(1477, 160751)
Accuracy: 0.640
             precision    recall  f1-score   support

      False      0.620     0.619     0.619       700
       True      0.657     0.659     0.658       777

avg / total      0.640     0.640     0.640      1477



0.6387770263064017

In [23]:
# Stemmed unigrams plus dependency features: train on `train`, evaluate on `dev1`.
# NOTE(review): the recorded output reports a (11970, 500) training matrix,
# which differs from the 160751 features of the preceding run -- presumably
# it was produced by a different version of the pipeline; confirm before
# relying on the score.
experiment(stemmed_phi, fit_svc, train, dev1, should_parse=True)

In train!
Gets to vectorizer
Finishes vectorize_X_dependency




(11970, 500)
500
Not in train!
Truncating  There was also a comic book adaptation which tended to play down the sitcom aspects in favor of giving him strange technological toys; it seemed more inspired by the  Animated Adaptation  than the original sitcom and would in fact often include stories set on Melmac, usually with the framing story of Alf telling the Tanners of life on his home planet.  to  There was also a comic book adaptation which tended to play down the sitcom aspects in favor of giving him strange technological toys; it seemed more inspired by the  Animated Adaptation  than the ori
Truncating  Jay Mariotti was a regular panelist since the show's beginning, but he  hasn't appeared  since his  August 2010 domestic violence arrest , and ESPN has made it clear they aren't really interested in bringing him back.  to  Jay Mariotti was a regular panelist since the show's beginning, but he  hasn't appeared  since his  August 2010 domestic violence arrest , and ESPN has made it cl

Truncating  They are openly hostile to each other, the former being a "Communist" who claims the director his natural enemy  because of their opposite political leanings , but actually they are always making secret agreements and plotting together against the other workers for their own personal profit.  to  They are openly hostile to each other, the former being a "Communist" who claims the director his natural enemy  because of their opposite political leanings , but actually they are always making secr
Truncating  Mostly noticeable in a sketch beginning with Luca screws up and gets tea instead of coffee; instead of simply throwing it away, he tries to give it to someone else *  (the former would be a solution, but  his strong Communist beliefs keep him from "wasting" tea, as it would be "an instance of Capitalism" )]].  to  Mostly noticeable in a sketch beginning with Luca screws up and gets tea instead of coffee; instead of simply throwing it away, he tries to give it to someone el

Truncating  When arranged as a colour spectrum, this forms a time line, so in a case of fridge brilliance, Carrie made a connection between the chronology and the chromatology, again words with a very similar morphology.  to  When arranged as a colour spectrum, this forms a time line, so in a case of fridge brilliance, Carrie made a connection between the chronology and the chromatology, again words with a very similar mor
Truncating  Brody   among other things: he sleeps on the floor to avoid assaulting his wife in his sleep, cowers in a corner of his bedroom all day after slipping into an  Angst Coma  and has  intimacy issues  due to his time as a POW.  to  Brody   among other things: he sleeps on the floor to avoid assaulting his wife in his sleep, cowers in a corner of his bedroom all day after slipping into an  Angst Coma  and has  intimacy issues  du
Truncating  Or mutant sea monsters...  Also, HHTV's war correspondent invariably signs off with "This is Mike Peabody, reporting fo

Truncating  In the episode, "You Better Shop Around" Jerry Mathers (known for  Leave It To Beaver ) guest stars, playing himself as a washed-up former child star, who has resorted to judging shopping contests in supermarkets.  to  In the episode, "You Better Shop Around" Jerry Mathers (known for  Leave It To Beaver ) guest stars, playing himself as a washed-up former child star, who has resorted to judging shopping contests in 
Truncating  It's implied in several episodes that Kelly's promiscuity stems from being starved for attention and treated as the  Butt Monkey  in her own family (the episode where Al invents shoe-lights lampshaded this with Kelly's (who is being used as the guinea pig for shoe-lights) line: "'Kelly, this meat is green.  to  It's implied in several episodes that Kelly's promiscuity stems from being starved for attention and treated as the  Butt Monkey  in her own family (the episode where Al invents shoe-lights lampshaded
Truncating  At the end of the episode, Jef

Truncating  When this plan seemed no longer viable, Ari proposed to frame Semak for killing Alex, and stealing Gogol's operating budget in the ensuing chaos; while 300 million US dollars isn't a  Mega Corp , it's still an incredible amount of money for two people.  to  When this plan seemed no longer viable, Ari proposed to frame Semak for killing Alex, and stealing Gogol's operating budget in the ensuing chaos; while 300 million US dollars isn't a  Mega Corp , it's
Truncating  All of Division's missions are named this way, but the most important is "Operation Sparrow", which is involved in the last several episodes of the first season, and is revealed in the finale to be   a ruse as part of a  Batman Gambit  to assassinate and replace the director of the CIA .  to  All of Division's missions are named this way, but the most important is "Operation Sparrow", which is involved in the last several episodes of the first season, and is revealed in the finale to be   
Truncating  The final 

Truncating  Elizabeth Berkley ("Jessie") was somewhat of a borderline case, as she was 17 when her character was introduced, pushing her to around 20-21 by the time the series wrapped (playing a high-school senior).  to  Elizabeth Berkley ("Jessie") was somewhat of a borderline case, as she was 17 when her character was introduced, pushing her to around 20-21 by the time the series wrapped (playing a high-school senio
Truncating  As a result, the character of Tori was hastily created to fill in the roles left by the absence of the two aforementioned characters, and especially in the case of Kaposwki's absence,   serve as a love interest for Zack Morris.  to  As a result, the character of Tori was hastily created to fill in the roles left by the absence of the two aforementioned characters, and especially in the case of Kaposwki's absence,   serve as a lov
Truncating  The Tori episodes were filmed after the series finale and the made-for-TV movie that were supposed to wrap up the plot o

Truncating  Dwight, who manages a one night stand with one of Pam's friends at  her and Jim's wedding  and handles it with a level of expertise that suggests experience in the field   and when Pam's friend approaches Dwight at the wedding, he's doing well while chatting with another girl .  to  Dwight, who manages a one night stand with one of Pam's friends at  her and Jim's wedding  and handles it with a level of expertise that suggests experience in the field   and when Pam's friend approa
Truncating  And on Take Your Daughter to Work Day, where he talks to the camera about how he has to make sure none of the young girls see the porn on his computer, then realizes they could be doing that right now.  to  And on Take Your Daughter to Work Day, where he talks to the camera about how he has to make sure none of the young girls see the porn on his computer, then realizes they could be doing that right now
Truncating  When Dwight becomes acting manager he forces everyone to use an antiqua

0.532682366038134

In [None]:
# Same call as the preceding dev1 experiment cell.
# NOTE(review): the recorded output reports a (11970, 1000) training matrix
# and has no final score -- presumably an interrupted run of a different
# version of the pipeline; confirm or remove.
experiment(stemmed_phi, fit_svc, train, dev1, should_parse=True)

In train!
Gets to vectorizer
Finishes vectorize_X_dependency




(11970, 1000)
1000
Not in train!
Truncating  There was also a comic book adaptation which tended to play down the sitcom aspects in favor of giving him strange technological toys; it seemed more inspired by the  Animated Adaptation  than the original sitcom and would in fact often include stories set on Melmac, usually with the framing story of Alf telling the Tanners of life on his home planet.  to  There was also a comic book adaptation which tended to play down the sitcom aspects in favor of giving him strange technological toys; it seemed more inspired by the  Animated Adaptation  than the ori
Truncating  Jay Mariotti was a regular panelist since the show's beginning, but he  hasn't appeared  since his  August 2010 domestic violence arrest , and ESPN has made it clear they aren't really interested in bringing him back.  to  Jay Mariotti was a regular panelist since the show's beginning, but he  hasn't appeared  since his  August 2010 domestic violence arrest , and ESPN has made it 

Truncating  They are openly hostile to each other, the former being a "Communist" who claims the director his natural enemy  because of their opposite political leanings , but actually they are always making secret agreements and plotting together against the other workers for their own personal profit.  to  They are openly hostile to each other, the former being a "Communist" who claims the director his natural enemy  because of their opposite political leanings , but actually they are always making secr
Truncating  Mostly noticeable in a sketch beginning with Luca screws up and gets tea instead of coffee; instead of simply throwing it away, he tries to give it to someone else *  (the former would be a solution, but  his strong Communist beliefs keep him from "wasting" tea, as it would be "an instance of Capitalism" )]].  to  Mostly noticeable in a sketch beginning with Luca screws up and gets tea instead of coffee; instead of simply throwing it away, he tries to give it to someone el

Truncating  When arranged as a colour spectrum, this forms a time line, so in a case of fridge brilliance, Carrie made a connection between the chronology and the chromatology, again words with a very similar morphology.  to  When arranged as a colour spectrum, this forms a time line, so in a case of fridge brilliance, Carrie made a connection between the chronology and the chromatology, again words with a very similar mor
Truncating  Brody   among other things: he sleeps on the floor to avoid assaulting his wife in his sleep, cowers in a corner of his bedroom all day after slipping into an  Angst Coma  and has  intimacy issues  due to his time as a POW.  to  Brody   among other things: he sleeps on the floor to avoid assaulting his wife in his sleep, cowers in a corner of his bedroom all day after slipping into an  Angst Coma  and has  intimacy issues  du
Truncating  Or mutant sea monsters...  Also, HHTV's war correspondent invariably signs off with "This is Mike Peabody, reporting fo

Truncating  In the episode, "You Better Shop Around" Jerry Mathers (known for  Leave It To Beaver ) guest stars, playing himself as a washed-up former child star, who has resorted to judging shopping contests in supermarkets.  to  In the episode, "You Better Shop Around" Jerry Mathers (known for  Leave It To Beaver ) guest stars, playing himself as a washed-up former child star, who has resorted to judging shopping contests in 
Truncating  It's implied in several episodes that Kelly's promiscuity stems from being starved for attention and treated as the  Butt Monkey  in her own family (the episode where Al invents shoe-lights lampshaded this with Kelly's (who is being used as the guinea pig for shoe-lights) line: "'Kelly, this meat is green.  to  It's implied in several episodes that Kelly's promiscuity stems from being starved for attention and treated as the  Butt Monkey  in her own family (the episode where Al invents shoe-lights lampshaded
Truncating  At the end of the episode, Jef

Truncating  When this plan seemed no longer viable, Ari proposed to frame Semak for killing Alex, and stealing Gogol's operating budget in the ensuing chaos; while 300 million US dollars isn't a  Mega Corp , it's still an incredible amount of money for two people.  to  When this plan seemed no longer viable, Ari proposed to frame Semak for killing Alex, and stealing Gogol's operating budget in the ensuing chaos; while 300 million US dollars isn't a  Mega Corp , it's
Truncating  All of Division's missions are named this way, but the most important is "Operation Sparrow", which is involved in the last several episodes of the first season, and is revealed in the finale to be   a ruse as part of a  Batman Gambit  to assassinate and replace the director of the CIA .  to  All of Division's missions are named this way, but the most important is "Operation Sparrow", which is involved in the last several episodes of the first season, and is revealed in the finale to be   
Truncating  The final 

Truncating  Elizabeth Berkley ("Jessie") was somewhat of a borderline case, as she was 17 when her character was introduced, pushing her to around 20-21 by the time the series wrapped (playing a high-school senior).  to  Elizabeth Berkley ("Jessie") was somewhat of a borderline case, as she was 17 when her character was introduced, pushing her to around 20-21 by the time the series wrapped (playing a high-school senio
Truncating  As a result, the character of Tori was hastily created to fill in the roles left by the absence of the two aforementioned characters, and especially in the case of Kaposwki's absence,   serve as a love interest for Zack Morris.  to  As a result, the character of Tori was hastily created to fill in the roles left by the absence of the two aforementioned characters, and especially in the case of Kaposwki's absence,   serve as a lov
Truncating  The Tori episodes were filmed after the series finale and the made-for-TV movie that were supposed to wrap up the plot o

Truncating  Dwight, who manages a one night stand with one of Pam's friends at  her and Jim's wedding  and handles it with a level of expertise that suggests experience in the field   and when Pam's friend approaches Dwight at the wedding, he's doing well while chatting with another girl .  to  Dwight, who manages a one night stand with one of Pam's friends at  her and Jim's wedding  and handles it with a level of expertise that suggests experience in the field   and when Pam's friend approa
Truncating  And on Take Your Daughter to Work Day, where he talks to the camera about how he has to make sure none of the young girls see the porn on his computer, then realizes they could be doing that right now.  to  And on Take Your Daughter to Work Day, where he talks to the camera about how he has to make sure none of the young girls see the porn on his computer, then realizes they could be doing that right now
Truncating  When Dwight becomes acting manager he forces everyone to use an antiqua

#### Creating train_parsed_full.txt

In [None]:
# Scratch check that a Counter of dependency-pair features survives a pickle
# round trip. The sample lives under its own name: rebinding `test` here would
# overwrite the test DataFrame loaded at the top of the notebook, which later
# cells still pass to experiment().
sample_counter = Counter({'fox-jumps': 1, 'The-fox': 1, 'quick-fox': 1, 'brown-fox': 1, 'dog-jumps': 1, 'over-dog': 1, 'the-dog': 1, 'lazy-dog': 1})
print(sample_counter)

import pickle
# Serialize to bytes and immediately deserialize; the final bare expression
# lets the notebook display the reconstructed Counter.
output = pickle.dumps(sample_counter)
pickle.loads(output)

In [23]:
# Collect every training sentence into a plain list, looked up by row label 0..n-1.
X = [train.loc[row_label, 'sentence'] for row_label in range(train.index.size)]

In [None]:
# Run dep_phi over every training sentence and append each result to
# train_parsed.txt as its own pickle record (one pickle.dump per sentence).
# The context manager closes the file even when dep_phi raises part-way
# through, so the records written up to that point are not left in an
# unflushed, unclosed handle.
with open("train_parsed.txt", "wb") as file:
    for example in X:
        # pickle.dump returns None, so its result is not bound to a name.
        pickle.dump(dep_phi(example), file)

In [25]:
# Open the pickled records for reading; the handle `f` is shared with the
# following cells and is closed explicitly further down.
f = open("train_parsed.txt", "rb")

In [72]:
# Peek at the next pickled record on the shared handle `f`, then rewind to
# the start of the file. Without the rewind, a top-to-bottom run would leave
# the handle positioned after the first record and the read-all loop in the
# next cell would silently miss it.
first_record = pickle.load(f)
f.seek(0)
# Bare expression so the notebook displays the record.
first_record

Counter({'Because-Die': 1,
         'Die-became': 1,
         'EVEN-WORSE': 1,
         'Never-Die': 1,
         'Say-Die': 1,
         'WORSE-became': 1,
         'a-scene': 1,
         'arguably-WORSE': 1,
         'horrifying-scene': 1,
         'of-Die': 1,
         'scene-became': 1})

In [26]:
# Drain the open handle `f`: unpickle one record at a time and collect them
# in `objs` until the stream runs out.
objs = []
while True:
    try:
        o = pickle.load(f)
    except EOFError:
        # pickle.load raises EOFError when no further record is available.
        break
    else:
        objs.append(o)

In [38]:
# Sanity check on the recovered records: the record count, the record at
# index 10505, and the raw sentence at index 10506.
# NOTE(review): per the recorded output len(objs) is 10506, so X[10506] is
# presumably the first sentence without a parsed record — confirm.
print(len(objs))
print(objs[10505])
print(X[10506])

10506
Counter({'CMOA-suspected': 1, 'Doubles-CMOA': 1, 'as-CMOA': 1, 'a-CMOA': 1, 'Craig-CMOA': 1, 'for-Craig': 1, 'ruining-suspected': 1, 'as-ruining': 1, 'despite-ruining': 1, 'single-handedly-ruining': 1, 'mission-ruining': 1, 'that-mission': 1, 'person-suspected': 1, 'not-person': 1, 'one-person': 1, 'him-suspected': 1, 'Mole-suspected': 1, 'of-Mole': 1, 'being-Mole': 1, 'The-Mole': 1})
When the first, the extremely weird  33 1/3 Revolutions per Monkee , bombed in the ratings, plans for the other two specials were cancelled.


In [89]:
# Spot-check dep_phi on a single hand-picked sentence and print its feature Counter.
print(dep_phi(" Michelotto  is utterly matter-of-fact when he informs Cesare that he  murdered his own father , and Cesare's reaction is equally unfazed."))

Counter({'Michelotto-matter-of-fact': 1, 'is-matter-of-fact': 1, 'utterly-matter-of-fact': 1, 'informs-matter-of-fact': 1, 'when-informs': 1, 'he-informs': 1, 'Cesare-informs': 1, 'murdered-informs': 1, 'that-murdered': 1, 'he-murdered': 1, 'father-murdered': 1, 'his-father': 1, 'own-father': 1, 'and-matter-of-fact': 1, 'unfazed-matter-of-fact': 1, 'reaction-unfazed': 1, 'Cesare-reaction': 1, "'s-Cesare": 1, 'is-unfazed': 1, 'equally-unfazed': 1})


In [79]:
# Show the first record that was read back from train_parsed.txt.
print(objs[0])

Counter({'Die-became': 1, 'Because-Die': 1, 'of-Die': 1, 'Never-Die': 1, 'Say-Die': 1, 'scene-became': 1, 'a-scene': 1, 'horrifying-scene': 1, 'WORSE-became': 1, 'arguably-WORSE': 1, 'EVEN-WORSE': 1})


In [90]:
# Release the read handle `f` once the records have been loaded.
f.close()

In [None]:
# Parse the training sentences from index 9910 onward and store each result
# as its own pickle record in train_parsed2.txt.
# NOTE(review): 9910 is presumably where an earlier parsing run stopped — confirm.
# The context manager guarantees the file is closed even if dep_phi raises
# midway through the slice.
with open("train_parsed2.txt", "wb") as file:
    for example in X[9910:]:
        # pickle.dump returns None, so its result is not bound to a name.
        pickle.dump(dep_phi(example), file)

In [29]:
# Rebind the shared handle `f` to the second batch of pickled records.
f = open("train_parsed2.txt", "rb")

In [30]:
# Read every pickled record available on the open handle `f` into `objs2`.
objs2 = []
while True:
    try:
        o = pickle.load(f)
    except EOFError:
        # End of the pickle stream: nothing more to read.
        break
    else:
        objs2.append(o)

In [31]:
# Close the train_parsed2.txt read handle now that objs2 is populated.
f.close()

In [32]:
# Number of records recovered from train_parsed2.txt.
print(len(objs2))

596


In [111]:
# Concatenate the first two batches of parsed records and inspect the tail,
# alongside the raw sentence at index 10506.
# NOTE(review): the recorded output reports len(final) == 10506, which is not
# the sum of the other recorded lengths (10506 for objs, 596 for objs2); the
# output looks stale relative to the current `objs` — confirm. `final` is
# rebound by a later cell.
final = objs+objs2
print(len(final))
print(final[-1])
print(X[10506])

10506
Counter({'CMOA-suspected': 1, 'Doubles-CMOA': 1, 'as-CMOA': 1, 'a-CMOA': 1, 'Craig-CMOA': 1, 'for-Craig': 1, 'ruining-suspected': 1, 'as-ruining': 1, 'despite-ruining': 1, 'single-handedly-ruining': 1, 'mission-ruining': 1, 'that-mission': 1, 'person-suspected': 1, 'not-person': 1, 'one-person': 1, 'him-suspected': 1, 'Mole-suspected': 1, 'of-Mole': 1, 'being-Mole': 1, 'The-Mole': 1})
When the first, the extremely weird  33 1/3 Revolutions per Monkee , bombed in the ratings, plans for the other two specials were cancelled.


In [108]:
# Re-check the size of the second batch and look at its last record.
print(len(objs2))
print(objs2[-1])

596
Counter({'CMOA-suspected': 1, 'Doubles-CMOA': 1, 'as-CMOA': 1, 'a-CMOA': 1, 'Craig-CMOA': 1, 'for-Craig': 1, 'ruining-suspected': 1, 'as-ruining': 1, 'despite-ruining': 1, 'single-handedly-ruining': 1, 'mission-ruining': 1, 'that-mission': 1, 'person-suspected': 1, 'not-person': 1, 'one-person': 1, 'him-suspected': 1, 'Mole-suspected': 1, 'of-Mole': 1, 'being-Mole': 1, 'The-Mole': 1})


In [41]:
# Re-parse the sentence printed from X[10506], with "33 1/3" rewritten by hand as "33-1/3".
# NOTE(review): presumably the space-separated fraction is what the parser
# could not handle in the original text — confirm.
print(dep_phi("When the first, the extremely weird 33-1/3 Revolutions per Monkee , bombed in the ratings, plans for the other two specials were cancelled."))

Counter({'bombed-cancelled': 1, 'When-bombed': 1, 'first-bombed': 1, 'the-first': 1, 'Revolutions-first': 1, 'the-Revolutions': 1, 'weird-Revolutions': 1, 'extremely-weird': 1, '33-1/3-Revolutions': 1, 'Monkee-Revolutions': 1, 'per-Monkee': 1, 'ratings-bombed': 1, 'in-ratings': 1, 'the-ratings': 1, 'plans-cancelled': 1, 'specials-plans': 1, 'for-specials': 1, 'the-specials': 1, 'other-specials': 1, 'two-specials': 1, 'were-cancelled': 1})


In [None]:
# Write the third batch of parsed records to train_parsed3.txt.
# The first record is the hand-edited form of the sentence at X[10506]
# ("33 1/3" rewritten as "33-1/3"); the loop then continues with the
# unmodified sentences from index 10507 onward.
# The context manager closes the file even if dep_phi raises on a later
# sentence, so the records written so far are kept.
with open("train_parsed3.txt", "wb") as file:
    pickle.dump(dep_phi("When the first, the extremely weird 33-1/3 Revolutions per Monkee , bombed in the ratings, plans for the other two specials were cancelled."), file)
    for example in X[10507:]:
        # Echo each sentence before parsing so a failure can be traced to its input.
        print(example)
        # pickle.dump returns None, so its result is not bound to a name.
        pickle.dump(dep_phi(example), file)

In [43]:
# Make sure the write handle `file` is closed before the file is reopened
# for reading; calling close() on an already-closed file is a no-op.
file.close()

In [44]:
# Rebind the shared handle `f` to the third batch of pickled records.
f = open("train_parsed3.txt", "rb")

In [45]:
# Unpickle records from the open handle `f` into `objs3` until the stream is exhausted.
objs3 = []
while True:
    try:
        o = pickle.load(f)
    except EOFError:
        # No further record on the stream.
        break
    else:
        objs3.append(o)

In [54]:
# Close the train_parsed3.txt read handle.
f.close()

In [46]:
# Size of the third batch and its last record.
print(len(objs3))
print(objs3[-1])

1260
Counter({'Where-go': 1, 'do-go': 1, 'I-go': 1})


In [48]:
# Rebind `final` to the records in `objs` followed by those in `objs3`;
# any earlier value of `final` is discarded.
final = objs + objs3

In [55]:
# Check how far the combined records reach, then show the last record and
# the raw sentence at index 11766.
# NOTE(review): per the recorded output len(final) is 11766, so X[11766] is
# presumably the next sentence still lacking a parsed record — confirm.
print(len(final))
print(final[-1])
print(X[11766])

11766
Counter({'Where-go': 1, 'do-go': 1, 'I-go': 1})
After the gang has learned that  Cloudcuckoolander  Lowell's family possesses a huge family trust which all Mathers get a huge payout from upon turning 31 1/2 years old: Antonio:  God, if only I'd been born a Mather!


In [None]:
# Write the fourth batch of parsed records to train_parsed4.txt.
# The first record is the hand-edited form of the sentence at X[11766]
# ("31 1/2" rewritten as "31-1/2"); the loop then continues with the
# unmodified sentences from index 11767 onward.
# The context manager closes the file even if dep_phi raises on a later
# sentence, so the records written so far are kept.
with open("train_parsed4.txt", "wb") as file:
    pickle.dump(dep_phi("After the gang has learned that  Cloudcuckoolander  Lowell's family possesses a huge family trust which all Mathers get a huge payout from upon turning 31-1/2 years old: Antonio:  God, if only I'd been born a Mather!"), file)
    for example in X[11767:]:
        # Echo each sentence before parsing so a failure can be traced to its input.
        print(example)
        # pickle.dump returns None, so its result is not bound to a name.
        pickle.dump(dep_phi(example), file)

In [57]:
# Load every pickled record from train_parsed4.txt into `objs4`.
# The context manager closes the handle on every exit path, including an
# exception other than EOFError raised while unpickling.
objs4 = []
with open("train_parsed4.txt", "rb") as f:
    while True:
        try:
            o = pickle.load(f)
        except EOFError:
            # pickle.load raises EOFError once the stream is exhausted.
            break
        objs4.append(o)

In [58]:
# Number of records recovered from train_parsed4.txt.
print(len(objs4))

204


In [59]:
# Assemble the full, ordered list of parsed records directly from its three
# sources. Building it in one expression (instead of `final = final + objs4`)
# keeps the cell idempotent: re-running it cannot append objs4 a second time.
final = objs + objs3 + objs4
# The two counts should match: one parsed record per training sentence.
print(len(final))
print(len(X))
# NOTE(review): no cell in this part of the notebook writes `final` to
# train_parsed_full.txt, which is read next — confirm where that file is produced.

11970
11970


In [62]:
# Load every pickled record from train_parsed_full.txt into `train_parsed`.
# The context manager closes the handle on every exit path, including an
# exception other than EOFError raised while unpickling.
train_parsed = []
with open("train_parsed_full.txt", "rb") as f:
    while True:
        try:
            o = pickle.load(f)
        except EOFError:
            # pickle.load raises EOFError once the stream is exhausted.
            break
        train_parsed.append(o)

In [63]:
# Number of records loaded from train_parsed_full.txt.
print(len(train_parsed))

11970


In [67]:
# Spot-check alignment: the parsed record and the raw sentence at the same index.
print(train_parsed[11003])
print(X[11003])

Counter({'Marshals-burst': 1, 'claiming-burst': 1, 'is-claiming': 1, 'that-is': 1, 'fugitive-is': 1, 'a-fugitive': 1, 'there-is': 1, 'they-burst': 1, 'so-they': 1, 'in-burst': 1, 'arrest-burst': 1, 'to-arrest': 1, 'him-arrest': 1, 'and-arrest': 1, 'cause-arrest': 1, 'shootout-cause': 1, 'a-shootout': 1})
Marshals, claiming that a fugitive is there, so they burst in to arrest him and cause a shootout.


In [66]:
# Evaluate the dependency-pair features on dev1.
# NOTE(review): this call passes `should_parse`, which the `experiment`
# defined later in the notebook does not accept; it presumably relies on an
# earlier definition — confirm.
experiment(dep_phi, fit_svc, train, dev1, should_parse=False)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Parsing file: /var/folders/tk/sdl353716c17z4dyf_77pmd00000gp/T/tmp_u83c9q8
Parsing [sent. 1 len. 167]: Well ... Peter is in love with Olivia who liked John at first but he died and she liked Peter back but she does n't remember him as of season 4 and now has a thing for Lincoln who is quite smitten back but is friends with Peter who seems to ship them as well for some reason but Olivia remembered Peter again and now Lincoln is broken hearted and in the alternate time line Peter thought that Fauxlivia was Olivia and spent seven episodes enamoured with her but she was dating Frank but Alternate Lincoln liked her too and seemed to be fond of our Olivia as well when Walternate mind raped her into thinking she was Fauxlivia but in the amber timeline Fauxlivia has broken up with Frank 

0.6141201960551175

## Latent Dirichlet Allocation

In [22]:
from sklearn.decomposition import LatentDirichletAllocation

In [23]:
def build_dataset(data, phi, vectorizer=None, lda=None, n_components=500, random_state=None):
    """Featurize `data` with `phi` and project the features through LDA.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'sentence' column (inputs) and a 'spoiler' column (labels).
    phi : callable
        Feature function handed to `vectorize_X` together with the sentences.
    vectorizer : optional
        Passed straight through to `vectorize_X`.
        NOTE(review): presumably None means "fit a new vectorizer" — confirm
        against `vectorize_X`.
    lda : LatentDirichletAllocation or None
        If None, a new model is created and fit on this data; otherwise the
        given model is only used to transform the features.
    n_components : int
        Number of LDA components for a newly created model. Ignored when
        `lda` is supplied. Defaults to 500.
    random_state : int, RandomState instance or None
        Seed forwarded to a newly created LDA model; pass an int for
        repeatable runs. Ignored when `lda` is supplied.

    Returns
    -------
    tuple
        (normalized feature matrix, list of labels, vectorizer, lda model).
    """
    # Read whole columns instead of data.loc[i, ...] for i in range(len(data)):
    # integer labels 0..n-1 only exist when the frame has a default RangeIndex.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer)
    if lda is None:
        # No fitted model supplied: fit a new one on these features.
        lda = LatentDirichletAllocation(n_components=n_components,
                                        random_state=random_state)
        feat_matrix = lda.fit_transform(feat_matrix)
    else:
        # Reuse the supplied model so the output lives in the same component space.
        feat_matrix = lda.transform(feat_matrix)
    return (normalize(feat_matrix), y, vectorizer, lda)

In [24]:
def experiment(phi, train_func, train_data, test_data):
    """Train on `train_data`, evaluate on `test_data`, and report the scores.

    Both datasets are featurized through `build_dataset`; the vectorizer and
    LDA model returned for the training data are passed back in when the
    evaluation data is built. The function prints the length of the first
    training feature row, the accuracy, and a classification report.

    Parameters
    ----------
    phi : callable
        Feature function forwarded to `build_dataset`.
    train_func : callable
        Called as `train_func(X, y)`; must return an object with `predict`.
    train_data, test_data
        Datasets forwarded to `build_dataset`.

    Returns
    -------
    The macro-averaged F1 score on `test_data`.
    """
    train_X, train_y, fitted_vectorizer, fitted_lda = build_dataset(train_data, phi)
    print(len(train_X[0]))
    model = train_func(train_X, train_y)
    # The vectorizer/LDA returned by the second call are not needed afterwards.
    test_X, gold, _, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, lda=fitted_lda)
    guesses = model.predict(test_X)
    print('Accuracy: %0.03f' % accuracy_score(gold, guesses))
    print(classification_report(gold, guesses, digits=3))
    macro_f1 = f1_score(gold, guesses, average = 'macro', pos_label=None)
    return macro_f1

In [25]:
# LDA-projected stemmed features, trained on `train` and scored on `test`.
experiment(stemmed_phi, fit_svc, train, test)



500
Accuracy: 0.567
             precision    recall  f1-score   support

      False      0.555     0.437     0.489       700
       True      0.575     0.685     0.625       777

avg / total      0.565     0.567     0.561      1477



0.5569942167229223

In [26]:
# LDA-projected unigram features, trained on `train` and scored on `test`.
experiment(unigrams_phi, fit_svc, train, test)



500
Accuracy: 0.574
             precision    recall  f1-score   support

      False      0.560     0.473     0.513       700
       True      0.584     0.665     0.622       777

avg / total      0.572     0.574     0.570      1477



0.5672743397208949