# Spoilers

In [1]:
__author__ = "Kristine Guo and Caroline Ho"
__version__ = "CS224u, Stanford, Spring 2018 term"

## Contents

0. [Overview](#Overview)
0. [Set-up](#Set-up)
0. [Baseline](#Baseline)
  0. [Features](#Features)
  0. [Experiment](#Experiment)
0. [Sentiment](#Sentiment)
0. [Dependency Parsing](#Dependency-Parsing)
0. [Latent Dirichlet Allocation](#Latent-Dirichlet-Allocation)

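## Overview

This notebook trains classifiers that predict whether a sentence is a spoiler (the `spoiler` column) from its text (the `sentence` column) in the `tvtropes` data. It starts with bag-of-words baselines (unigrams, stemmed unigrams, bigrams) fed to a linear SVM, and then tries negation-marked tokens, dependency-pair features, and an LDA topic representation.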



## Set-up

* Make sure your environment meets all the requirements for [the cs224u repository](https://github.com/cgpotts/cs224u/). For help getting set-up, see [setup.ipynb](setup.ipynb).

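* Make sure you've downloaded [the data distribution for this unit](http://web.stanford.edu/class/cs224u/data/vsmdata.zip), unpacked it, and placed it in the current directory (or wherever you point `data_home` to below). The cells below read `train.balanced.csv`, `dev1.balanced.csv`, `dev2.balanced.csv`, and `test.balanced.csv` from `data_home`.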

In [2]:
from collections import Counter
import copy
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordDependencyParser
import numpy as np
import os
import pandas as pd
import PorterStemmer
import scipy.stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
import string

In [3]:
# Directory containing the balanced train/dev/test CSV files read by the cells below.
data_home = 'tvtropes'

In [4]:
# Read `dev1.balanced.csv` from `data_home` into the `dev1` DataFrame.
dev1 = pd.read_csv(os.path.join(data_home, 'dev1.balanced.csv'))

In [5]:
# Read `dev2.balanced.csv` from `data_home` into the `dev2` DataFrame.
dev2 = pd.read_csv(os.path.join(data_home, 'dev2.balanced.csv'))

In [6]:
# Read `test.balanced.csv` from `data_home` into the `test` DataFrame.
test = pd.read_csv(os.path.join(data_home, 'test.balanced.csv'))

In [7]:
# Read `train.balanced.csv` from `data_home` into the `train` DataFrame.
train = pd.read_csv(os.path.join(data_home, 'train.balanced.csv'))

In [8]:
# Quick look at the `trope` value of the test row labelled 0.
print(test.loc[0, 'trope'])

WorkCom


## Baseline

In [9]:
# Shared text-processing resources used by the tokenizer and feature functions below.
# Porter stemmer instance (used by `stemmed_phi`).
ps = PorterStemmer.PorterStemmer()
# Translation table mapping every character in `string.punctuation` to a space.
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
# NLTK's English stop words, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def parse_sentence(sentence):
    """Tokenize a raw sentence into lowercase tokens with stop words removed.

    Punctuation is replaced by spaces through the module-level ``translator``
    table, the text is split on whitespace, every token is lowercased, and
    tokens found in ``stop_words`` are discarded.

    Args:
        sentence: The raw sentence string.

    Returns:
        list of str: The surviving tokens, in their original order.
    """
    # `translator` has already turned all punctuation into spaces and
    # `split()` never yields empty strings, so the tokens need no further
    # punctuation stripping or empty-token filtering.
    lowered = (token.lower() for token in sentence.translate(translator).split())
    return [token for token in lowered if token not in stop_words]

### Features

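Each feature function takes the token list produced by `parse_sentence` and returns a `Counter` mapping features to counts: `unigrams_phi` counts the tokens themselves, `stemmed_phi` counts their Porter stems, and `bigrams_phi` counts adjacent token pairs after padding the sentence with `<S>` and `</S>` boundary markers.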

In [10]:
def unigrams_phi(s):
    """Bag-of-words features.

    Args:
        s: Iterable of tokens.

    Returns:
        Counter: Each distinct token mapped to its number of occurrences.
    """
    counts = Counter()
    counts.update(s)
    return counts

In [11]:
def stemmed_phi(s):
    """Bag-of-stems features.

    Every token is passed through the module-level stemmer ``ps`` and the
    resulting stems are counted.

    Args:
        s: Iterable of tokens.

    Returns:
        Counter: Each distinct stem mapped to its number of occurrences.
    """
    stems = map(ps.stem, s)
    return Counter(stems)

In [12]:
def bigrams_phi(s):
    """Bigram features with sentence-boundary markers.

    The token list is padded with ``'<S>'`` at the start and ``'</S>'`` at
    the end, and every adjacent pair is counted. The input list is left
    untouched.

    Args:
        s: List of tokens.

    Returns:
        Counter: ``(left, right)`` token tuples mapped to their counts.
    """
    padded = ['<S>', *s, '</S>']
    # Pair each token with its right-hand neighbour.
    return Counter(zip(padded, padded[1:]))

### Experiment

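Each experiment featurizes the training sentences with a feature function `phi`, vectorizes the feature dictionaries with a `DictVectorizer`, normalizes the resulting matrix, and fits a classifier (a `LinearSVC` here). The fitted vectorizer is reused on the evaluation set; the experiment prints the number of features, the accuracy, and a classification report, and returns the macro-averaged F1 score.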

In [12]:
def vectorize_X(X, phi, vectorizer=None, should_parse=True):
    """Turn raw examples into a feature matrix using feature function ``phi``.

    Args:
        X: Iterable of examples (raw sentences when ``should_parse`` is true).
        phi: Feature function returning a dict-like of feature -> value.
        vectorizer: An already fitted vectorizer exposing ``transform``. When
            ``None``, a new ``DictVectorizer(sparse=False)`` is created and
            fitted on the features of ``X``.
        should_parse: If true, each example is tokenized with
            ``parse_sentence`` before being handed to ``phi``; if false,
            ``phi`` receives the example unchanged.

    Returns:
        tuple: ``(feature_matrix, vectorizer)`` where ``vectorizer`` is the
        one that produced the matrix (newly fitted or the one passed in).
    """
    if should_parse:
        feat_dicts = [phi(parse_sentence(sentence)) for sentence in X]
    else:
        feat_dicts = [phi(sentence) for sentence in X]

    # Identity test on purpose: `== None` would defer to the vectorizer's own
    # `__eq__`, which is not a reliable "was an argument supplied?" check.
    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        return (vectorizer.fit_transform(feat_dicts), vectorizer)
    return (vectorizer.transform(feat_dicts), vectorizer)

In [13]:
def build_dataset(data, phi, vectorizer=None, should_parse=True):
    """Build a normalized feature matrix and label list from a DataFrame.

    Note: a later cell in this notebook redefines ``build_dataset`` with an
    LDA step; after that cell runs, this definition is shadowed.

    Args:
        data: DataFrame with a ``'sentence'`` column (inputs) and a
            ``'spoiler'`` column (labels).
        phi: Feature function forwarded to ``vectorize_X``.
        vectorizer: Fitted vectorizer to reuse, or ``None`` to fit a new one.
        should_parse: Forwarded to ``vectorize_X``.

    Returns:
        tuple: ``(X, y, vectorizer)`` where ``X`` is the feature matrix after
        ``normalize`` (called with its defaults), ``y`` is the list of
        labels, and ``vectorizer`` is the vectorizer that was used.
    """
    # Read whole columns rather than `data.loc[i, ...]` for i in range(len):
    # label-based lookup by 0..n-1 only works for a default integer index and
    # breaks on a filtered or re-indexed frame.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer=vectorizer, should_parse=should_parse)
    return (normalize(feat_matrix), y, vectorizer)

In [14]:
def fit_svc(X, y):
    """Fit a ``LinearSVC`` with default settings on ``(X, y)``.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # `fit` returns the estimator itself, so the fitted model can be returned directly.
    return LinearSVC().fit(X, y)

In [15]:
def experiment(phi, train_func, train_data, test_data, should_parse=True):
    """Train on ``train_data``, evaluate on ``test_data``, and report metrics.

    Prints the number of features, the accuracy, and a classification report.

    Args:
        phi: Feature function passed to ``build_dataset``.
        train_func: Callable ``(X, y) -> model`` whose result has ``predict``.
        train_data: DataFrame used for fitting the vectorizer and the model.
        test_data: DataFrame used for evaluation.
        should_parse: Forwarded to ``build_dataset`` for both datasets.

    Returns:
        The macro-averaged F1 score on ``test_data``.
    """
    X_train, y_train, fitted_vectorizer = build_dataset(
        train_data, phi, should_parse=should_parse)
    n_features = len(X_train[0])
    print(n_features)

    model = train_func(X_train, y_train)

    # Reuse the vectorizer fitted on the training data so both matrices share columns.
    X_eval, y_eval, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, should_parse=should_parse)
    y_pred = model.predict(X_eval)

    accuracy = accuracy_score(y_eval, y_pred)
    print('Accuracy: %0.03f' % accuracy)
    report = classification_report(y_eval, y_pred, digits=3)
    print(report)
    return f1_score(y_eval, y_pred, average = 'macro', pos_label=None)

In [17]:
# Compare the three baseline feature sets on dev1. Each call prints its own
# feature count and report; only the last call's return value (macro F1 for
# the bigram features) is displayed as the cell result.
experiment(unigrams_phi, fit_svc, train, dev1)
experiment(stemmed_phi, fit_svc, train, dev1)
experiment(bigrams_phi, fit_svc, train, dev1)

18968
Accuracy: 0.614
             precision    recall  f1-score   support

      False      0.578     0.633     0.604       496
       True      0.652     0.598     0.624       570

avg / total      0.618     0.614     0.615      1066

13520
Accuracy: 0.618
             precision    recall  f1-score   support

      False      0.584     0.623     0.603       496
       True      0.652     0.614     0.632       570

avg / total      0.620     0.618     0.619      1066

110710
Accuracy: 0.598
             precision    recall  f1-score   support

      False      0.568     0.567     0.567       496
       True      0.623     0.625     0.624       570

avg / total      0.598     0.598     0.598      1066



0.5955589791028989

## Sentiment

In [16]:
def sentiment_phi(s, window=4):
    """Unigram counts with negation marking.

    After a negation cue (a token containing ``"n't"``, or one of ``'not'``,
    ``'no'``, ``'never'``), the next ``window`` tokens are prefixed with
    ``'NOT_'`` before counting. A cue that itself falls inside a negated span
    is prefixed like any other token and also restarts the window; the check
    is made on the original token so that word cues and ``"n't"`` cues behave
    the same way. The input list is not modified.

    NOTE(review): when tokens come from ``parse_sentence``, punctuation has
    already been replaced by spaces, so the ``"n't"`` test cannot fire there,
    and any cue word present in ``stop_words`` has already been removed
    (presumably 'not' and 'no' are in NLTK's list; verify).

    Args:
        s: List of tokens.
        window: Number of tokens to mark after each negation cue.

    Returns:
        Counter: Each (possibly ``NOT_``-prefixed) token mapped to its count.
    """
    negators = {'not', 'no', 'never'}
    marked = []
    remaining = 0  # how many upcoming tokens still fall inside a negated span
    for token in s:
        if remaining > 0:
            marked.append('NOT_' + token)
            remaining -= 1
        else:
            marked.append(token)
        # Test the raw token, not the prefixed one, so 'NOT_never' still counts as a cue.
        if 'n\'t' in token or token in negators:
            remaining = window
    return Counter(marked)

In [19]:
# experiment(sentiment_phi, fit_svc, train, dev1)

## Dependency Parsing

In [20]:
# Relative paths to the Stanford parser jar and the English models jar;
# both must be present for the parser below to be created.
path_to_jar = 'stanford-parser-full-2018-02-27/stanford-parser.jar'
path_to_models_jar = 'stanford-english-corenlp-2018-02-27-models.jar'

# Shared dependency parser used by `dep_phi` and the demo cell below.
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [41]:
# Sanity check: parse one sentence and collect the triples of each returned
# parse, then print them together with the number of parses.
x = [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
print(x, len(x))

[[(('jumps', 'VBZ'), 'nsubj', ('fox', 'NN')), (('fox', 'NN'), 'det', ('The', 'DT')), (('fox', 'NN'), 'amod', ('quick', 'JJ')), (('fox', 'NN'), 'amod', ('brown', 'JJ')), (('jumps', 'VBZ'), 'nmod', ('dog', 'NN')), (('dog', 'NN'), 'case', ('over', 'IN')), (('dog', 'NN'), 'det', ('the', 'DT')), (('dog', 'NN'), 'amod', ('lazy', 'JJ'))]] 1


In [61]:
def dep_phi(sentence):
    """Dependency-pair features for a raw sentence.

    The sentence is parsed with the module-level ``dep_parser``. For every
    triple of every returned parse, one feature is built by joining the word
    of the triple's third element and the word of its first element with a
    hyphen, e.g. ``(('jumps', 'VBZ'), 'nsubj', ('fox', 'NN'))`` becomes
    ``'fox-jumps'``.

    Args:
        sentence: The raw (untokenized) sentence string.

    Returns:
        Counter: Each ``'<word>-<word>'`` feature mapped to its count.
    """
    pairs = (
        triple[2][0] + '-' + triple[0][0]
        for parse in dep_parser.raw_parse(sentence)
        for triple in parse.triples()
    )
    return Counter(pairs)

print(dep_phi("The quick brown fox jumps over the lazy dog."))

Counter({'fox-jumps': 1, 'The-fox': 1, 'quick-fox': 1, 'brown-fox': 1, 'dog-jumps': 1, 'over-dog': 1, 'the-dog': 1, 'lazy-dog': 1})


In [66]:
# `should_parse=False`: `dep_phi` hands the raw sentence string to the
# dependency parser itself, so the sentence must not be tokenized first.
experiment(dep_phi, fit_svc, train, dev1, should_parse=False)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Parsing file: /var/folders/tk/sdl353716c17z4dyf_77pmd00000gp/T/tmp_u83c9q8
Parsing [sent. 1 len. 167]: Well ... Peter is in love with Olivia who liked John at first but he died and she liked Peter back but she does n't remember him as of season 4 and now has a thing for Lincoln who is quite smitten back but is friends with Peter who seems to ship them as well for some reason but Olivia remembered Peter again and now Lincoln is broken hearted and in the alternate time line Peter thought that Fauxlivia was Olivia and spent seven episodes enamoured with her but she was dating Frank but Alternate Lincoln liked her too and seemed to be fond of our Olivia as well when Walternate mind raped her into thinking she was Fauxlivia but in the amber timeline Fauxlivia has broken up with Frank 

0.6141201960551175

## Latent Dirichlet Allocation

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

In [41]:
def build_dataset(data, phi, vectorizer=None, lda=None, should_parse=True,
                  n_components=50, random_state=None):
    """Build an LDA topic representation and label list from a DataFrame.

    This redefinition shadows the earlier ``build_dataset`` in this notebook
    and inserts a Latent Dirichlet Allocation step between vectorization and
    normalization.

    Args:
        data: DataFrame with a ``'sentence'`` column (inputs) and a
            ``'spoiler'`` column (labels).
        phi: Feature function forwarded to ``vectorize_X``.
        vectorizer: Fitted vectorizer to reuse, or ``None`` to fit a new one.
        lda: Fitted ``LatentDirichletAllocation`` to reuse, or ``None`` to
            fit a new one on this data.
        should_parse: Forwarded to ``vectorize_X`` (defaults to the behaviour
            of calling ``vectorize_X`` without it).
        n_components: Number of topics for a newly created LDA model.
        random_state: Seed for a newly created LDA model; ``None`` keeps the
            library's default (unseeded) behaviour.

    Returns:
        tuple: ``(X, y, vectorizer, lda)`` where ``X`` is the topic matrix
        after ``normalize`` (called with its defaults), ``y`` is the list of
        labels, and ``vectorizer`` / ``lda`` are the fitted transformers used.
    """
    # Read whole columns rather than `data.loc[i, ...]` for i in range(len):
    # label-based lookup by 0..n-1 only works for a default integer index.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(
        X, phi, vectorizer=vectorizer, should_parse=should_parse)
    # Identity test: "no model supplied" should not depend on the model's `__eq__`.
    if lda is None:
        lda = LatentDirichletAllocation(
            n_components=n_components, random_state=random_state)
        feat_matrix = lda.fit_transform(feat_matrix)
    else:
        feat_matrix = lda.transform(feat_matrix)
    return (normalize(feat_matrix), y, vectorizer, lda)

In [42]:
def experiment(phi, train_func, train_data, test_data):
    """Train and evaluate a model on LDA topic features.

    This redefinition shadows the earlier ``experiment`` in this notebook.
    It prints the number of features, the accuracy, and a classification
    report.

    Args:
        phi: Feature function passed to ``build_dataset``.
        train_func: Callable ``(X, y) -> model`` whose result has ``predict``.
        train_data: DataFrame used to fit the vectorizer, LDA model and classifier.
        test_data: DataFrame used for evaluation.

    Returns:
        The macro-averaged F1 score on ``test_data``.
    """
    X_train, y_train, fitted_vectorizer, fitted_lda = build_dataset(train_data, phi)
    n_features = len(X_train[0])
    print(n_features)

    model = train_func(X_train, y_train)

    # Reuse the transformers fitted on the training data for the evaluation set.
    X_eval, y_eval, _, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, lda=fitted_lda)
    y_pred = model.predict(X_eval)

    accuracy = accuracy_score(y_eval, y_pred)
    print('Accuracy: %0.03f' % accuracy)
    report = classification_report(y_eval, y_pred, digits=3)
    print(report)
    return f1_score(y_eval, y_pred, average = 'macro', pos_label=None)

In [43]:
# Runs the LDA variant: at this point `experiment` and `build_dataset` refer
# to the redefinitions in this section, not the baseline versions above.
experiment(unigrams_phi, fit_svc, train, dev1)



50
Accuracy: 0.541
             precision    recall  f1-score   support

      False      0.510     0.367     0.427       496
       True      0.557     0.693     0.618       570

avg / total      0.535     0.541     0.529      1066



0.5221996229102638