# Spoilers

In [1]:
# Notebook metadata: authors and the course/term label for this notebook.
__author__ = "Kristine Guo and Caroline Ho"
__version__ = "CS224u, Stanford, Spring 2018 term"

## Contents

0. [Overview](#Overview)
0. [Set-up](#Set-up)
0. [Baseline](#Baseline)
  0. [Features](#Features)
  0. [Experiment](#Experiment)
0. [Sentiment](#Sentiment)
0. [Dependency Parsing](#Dependency-Parsing)

## Overview



## Set-up

* Download [the TV Tropes dataset](http://cs.colorado.edu/~jbg/downloads/spoilers.tar.gz), unpack it, and place it in the current directory (or wherever you point `data_home` to below).

* Download [PorterStemmer](https://tartarus.org/martin/PorterStemmer/python.txt) and save it as `PorterStemmer.py` in the current directory (the notebook loads it with `import PorterStemmer`).

* Download [the Stanford parser](https://nlp.stanford.edu/software/stanford-parser-full-2018-02-27.zip), unpack it, and place it in the current directory.

* Download [English models for the Stanford parser](https://nlp.stanford.edu/software/stanford-english-corenlp-2018-02-27-models.jar) and place the `.jar` file itself in the current directory; the notebook loads it directly by its file name, so it does not need to be unpacked.

In [2]:
from collections import Counter
import copy
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordDependencyParser
import numpy as np
import os
import pandas as pd
import pickle
import PorterStemmer
import scipy.stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
import string
import time

In [3]:
# Directory containing the TV Tropes CSV splits read by the cells below;
# change this if the dataset was unpacked somewhere else.
data_home = 'tvtropes'

In [4]:
# First development split (balanced), read from data_home.
dev1 = pd.read_csv(os.path.join(data_home, 'dev1.balanced.csv'))

In [5]:
# Second development split (balanced), read from data_home.
dev2 = pd.read_csv(os.path.join(data_home, 'dev2.balanced.csv'))

In [6]:
# Test split (balanced), read from data_home.
test = pd.read_csv(os.path.join(data_home, 'test.balanced.csv'))

In [7]:
# Training split (balanced), read from data_home.
train = pd.read_csv(os.path.join(data_home, 'train.balanced.csv'))

In [None]:
# Quick look at the data: show the 'trope' value of the test row labelled 0.
print(test.loc[0, 'trope'])

## Baseline

In [8]:
# Shared preprocessing resources used by parse_sentence and stemmed_phi.
# Stemmer instance from the locally saved PorterStemmer module.
ps = PorterStemmer.PorterStemmer()
# Translation table mapping every ASCII punctuation character to a space.
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
# English stop words as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded — confirm.
stop_words = set(stopwords.words('english'))

def parse_sentence(sentence):
    """Tokenize a raw sentence into lower-cased, stop-word-free tokens.

    Punctuation is replaced with spaces via the module-level `translator`,
    the result is split on whitespace, and each token is stripped of
    punctuation and lower-cased. Empty tokens and tokens found in the
    module-level `stop_words` set are dropped.

    Parameters
    ----------
    sentence : str
        The raw sentence text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    cleaned = (
        raw.strip(string.punctuation).lower()
        for raw in sentence.translate(translator).split()
    )
    return [token for token in cleaned if token and token not in stop_words]

### Features

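Each baseline feature function takes the token list produced by `parse_sentence` and returns a `Counter` of features: raw unigram counts (`unigrams_phi`), Porter-stemmed unigram counts (`stemmed_phi`), and bigram counts with `<S>` / `</S>` sentence-boundary markers (`bigrams_phi`).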

In [9]:
def unigrams_phi(s):
    """Unigram features: count how often each token occurs.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Maps each token to its number of occurrences in `s`.
    """
    counts = Counter()
    counts.update(s)
    return counts

In [10]:
def stemmed_phi(s):
    """Stemmed unigram features: count tokens after stemming.

    Each token is passed through the module-level stemmer `ps` before
    counting, so tokens sharing a stem share a feature.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Maps each stem to its number of occurrences.
    """
    stems = [ps.stem(token) for token in s]
    counts = Counter()
    counts.update(stems)
    return counts

In [11]:
def bigrams_phi(s):
    """Bigram features: count adjacent token pairs, with boundary markers.

    The token list is padded with '<S>' at the start and '</S>' at the end,
    so an empty sentence still yields the single pair ('<S>', '</S>').
    The input list is not modified.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Maps each (left, right) token tuple to its number of occurrences.
    """
    padded = ['<S>', *s, '</S>']
    return Counter(zip(padded, padded[1:]))

### Experiment

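An experiment featurizes the training sentences with a feature function, fits a `DictVectorizer` on the resulting feature dictionaries, normalizes each row of the feature matrix, and trains a classifier (here a `LinearSVC`). The evaluation split is transformed with the same fitted vectorizer, and the experiment prints accuracy and a classification report and returns the macro-averaged F1 score.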

In [12]:
def vectorize_X(X, phi, vectorizer=None, should_parse=True):
    """Convert raw sentences into a dense feature matrix.

    Parameters
    ----------
    X : list
        Raw sentences.
    phi : callable
        Feature function. With `should_parse=True` it is called once per
        sentence on the token list from `parse_sentence` and must return a
        feature dict. With `should_parse=False` it is called exactly once on
        the whole of `X` (batch mode).
    vectorizer : DictVectorizer or None
        An already-fitted vectorizer to reuse (evaluation data), or None to
        fit a new one (training data).
    should_parse : bool
        Selects per-sentence mode (True) or batch mode (False).

    Returns
    -------
    tuple
        `(feature_matrix, vectorizer)`.

    Notes
    -----
    In batch mode, a `phi` that returns a list has that list used directly
    as the feature dicts. Any other return value is ignored and the feature
    dicts are read back from "train_parsed.txt", which `phi` is expected to
    have just written as a sequence of pickled objects.
    """
    if should_parse:
        feat_dicts = [phi(parse_sentence(sentence)) for sentence in X]
    else:
        batch_result = phi(X)
        if isinstance(batch_result, list):
            # Batch feature functions may hand their features back directly.
            feat_dicts = batch_result
        else:
            feat_dicts = []
            # SECURITY: pickle.load can execute arbitrary code; only read a
            # cache file that this notebook wrote itself.
            # The `with` block closes the file even if unpickling fails.
            with open("train_parsed.txt", "rb") as file:
                while True:
                    try:
                        feat_dicts.append(pickle.load(file))
                    except EOFError:
                        # End of the stream of pickled feature dicts.
                        break
    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        return (vectorizer.fit_transform(feat_dicts), vectorizer)
    return (vectorizer.transform(feat_dicts), vectorizer)

In [13]:
def build_dataset(data, phi, vectorizer=None, should_parse=True):
    """Build a normalized feature matrix and label list from a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentence' column (inputs) and a 'spoiler' column
        (labels).
    phi : callable
        Feature function forwarded to `vectorize_X`.
    vectorizer : DictVectorizer or None
        Fitted vectorizer to reuse, or None to fit a new one.
    should_parse : bool
        Forwarded to `vectorize_X`.

    Returns
    -------
    tuple
        `(normalized_feature_matrix, labels, vectorizer)`.
    """
    # Read the columns positionally instead of looking up labels 0..n-1 with
    # .loc, so frames with a non-default index (filtered, shuffled, ...) work.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer=vectorizer, should_parse=should_parse)
    return (normalize(feat_matrix), y, vectorizer)

In [14]:
def fit_svc(X, y):
    """Train a `LinearSVC` with default settings.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of `X`.

    Returns
    -------
    LinearSVC
        The fitted classifier.
    """
    classifier = LinearSVC()
    classifier.fit(X, y)
    return classifier

In [15]:
def experiment(phi, train_func, train_data, test_data, should_parse=True):
    """Train on one split, evaluate on another, and report the results.

    Prints the length of the first training feature row, the accuracy, and
    a classification report for the evaluation split.

    Parameters
    ----------
    phi : callable
        Feature function forwarded to `build_dataset`.
    train_func : callable
        Called as `train_func(X, y)`; must return an object with `predict`.
    train_data, test_data : pandas.DataFrame
        Training and evaluation splits.
    should_parse : bool
        Forwarded to `build_dataset`.

    Returns
    -------
    float
        Macro-averaged F1 score on `test_data`.
    """
    train_X, train_y, fitted_vectorizer = build_dataset(
        train_data, phi, should_parse=should_parse)
    print(len(train_X[0]))
    model = train_func(train_X, train_y)
    # Reuse the vectorizer fitted on the training data for the evaluation data.
    test_X, test_y, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, should_parse=should_parse)
    predicted = model.predict(test_X)
    accuracy = accuracy_score(test_y, predicted)
    print('Accuracy: %0.03f' % accuracy)
    print(classification_report(test_y, predicted, digits=3))
    return f1_score(test_y, predicted, average='macro', pos_label=None)

In [None]:
# Baseline runs: train on `train`, evaluate on `dev1`, one run per feature function.
# Each call prints its own report; only the last call's return value is the cell output.
experiment(unigrams_phi, fit_svc, train, dev1)
experiment(stemmed_phi, fit_svc, train, dev1)
experiment(bigrams_phi, fit_svc, train, dev1)

## Sentiment

In [None]:
def sentiment_phi(s):
    """Unigram features with simple negation marking.

    After a negation cue (a token containing "n't", or exactly 'not', 'no'
    or 'never'), the following four tokens are prefixed with 'NOT_'. The cue
    check runs on the token after any prefixing, so a prefixed token
    restarts the window only if it contains "n't". The input list is not
    modified.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Counts of the (possibly prefixed) tokens.
    """
    marked = list(s)
    # 0 means no active negation; 1..4 counts tokens seen since the cue.
    window = 0
    for idx, token in enumerate(marked):
        if window > 4:
            window = 0
        if window > 0:
            window += 1
            token = 'NOT_' + token
            marked[idx] = token
        if 'n\'t' in token or token in ['not', 'no', 'never']:
            window = 1
    return Counter(marked)

In [None]:
# experiment(sentiment_phi, fit_svc, train, dev1)

## Dependency Parsing

In [16]:
# Locations of the Stanford parser jar and the English models jar, relative to
# the current directory (see the Set-up section for the downloads).
path_to_jar = 'stanford-parser-full-2018-02-27/stanford-parser.jar'
path_to_models_jar = 'stanford-english-corenlp-2018-02-27-models.jar'

# Shared dependency parser used by the dep_* feature functions below.
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [None]:
# Sanity check of the parser output: dependency triples for one sentence (raw_parse) ...
x = [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
print(x, len(x))
# ... and for a batch of sentences (raw_parse_sents), giving one list of parses per sentence.
y = [[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(["The quick brown fox jumps over the lazy dog.", "I like rice."])]
print(y, len(y))

In [17]:
def dep_phi(sentence):
    """Dependency features for a single raw sentence.

    Each triple produced by the parser becomes the feature string
    `triple[2][0] + '-' + triple[0][0]`.
    NOTE(review): presumably triples are ((head_word, head_tag), relation,
    (dependent_word, dependent_tag)), making the feature
    "dependent-head" — confirm against the NLTK documentation.

    Parameters
    ----------
    sentence : str
        Raw sentence text passed to `dep_parser.raw_parse`.

    Returns
    -------
    collections.Counter
        Counts of the feature strings over all parses of the sentence.
    """
    features = (
        triple[2][0] + '-' + triple[0][0]
        for graph in dep_parser.raw_parse(sentence)
        for triple in graph.triples()
    )
    return Counter(features)

In [18]:
def dep_mass_phi(sentences):
    """Dependency features for a batch of raw sentences.

    Parses all sentences with one `dep_parser.raw_parse_sents` call and
    builds one Counter per sentence. Each triple contributes the feature
    string `triple[2][0] + '-' + triple[0][0]`.

    Parameters
    ----------
    sentences : list of str
        Raw sentence texts.

    Returns
    -------
    list of collections.Counter
        One feature Counter per item yielded by the parser, in order.
    """
    return [
        Counter(
            triple[2][0] + '-' + triple[0][0]
            for graph in graphs
            for triple in graph.triples()
        )
        for graphs in dep_parser.raw_parse_sents(sentences)
    ]

In [27]:
def dep_mass_pickle_phi(sentences):
    """Parse a batch of sentences and write their features to disk.

    Parses all sentences with one `dep_parser.raw_parse_sents` call and
    appends one pickled Counter per sentence to "train_parsed.txt". The file
    is overwritten on every call. Each triple contributes the feature string
    `t[2][0] + '-' + t[0][0]`.

    Parameters
    ----------
    sentences : list of str
        Raw sentence texts.

    Returns
    -------
    file object
        The already-closed handle of "train_parsed.txt", returned for
        backward compatibility. The features are read back from the file,
        not from this return value.
    """
    # The `with` block closes the file even if parsing fails part-way, so a
    # failed run does not leave an open handle behind.
    with open("train_parsed.txt", "wb") as file:
        for dep_graphs in dep_parser.raw_parse_sents(sentences):
            deps = [
                t[2][0] + '-' + t[0][0]
                for parse in dep_graphs
                for t in parse.triples()
            ]
            pickle.dump(Counter(deps), file)
    return file

In [28]:
# Dependency-feature run: with should_parse=False the feature function gets the
# whole list of sentences in one call and writes its features to train_parsed.txt.
experiment(dep_mass_pickle_phi, fit_svc, train, dev1, should_parse=False)

In [None]:
# NOTE(review): with should_parse=False, vectorize_X passes the whole list of
# sentences to phi in a single call, but dep_phi forwards its argument to
# dep_parser.raw_parse as one sentence. This pairing looks mismatched; confirm
# whether a batch feature function was intended here.
experiment(dep_phi, fit_svc, train, dev1, should_parse=False)

## Latent Dirichlet Allocation

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
def build_dataset(data, phi, vectorizer=None, lda=None, should_parse=True,
                  n_components=50, random_state=0):
    """Build an LDA-reduced, normalized feature matrix and label list.

    NOTE: this redefines the earlier `build_dataset` and returns a 4-tuple
    rather than a 3-tuple.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentence' column (inputs) and a 'spoiler' column
        (labels).
    phi : callable
        Feature function forwarded to `vectorize_X`.
    vectorizer : DictVectorizer or None
        Fitted vectorizer to reuse, or None to fit a new one.
    lda : LatentDirichletAllocation or None
        Fitted topic model to reuse (evaluation data), or None to fit a new
        one (training data).
    should_parse : bool
        Forwarded to `vectorize_X`.
    n_components : int
        Number of topics when a new LDA model is fitted.
    random_state : int or None
        Seed for a newly fitted LDA model, so repeated runs give the same
        topics. Pass None for an unseeded model.

    Returns
    -------
    tuple
        `(normalized_topic_matrix, labels, vectorizer, lda)`.
    """
    # Read the columns positionally instead of looking up labels 0..n-1 with
    # .loc, so frames with a non-default index (filtered, shuffled, ...) work.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer=vectorizer, should_parse=should_parse)
    if lda is None:
        lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
        feat_matrix = lda.fit_transform(feat_matrix)
    else:
        feat_matrix = lda.transform(feat_matrix)
    return (normalize(feat_matrix), y, vectorizer, lda)

In [None]:
def experiment(phi, train_func, train_data, test_data):
    """Train and evaluate a classifier on LDA topic features.

    NOTE: this redefines the earlier `experiment` and does not accept
    `should_parse`.

    Prints the length of the first training feature row, the accuracy, and
    a classification report for the evaluation split.

    Parameters
    ----------
    phi : callable
        Feature function forwarded to `build_dataset`.
    train_func : callable
        Called as `train_func(X, y)`; must return an object with `predict`.
    train_data, test_data : pandas.DataFrame
        Training and evaluation splits.

    Returns
    -------
    float
        Macro-averaged F1 score on `test_data`.
    """
    train_X, train_y, fitted_vectorizer, fitted_lda = build_dataset(train_data, phi)
    print(len(train_X[0]))
    model = train_func(train_X, train_y)
    # Reuse the vectorizer and topic model fitted on the training data.
    test_X, test_y, _, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, lda=fitted_lda)
    predicted = model.predict(test_X)
    accuracy = accuracy_score(test_y, predicted)
    print('Accuracy: %0.03f' % accuracy)
    print(classification_report(test_y, predicted, digits=3))
    return f1_score(test_y, predicted, average='macro', pos_label=None)

In [None]:
# LDA run: unigram counts reduced to topic features; uses the `experiment` redefined in this section.
experiment(unigrams_phi, fit_svc, train, dev1)