# Automated Spoiler Detection

In [1]:
__author__ = "Kristine Guo and Caroline Ho"
__version__ = "CS224u, Stanford, Spring 2018 term"

## Contents

0. [Overview](#Overview)
0. [Set-up](#Set-up)
0. [Baseline](#Baseline)
  0. [Features](#Features)
  0. [Experiment](#Experiment)
0. [Dependency Parsing](#Dependency-Parsing)
0. [Latent Dirichlet Allocation](#Latent-Dirichlet-Allocation)

## Overview

Manually tagging and avoiding spoilers online has become an increasingly difficult task due to the Internet's widespread social influence. We aim to tackle the problem of automatic spoiler detection, specifically applied to the TV Tropes dataset (Boyd-Graber et al., 2013). For our baseline, we train a linear kernel SVM that utilizes standard Bag of Words features (unigrams, stemming, bigrams). We improve upon this model by utilizing 1) dependency parsing, and 2) Latent Dirichlet Allocation (LDA). Our experiments revealed that dependency parsing combined with stemming produces the highest F1 score and that LDA negatively impacts performance.

## Set-up

* Download [the TV Tropes dataset](http://cs.colorado.edu/~jbg/downloads/spoilers.tar.gz), unpack it, and place it in the current directory (or wherever you point `data_home` to below).

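* Download `train_parsed_full.txt` (the pre-computed dependency-parse features for the training set, stored as a sequence of pickled dictionaries) and place it in the current directory.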

* Download [PorterStemmer](https://tartarus.org/martin/PorterStemmer/python.txt) and save it as `PorterStemmer.py` in the current directory, so that `import PorterStemmer` works.

* Download [the Stanford parser](https://nlp.stanford.edu/software/stanford-parser-full-2018-02-27.zip), unpack it, and place it in the current directory.

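* Download [English models for the Stanford parser](https://nlp.stanford.edu/software/stanford-english-corenlp-2018-02-27-models.jar) and place the `.jar` file itself in the current directory (the parser set-up below references `stanford-english-corenlp-2018-02-27-models.jar` by that name).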

In [2]:
from collections import Counter
import copy
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordDependencyParser
import numpy as np
import os
import pandas as pd
import pickle
import PorterStemmer
import scipy.stats
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
import string
import time

In [3]:
# Directory holding the unpacked TV Tropes CSV splits; every read_csv call below joins onto it.
data_home = 'tvtropes'

In [4]:
# Load the balanced dev1 split from `data_home`.
# NOTE(review): `dev1` is not referenced by any other cell visible in this notebook.
dev1 = pd.read_csv(
    os.path.join(data_home, 'dev1.balanced.csv'))

In [5]:
# Load the balanced dev2 split from `data_home`.
# NOTE(review): `dev2` is not referenced by any other cell visible in this notebook.
dev2 = pd.read_csv(
    os.path.join(data_home, 'dev2.balanced.csv'))

In [6]:
# Load the balanced test split; it is the evaluation set passed to
# experiment() and lda_experiment() below.
test = pd.read_csv(
    os.path.join(data_home, 'test.balanced.csv'))

In [7]:
# Load the balanced training split. Besides being the training data for every
# experiment, its length is read by vectorize_X() to recognise training input
# when should_parse=False.
train = pd.read_csv(
    os.path.join(data_home, 'train.balanced.csv'))

## Baseline

In [8]:
# Stemmer instance shared by stemmed_phi() and process().
ps = PorterStemmer.PorterStemmer()
# Translation table that maps every character of string.punctuation to a space.
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
# NLTK's English stop-word list, held as a set for fast membership tests in
# parse_sentence(). Requires the NLTK `stopwords` corpus to be available locally.
stop_words = set(stopwords.words('english'))

def parse_sentence(sentence):
    """Tokenize a raw sentence into lowercase, stop-word-free tokens.

    Every ASCII punctuation character is replaced by a space (via the
    module-level `translator`), the result is split on whitespace, each token
    is lowercased, and tokens present in `stop_words` are dropped.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # `translator` has already turned every character of string.punctuation
    # into a space and str.split() never yields empty strings, so no per-token
    # punctuation stripping or empty-token filtering is required afterwards.
    lowered = (token.lower() for token in sentence.translate(translator).split())
    return [token for token in lowered if token not in stop_words]

### Features

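Each feature function takes the token list produced by `parse_sentence` and returns a `Counter` of feature counts: `unigrams_phi` counts the tokens themselves, `stemmed_phi` counts their Porter stems, and `bigrams_phi` counts adjacent token pairs, with `<S>` and `</S>` marking the sentence boundaries.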

In [9]:
def unigrams_phi(s):
    """Bag-of-words features: map each token in `s` to its number of occurrences.

    Parameters
    ----------
    s : iterable of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Token -> count.
    """
    counts = Counter()
    counts.update(s)
    return counts

In [10]:
def stemmed_phi(s):
    """Stemmed bag-of-words features: count the stem of every token in `s`.

    Each token is passed through the module-level stemmer `ps`; tokens that
    share a stem are counted together.

    Parameters
    ----------
    s : iterable of str
        Tokens of one sentence.

    Returns
    -------
    collections.Counter
        Stem -> count.
    """
    stems = map(ps.stem, s)
    return Counter(stems)

In [11]:
def bigrams_phi(s):
    """Bigram features: count adjacent token pairs, including sentence boundaries.

    The token sequence is padded with '<S>' at the start and '</S>' at the end
    before pairing, so a sentence of n tokens yields n + 1 bigrams (an empty
    sentence yields the single bigram ('<S>', '</S>')).

    Parameters
    ----------
    s : iterable of str
        Tokens of one sentence. The input is never modified.

    Returns
    -------
    collections.Counter
        (left_token, right_token) tuple -> count.
    """
    # Building a fresh padded list avoids both a deep copy of the input and an
    # O(n) insert at position 0, and works for any iterable, not only lists.
    padded = ['<S>', *s, '</S>']
    return Counter(zip(padded, padded[1:]))

### Experiment

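Each experiment featurizes the training sentences, normalizes every feature vector (scikit-learn's `normalize`), trains a `LinearSVC`, and then reports accuracy, a per-class classification report, and the macro-averaged F1 score on the evaluation set. The first number printed for each run is the feature dimensionality.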

In [12]:
def vectorize_X(X, phi, vectorizer=None, should_parse=True):
    """Turn raw sentences into a dense feature matrix.

    Parameters
    ----------
    X : list of str
        Raw sentences.
    phi : callable
        Feature function. With ``should_parse=True`` it is called once per
        sentence on the output of ``parse_sentence`` and must return a dict of
        feature counts. With ``should_parse=False`` it is called once on the
        whole list ``X`` and must return a list of such dicts.
    vectorizer : DictVectorizer or None
        ``None`` means "training": a new ``DictVectorizer(sparse=False)`` is
        fitted on the features. Otherwise the given, already fitted vectorizer
        is only used to transform them.
    should_parse : bool
        Selects how ``phi`` is applied (see above).

    Returns
    -------
    tuple
        ``(feature_matrix, vectorizer)``.
    """
    if should_parse:
        feat_dicts = [phi(parse_sentence(sentence)) for sentence in X]
    elif len(X) == len(train):
        # NOTE(review): the training set is recognised purely by its length
        # matching the global `train`; in that case `phi` is NOT called and the
        # pre-computed global `train_parsed_lowered` is used instead. Both
        # globals must exist before calling with should_parse=False, and any
        # other input of the same length would also take this branch.
        feat_dicts = copy.deepcopy(train_parsed_lowered)
    else:
        feat_dicts = phi(X)

    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        return (vectorizer.fit_transform(feat_dicts), vectorizer)
    return (vectorizer.transform(feat_dicts), vectorizer)

In [13]:
def build_dataset(data, phi, vectorizer=None, should_parse=True):
    """Build a normalized feature matrix and label list from a data frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentence' column (text) and a 'spoiler' column (label).
    phi : callable
        Feature function forwarded to ``vectorize_X``.
    vectorizer : DictVectorizer or None
        Forwarded to ``vectorize_X``; ``None`` fits a new vectorizer.
    should_parse : bool
        Forwarded to ``vectorize_X``.

    Returns
    -------
    tuple
        ``(X, y, vectorizer)`` where ``X`` is the row-normalized feature
        matrix and ``y`` is the list of labels in row order.
    """
    # Take the columns positionally instead of looking up labels 0..n-1 with
    # .loc, so frames whose index is not the default RangeIndex (filtered,
    # shuffled, concatenated) still yield every row exactly once, in order.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer=vectorizer, should_parse=should_parse)
    return (normalize(feat_matrix), y, vectorizer)

In [14]:
def fit_svc(X, y):
    """Train a LinearSVC with default settings on (X, y) and return it.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per example.
    y : array-like
        Labels aligned with the rows of `X`.

    Returns
    -------
    LinearSVC
        The fitted classifier.
    """
    # fit() returns the estimator itself, so construction and training chain.
    return LinearSVC().fit(X, y)

In [15]:
def experiment(phi, train_func, train_data, test_data, should_parse=True):
    """Train on `train_data`, evaluate on `test_data`, and report the metrics.

    Prints the number of feature columns, the accuracy, and a per-class
    classification report.

    Parameters
    ----------
    phi : callable
        Feature function forwarded to ``build_dataset``.
    train_func : callable
        Takes ``(X, y)`` and returns a fitted model exposing ``predict``.
    train_data, test_data : pandas.DataFrame
        Frames accepted by ``build_dataset``.
    should_parse : bool
        Forwarded to ``build_dataset``.

    Returns
    -------
    float
        Macro-averaged F1 score on `test_data`.
    """
    X_train, y_train, fitted_vectorizer = build_dataset(
        train_data, phi, should_parse=should_parse)
    # Width of the first training row, i.e. the number of feature columns.
    print(len(X_train[0]))
    model = train_func(X_train, y_train)

    # Reuse the vectorizer fitted on the training data so that the test
    # matrix has the same columns.
    X_test, y_test, _ = build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, should_parse=should_parse)
    predicted = model.predict(X_test)

    print('Accuracy: %0.03f' % accuracy_score(y_test, predicted))
    print(classification_report(y_test, predicted, digits=3))
    macro_f1 = f1_score(y_test, predicted, average='macro', pos_label=None)
    return macro_f1

In [16]:
# Baseline runs: unigram, stemmed-unigram, and bigram features with a linear SVM.
# Each call prints its own feature count, accuracy, and classification report.
# Only the last expression of a cell is echoed as its result, so the macro-F1
# values returned by the first two calls are not displayed.
experiment(unigrams_phi, fit_svc, train, test)
experiment(stemmed_phi, fit_svc, train, test)
experiment(bigrams_phi, fit_svc, train, test)

18968
Accuracy: 0.629
             precision    recall  f1-score   support

      False      0.595     0.681     0.635       700
       True      0.670     0.582     0.623       777

avg / total      0.634     0.629     0.629      1477

13520
Accuracy: 0.634
             precision    recall  f1-score   support

      False      0.604     0.663     0.632       700
       True      0.667     0.609     0.637       777

avg / total      0.637     0.634     0.634      1477

110710
Accuracy: 0.588
             precision    recall  f1-score   support

      False      0.566     0.567     0.566       700
       True      0.609     0.607     0.608       777

avg / total      0.588     0.588     0.588      1477



0.5872906157624602

## Dependency Parsing

In [17]:
# Locations of the Stanford parser jar and the English models jar, relative to
# the current working directory (see the Set-up section for the downloads).
path_to_jar = 'stanford-parser-full-2018-02-27/stanford-parser.jar'
path_to_models_jar = 'stanford-english-corenlp-2018-02-27-models.jar'

# Shared parser instance used by dep_mass_phi().
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [18]:
# Create train_parsed_lowered for dependency parsing.
# Step 1: read every pickled object stored back-to-back in
# train_parsed_full.txt until the end of the file is reached.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# this file if it comes from a trusted source.
train_parsed = []
# The context manager closes the file even if unpickling fails part-way.
with open("train_parsed_full.txt", "rb") as f:
    while True:
        try:
            o = pickle.load(f)
        except EOFError:
            # Raised by pickle.load once no further object is left in the file.
            break
        train_parsed.append(o)

def process(k):
    """Normalize a hyphen-joined feature key.

    The key is split on '-', every part is lowercased and stemmed with the
    module-level stemmer `ps`, and the parts are joined with '-' again.

    Parameters
    ----------
    k : str
        Feature key such as 'word-word'.

    Returns
    -------
    str
        The normalized key.
    """
    return '-'.join(ps.stem(part.lower()) for part in k.split('-'))

# Step 2: normalize the keys of every pre-computed feature dict with process()
# and add up the counts of keys that collapse onto the same normalized key.
train_parsed_lowered = []
for raw_counts in train_parsed:
    merged = {}
    for key, count in raw_counts.items():
        normalized_key = process(key)
        merged[normalized_key] = merged.get(normalized_key, 0) + count
    train_parsed_lowered.append(merged)

In [19]:
def dep_mass_phi(sentences):
    """Dependency-pair features for a batch of raw sentences.

    Every sentence is cut to at most 200 characters and parsed in one batch
    with the module-level `dep_parser`. For each triple of each resulting
    parse, the word of the triple's third element and the word of its first
    element are joined with '-', normalized with ``process``, and counted.

    Parameters
    ----------
    sentences : list of str
        Raw (unparsed) sentences.

    Returns
    -------
    list of collections.Counter
        One Counter of normalized word-pair keys per sentence.
    """
    clipped = [text[:200] for text in sentences]
    feat_dicts = []
    for graphs in dep_parser.raw_parse_sents(clipped):
        # NOTE(review): presumably triple[0] is the (word, tag) of the governor
        # and triple[2] that of the dependent, per NLTK's DependencyGraph.triples
        # -- confirm against the NLTK version in use.
        pair_counts = Counter(
            process(triple[2][0] + '-' + triple[0][0])
            for graph in graphs
            for triple in graph.triples()
        )
        feat_dicts.append(pair_counts)
    return feat_dicts

In [20]:
# Dependency-parsing run. With should_parse=False, vectorize_X() hands the raw
# sentence list to dep_mass_phi in one batch, except for input whose length
# equals len(train), for which it uses the pre-computed train_parsed_lowered.
experiment(dep_mass_phi, fit_svc, train, test, should_parse=False)

147231
Accuracy: 0.631
             precision    recall  f1-score   support

      False      0.613     0.599     0.606       700
       True      0.646     0.660     0.653       777

avg / total      0.631     0.631     0.631      1477



0.6295081725766134

## Latent Dirichlet Allocation

In [21]:
def lda_build_dataset(data, phi, vectorizer=None, lda=None, should_parse=True,
                      n_components=500, random_state=None):
    """Build a normalized LDA topic-feature matrix and label list.

    The sentences are featurized with ``vectorize_X`` and the resulting count
    matrix is projected onto LDA topics before row normalization.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentence' column (text) and a 'spoiler' column (label).
    phi : callable
        Feature function forwarded to ``vectorize_X``.
    vectorizer : DictVectorizer or None
        Forwarded to ``vectorize_X``; ``None`` fits a new vectorizer.
    lda : LatentDirichletAllocation or None
        ``None`` means "training": a new model is fitted on the features.
        Otherwise the given, already fitted model is only used to transform.
    should_parse : bool
        Forwarded to ``vectorize_X``.
    n_components : int, default 500
        Number of topics for a newly created LDA model; ignored when `lda`
        is given.
    random_state : int, RandomState instance or None, default None
        Seed for a newly created LDA model. Pass an int for reproducible
        topics; the default keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X, y, vectorizer, lda)`` where ``X`` is the row-normalized topic
        matrix and ``y`` is the list of labels in row order.
    """
    # Take the columns positionally instead of looking up labels 0..n-1 with
    # .loc, so frames whose index is not the default RangeIndex still yield
    # every row exactly once, in order.
    X = data['sentence'].tolist()
    y = data['spoiler'].tolist()
    feat_matrix, vectorizer = vectorize_X(X, phi, vectorizer, should_parse=should_parse)
    if lda is None:
        lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
        feat_matrix = lda.fit_transform(feat_matrix)
    else:
        feat_matrix = lda.transform(feat_matrix)
    return (normalize(feat_matrix), y, vectorizer, lda)

In [22]:
def lda_experiment(phi, train_func, train_data, test_data, should_parse=True):
    """Train and evaluate a model on LDA topic features and report the metrics.

    Prints the number of feature columns, the accuracy, and a per-class
    classification report.

    Parameters
    ----------
    phi : callable
        Feature function forwarded to ``lda_build_dataset``.
    train_func : callable
        Takes ``(X, y)`` and returns a fitted model exposing ``predict``.
    train_data, test_data : pandas.DataFrame
        Frames accepted by ``lda_build_dataset``.
    should_parse : bool
        Forwarded to ``lda_build_dataset``.

    Returns
    -------
    float
        Macro-averaged F1 score on `test_data`.
    """
    X_train, y_train, fitted_vectorizer, fitted_lda = lda_build_dataset(
        train_data, phi, should_parse=should_parse)
    # Width of the first training row, i.e. the number of feature columns.
    print(len(X_train[0]))
    model = train_func(X_train, y_train)

    # Reuse the vectorizer and LDA model fitted on the training data so the
    # test matrix lives in the same topic space.
    X_test, y_test, _, _ = lda_build_dataset(
        test_data, phi, vectorizer=fitted_vectorizer, lda=fitted_lda,
        should_parse=should_parse)
    predicted = model.predict(X_test)

    print('Accuracy: %0.03f' % accuracy_score(y_test, predicted))
    print(classification_report(y_test, predicted, digits=3))
    macro_f1 = f1_score(y_test, predicted, average='macro', pos_label=None)
    return macro_f1

In [23]:
# LDA run on top of the dependency-pair features; the cell result is the
# macro-F1 score returned by lda_experiment().
lda_experiment(dep_mass_phi, fit_svc, train, test, should_parse=False)



500
Accuracy: 0.563
             precision    recall  f1-score   support

      False      0.546     0.464     0.502       700
       True      0.575     0.653     0.611       777

avg / total      0.561     0.563     0.559      1477



0.5565710375836959