In [1]:
%load_ext autoreload
%autoreload 2

import re
import nltk
import numpy as np
import pandas as pd
import os
from pronounResolution import *
from relationExtract import *
from collections import defaultdict

## Prepare Dataset

In [2]:
def loadScript(file_name):
    """Read one annotated script from ``prep_scripts/`` into a DataFrame.

    file_name -- name of a CSV file inside the ``prep_scripts`` directory

    Only the speaker, dialogue, sentences, sentiment, entities and tokens
    columns are kept. The tokens, sentiment and entities cells are stored in
    the CSV as Python literals and are evaluated back into objects; speaker
    names have surrounding whitespace removed. The sentences column is left
    as the raw string.
    """
    keep = ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
    df = pd.read_csv('prep_scripts/' + file_name)[keep]

    # NOTE(review): eval() runs whatever expression a cell contains, so this is
    # only safe for CSV files produced by a trusted pipeline.
    for col in ('tokens', 'sentiment', 'entities'):
        df[col] = df[col].apply(lambda x: eval(x))

    # speaker names may carry stray leading/trailing whitespace
    df['speaker'] = df['speaker'].apply(lambda name: name.strip())

    return df

def annotateScript(df):
    """Add pronoun counts, sentiment totals and nearby speakers to each line.

    df -- script frame with ``tokens`` (list of dicts that have a 'pos' key),
          ``sentiment`` (dict with 'score' and 'magnitude') and ``speaker``
          columns

    The frame is modified in place and also returned. Columns added:
      num_pron     -- number of tokens tagged 'PRON'
      total_sent   -- sentiment score multiplied by magnitude
      speaker_prev -- speaker of the previous line (NaN on the first line)
      speaker_next -- speaker of the next line (NaN on the last line)
      nearbyChars  -- array of five speakers: two lines back, one line back,
                      the current line, one line ahead, two lines ahead
                      (NaN where the window runs past either end)
    """
    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(
        lambda toks: sum(int(t['pos'] == 'PRON') for t in toks))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df['speaker'].shift(1)
    df['speaker_next'] = df['speaker'].shift(-1)

    # one column per offset, so row r holds the speakers around line r;
    # a real list is passed because numpy's stack helpers reject generators
    offsets = (2, 1, 0, -1, -2)
    window = np.column_stack([df['speaker'].shift(k).values for k in offsets])

    # store each row as a single cell; filling an object array by position
    # keeps this independent of the frame's index labels
    cells = np.empty(len(df), dtype=object)
    for pos, row in enumerate(window):
        cells[pos] = row
    df['nearbyChars'] = cells

    return df

View files for annotated movie scripts.

In [4]:
# get files for annotated scripts
# os.listdir() returns entries in arbitrary order and later cells select
# scripts by their position in this list, so sort to keep positions stable
files = sorted(x for x in os.listdir('prep_scripts') if x.endswith('_gapi.csv'))

print('annotated scripts:')
for i, f in enumerate(files):
    print('%d %s' % (i, f))

annotated scripts:
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


1. Load set of annotated scripts to be analyzed.
2. Enhance annotated scripts with features for speakers, sentiment, and pronouns 

In [6]:
# list of file indexes for Avengers (1, 11) and X-Men movies (15, 16, 18)
fileIndex = [1, 11, 15, 16, 18]

# script index -> {'name': file name, 'df': annotated frame,
#                  'chars': number of lines per speaker}
scripts = defaultdict(lambda: defaultdict())

for i in fileIndex:
    # read the annotated script and add the per-line features
    df = annotateScript(loadScript(files[i]))

    # lines spoken by each character; the keys double as the character list
    charMentions = dict(df.groupby('speaker').speaker.count())
    charList = charMentions.keys()

    scripts[i]['name'] = files[i]
    scripts[i]['df'] = df
    scripts[i]['chars'] = charMentions

# preview the annotated frame stored under index 11
scripts[11]['df'].head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars
0,narrator,first lines; Loki has allied with the alien ra...,[{'content': u'first lines; Loki has allied wi...,"{u'score': -0.1, u'magnitude': 0.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'ADJ', u'...",1,-0.01,,The Other,"[nan, nan, narrator, The Other, narrator]"
1,The Other,[voice over] The Tesseract has awakened. It is...,[{'content': u'[voice over] The Tesseract has ...,"{u'score': 0.1, u'magnitude': 1.6}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PUNCT', ...",11,0.16,narrator,narrator,"[nan, narrator, The Other, narrator, Nick Fury]"
2,narrator,Nick Fury and Maria Hill arrive at a remote re...,[{'content': u'Nick Fury and Maria Hill arrive...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'NOUN', u...",0,0.16,The Other,Nick Fury,"[narrator, The Other, narrator, Nick Fury, Age..."
3,Nick Fury,How bad is it?,"[{'content': u'How bad is it?', 'begin': 0, 's...","{u'score': -0.4, u'magnitude': 0.4}",[],"[{u'index': 1, u'begin': 0, u'pos': u'ADV', u'...",1,-0.16,narrator,Agent Phil Coulson,"[The Other, narrator, Nick Fury, Agent Phil Co..."
4,Agent Phil Coulson,"That's the problem, sir. We don't know.","[{'content': u""That's the problem, sir."", 'beg...","{u'score': 0, u'magnitude': 0}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",1,0.0,Nick Fury,narrator,"[narrator, Nick Fury, Agent Phil Coulson, narr..."


## Task 1: Pronoun Resolution

Apply each model for pronoun resolution to multiple scripts.  Evaluate model by manually checking if characters for resolved pronouns are correct.

In [16]:
def selectEvalLines(df, numExamples, seed=None):
    """Pick a random sample of lines that contain at least one pronoun.

    df          -- annotated script; must have a ``num_pron`` column
    numExamples -- maximum number of lines to return
    seed        -- optional seed for a private random generator, for a
                   repeatable sample; when None the global numpy random
                   state is used

    Returns a numpy array of index labels without repeats. It holds fewer
    than numExamples entries when the script has fewer lines with pronouns,
    and is empty when there are none or numExamples is not positive.
    """
    # index labels for lines of dialogue that contain pronouns
    pronIndex = df.index[df['num_pron'] > 0].values

    sampleSize = max(0, min(len(pronIndex), numExamples))
    if sampleSize == 0:
        # nothing to draw; also avoids handing choice() an empty population
        return pronIndex[:0]

    rng = np.random if seed is None else np.random.RandomState(seed)

    # sample without replacement so no line is evaluated twice
    return rng.choice(pronIndex, sampleSize, replace=False)

Identify characters and choose lines to evaluate for each script

In [17]:
# choose the lines to hand-check in every loaded script
# NOTE(review): no random seed is set here, so the selection changes per run
for i in fileIndex:
    scripts[i]['eval'] = selectEvalLines(scripts[i]['df'], 20)

print(scripts[1]['eval'])

[245 768 512 874 196 606 781 451 599 906 248 202 222 955 198 378 752 103
 195 115]


Model 0 (pronResolution_base): sets reference as random character from script

In [18]:
# NOTE(review): dict.copy() is shallow -- scripts0 refers to the same per-script
# dicts and DataFrames as `scripts`, so in-place changes made while running or
# scoring this model are visible through `scripts` too
scripts0 = scripts.copy()

# run the random-character baseline on every line of every selected script
for i in fileIndex:
    charList = scripts0[i]['chars'].keys()
    scripts0[i]['df'].apply(lambda row: pronResolution_base(charList, row), axis=1)

# score the resolved pronouns by hand
pronEval(scripts0)


******** line 116 ********
114. narrator:
in Washington D.C., Trask is meeting with Congress to lobby his Sentinel program

115. Senator Brickman:
We are reviewing all of our defense expenditures... and all the black books are being opened. We can't support a weapon that targets our own citizens. If these mutants as you describe... are already living among us... then they are living here peacefully. We haven't had an incident in over 10 years.

=> 116. Senator Brickman:
=> We are talking about a tenth of a tenth... of a tenth population.

117. Dr. Trask:
Allow me to read something to you.

118. Senator Parker:
Please.

******** evaluate line 116 in x-men_days_of_future_past_tw_gapi.csv ********
1 pronouns resolved
1. We => Quicksilver

how many are correctly identified? 0

******** line 170 ********
168. narrator:
Wolverine arrives at the X-Mansion which looks abandoned and run down, he notices the sign for Xavier School fallen on the ground, he drives to the front door, gets out of h

In [19]:
# rows of script 1 whose `correct` column is filled in
# (presumably written during the manual evaluation -- confirm against pronEval)
df = scripts[1]['df']
df[df['correct'].notnull()]

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars,correct
103,Tony Stark,"Yeah, I got...something bigger.","[{'content': u'Yeah, I got...something bigger....","{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 3, u'begin': 0, u'pos': u'X', u'la...",1,0.01,Steve Rogers,narrator,"[narrator, Steve Rogers, Tony Stark, narrator,...",0.0
115,narrator,the Avengers are on an aircraft heading out of...,"[{'content': u""the Avengers are on an aircraft...","{u'score': 0, u'magnitude': 0}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",2,0.0,narrator,Natasha Romanoff,"[narrator, narrator, narrator, Natasha Romanof...",0.0
195,Tony Stark,"If you had to guess, what's it look like it's ...","[{'content': u""If you had to guess, what's it ...","{u'score': -0.1, u'magnitude': 0.1}",[],"[{u'index': 2, u'begin': 0, u'pos': u'ADP', u'...",4,-0.01,Bruce Banner,Bruce Banner,"[narrator, Bruce Banner, Tony Stark, Bruce Ban...",0.0
196,Bruce Banner,Like it's thinking. I mean this could be a...i...,"[{'content': u""Like it's thinking."", 'begin': ...","{u'score': 0.3, u'magnitude': 0.7}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 3, u'begin': 0, u'pos': u'ADP', u'...",4,0.21,Tony Stark,Tony Stark,"[Bruce Banner, Tony Stark, Bruce Banner, Tony ...",0.0
198,Bruce Banner,"I mean, look at this! They're like neurons fir...","[{'content': u'I mean, look at this!', 'begin'...","{u'score': 0.4, u'magnitude': 0.8}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",2,0.32,Tony Stark,Tony Stark,"[Bruce Banner, Tony Stark, Bruce Banner, Tony ...",0.0
202,Bruce Banner,I thought Ultron was a fantasy.,[{'content': u'I thought Ultron was a fantasy....,"{u'score': -0.1, u'magnitude': 0.1}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",1,-0.01,Tony Stark,Tony Stark,"[Bruce Banner, Tony Stark, Bruce Banner, Tony ...",0.0
222,JARVIS,I am a program. I am without form.,"[{'content': u'I am a program.', 'begin': 0, '...","{u'score': 0.1, u'magnitude': 0.2}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",2,0.02,Ultron,Ultron,"[JARVIS, Ultron, JARVIS, Ultron, JARVIS]",0.0
245,Maria Hill,"Hey, what about Jane? Where are the ladies, ge...","[{'content': u'Hey, what about Jane?', 'begin'...","{u'score': -0.1, u'magnitude': 0.4}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la...",1,-0.04,Tony Stark,Tony Stark,"[James Rhodes, Tony Stark, Maria Hill, Tony St...",0.0
248,Tony Stark,And the company that Pepper runs is the larges...,[{'content': u'And the company that Pepper run...,"{u'score': 0.4, u'magnitude': 0.8}","[{u'type': u'ORGANIZATION', u'meta': {}, u'sal...","[{u'index': 6, u'begin': 0, u'pos': u'CONJ', u...",1,0.32,Thor,Thor,"[Tony Stark, Thor, Tony Stark, Thor, Maria Hill]",0.0
378,Tony Stark,We didn't. We weren't even close. Were we clos...,"[{'content': u""We didn't."", 'begin': 0, 'score...","{u'score': -0.4, u'magnitude': 1.3}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",3,-0.52,Bruce Banner,Steve Rogers,"[Tony Stark, Bruce Banner, Tony Stark, Steve R...",0.0


Model 1 (pronResolution_nn): sets reference for first-person pronouns to speaker; for second-person pronouns to random choice between previous and next speaker

In [24]:
# NOTE(review): dict.copy() is shallow -- scripts1 refers to the same per-script
# dicts and DataFrames as `scripts`, so in-place changes made while running or
# scoring this model are visible through `scripts` too
scripts1 = scripts.copy()

# run pronResolution_nn on every line of every selected script
for i in fileIndex:
    charList = scripts1[i]['chars'].keys()
    scripts1[i]['df'].apply(lambda row: pronResolution_nn(charList, row), axis=1)

# score the resolved pronouns by hand
pronEval(scripts1)


******** line 116 ********
114. narrator:
in Washington D.C., Trask is meeting with Congress to lobby his Sentinel program

115. Senator Brickman:
We are reviewing all of our defense expenditures... and all the black books are being opened. We can't support a weapon that targets our own citizens. If these mutants as you describe... are already living among us... then they are living here peacefully. We haven't had an incident in over 10 years.

=> 116. Senator Brickman:
=> We are talking about a tenth of a tenth... of a tenth population.

117. Dr. Trask:
Allow me to read something to you.

118. Senator Parker:
Please.

******** evaluate line 116 in x-men_days_of_future_past_tw_gapi.csv ********
1 pronouns resolved
1. We => Guard

how many are correctly identified? 0

******** line 170 ********
168. narrator:
Wolverine arrives at the X-Mansion which looks abandoned and run down, he notices the sign for Xavier School fallen on the ground, he drives to the front door, gets out of his car

In [27]:
# write dfs with pronoun references added
for fileName in files:
    # load the file for this iteration rather than a loop variable left over
    # from an earlier cell, so each output holds its own script
    df = annotateScript(loadScript(fileName))

    # candidate characters are the speakers of the script being processed
    charList = list(df['speaker'].unique())

    df.apply(lambda x: pronResolution_nn(charList, x), axis=1)

    # '<name>.csv' -> '<name>_prons.csv', written to the working directory
    df.to_csv(fileName[:-4] + '_prons.csv')

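Model 2 (pronResolution_nnMod): modified version of Model 1 — sets reference for first-person pronouns to speaker; for second-person pronouns to random choice between previous and next speaker. TODO: describe how the modification differs from Model 1; this description is currently identical to Model 1's.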

In [33]:
# NOTE(review): dict.copy() is shallow -- scripts2 refers to the same per-script
# dicts and DataFrames as `scripts`, so in-place changes made while running or
# scoring this model are visible through `scripts` too
scripts2 = scripts.copy()

# run pronResolution_nnMod on every line of every selected script
for i in fileIndex:
    charList = scripts2[i]['chars'].keys()
    scripts2[i]['df'].apply(lambda row: pronResolution_nnMod(charList, row), axis=1)

# score the resolved pronouns by hand
pronEval(scripts2)

KeyboardInterrupt: 

## Task 2: Relation Extraction

In [None]:
# run simpleRE over each line's token list and keep the result per line
df['relations'] = df['tokens'].apply(simpleRE)

In [None]:
# second entry among the first five lines that have a non-null relations value
df[df['relations'].notnull()].head()['relations'].values[1]

In [None]:
# try simpleRE on the first five lines only
df.head()['tokens'].apply(simpleRE)