In [1]:
import re
import nltk
import pandas as pd
import numpy as np
from google.cloud import language
import random
import ast
import os

In [2]:
language_client = language.Client()

## Parsing Preprocessed Scripts

In [3]:
def googleAnnotate(fileName, nrows=None, saveFile=False):
    '''
    Takes a csv file in the form of [speaker, dialogue]
    Pass each dialogue into google language API
    Attach the output to the dataframe: [speaker, dialogue, sentences, sentiment, entities, tokens]
    sentences contain a dict {content, begin(index), (sentiment)score, (sentiment)magnitude} for each sentence
    sentiment contain {score, magnitude} for the entire dialogue
    entities contain {name, type, metadata, salience, mentions} for each entity
    tokens contain {content, begin(index), pos(partofspeech), index(edge index), label(edge label), lemma} for each token
    nrows can be used to read small number of rows for testing
    '''
    language_client = language.Client()
    df = pd.read_csv(fileName, names=['speaker', 'dialogue'], nrows=nrows)
    df['sentences'] = None
    df['sentiment'] = None
    df['entities'] = None
    df['tokens'] = None
    
    for i in range(len(df)):
        document = language_client.document_from_text(df.loc[i, 'dialogue'])
        annText = document.annotate_text()
        
        df.set_value(i, 'sentences', [{'content':x.content,
                                       'begin':x.begin,
                                       'score':x.sentiment.score,
                                       'magnitude':x.sentiment.magnitude} for x in annText.sentences])
        
        df.set_value(i, 'sentiment', {'score':annText.sentiment.score,
                                      'magnitude':annText.sentiment.magnitude})
        
        df.set_value(i, 'entities', [{'name':x.name,
                                      'type':x.entity_type,
                                      'meta':x.metadata,
                                      'salience':x.salience,
                                      'mentions':[m.text.content for m in x.mentions]} for x in annText.entities])
        
        df.set_value(i, 'tokens', [{'content':x.text_content,
                                    'begin':x.text_begin,
                                    'pos':x.part_of_speech,
                                    'index':x.edge_index,
                                    'label':x.edge_label,
                                    'lemma':x.lemma} for x in annText.tokens])
    if saveFile:
        df.to_csv(fileName[:-4] + '_gapi.csv')
        #print(fileName[:-4])
    return df
        

The code below runs googleAnnotate() on all preprocessed movie scripts and returns annotations with sentences, sentiment, entities, and tokens for each (character, dialogue) tuple.  The annotations are saved to a CSV file for later analysis and modeling, so this code only needs to be run once.

In [219]:
# set working directory
wd = os.path.join(os.getcwd(), 'prep_scripts')
print wd, '\n'

# view script files
files = os.listdir(wd)
print files, '\n'

# get files for scripts from transcripts.wikia
# guardians of the galaxy and x-men first class are detected as non-English?  breaks syntax and sentiment analysis
prep_scripts = [file for file in files if (file.endswith('_tw.csv') or file.endswith('_imsdb.csv')) and
                not (file.endswith('galaxy_tw.csv') or file.endswith('class_tw.csv'))]
print prep_scripts, '\n'

# parse all preprocessed scripts and save to csv
for script in prep_scripts:
    fileName = os.path.join(os.getcwd(), 'prep_scripts', script)
    
    # if parsed script file doesn't exist, annotate script
    if not(os.path.exists(fileName[:-4] + '_gapi.csv')):
        print 'Annotating script: ' + fileName 
        googleAnnotate(fileName, saveFile=True)
        print 'Finished annotating: ' + fileName

/Users/winlin/Desktop/w266-nlp/w266_project/prep_scripts 

['ant-man_tw.csv', 'ant-man_tw_gapi.csv', 'avengers_age_of_ultron_tw.csv', 'avengers_age_of_ultron_tw_gapi.csv', 'captain_america_civil_war_tw.csv', 'captain_america_civil_war_tw_gapi.csv', 'captain_america_the_first_avenger_tw.csv', 'captain_america_the_first_avenger_tw_gapi.csv', 'captain_america_the_winter_soldier_tw.csv', 'captain_america_the_winter_soldier_tw_gapi.csv', 'fantastic_four_imsdb.csv', 'fantastic_four_imsdb_gapi.csv', 'guardians_of_the_galaxy_tw.csv', 'iron_man_3_tw.csv', 'iron_man_3_tw_gapi.csv', 'lego_marvel_super_heroes_tw.csv', 'lego_marvel_super_heroes_tw_gapi.csv', 'spider-man_imsdb.csv', 'spider-man_imsdb_gapi.csv', 'the_amazing_spider-man_2_tw.csv', 'the_amazing_spider-man_2_tw_gapi.csv', 'the_amazing_spider-man_tw.csv', 'the_amazing_spider-man_tw_gapi.csv', 'the_avengers_tw.csv', 'the_avengers_tw_gapi.csv', 'the_wolverine_tw.csv', 'the_wolverine_tw_gapi.csv', 'thor_the_dark_world_tw.csv', 'thor_the_dar

In [4]:
# df = googleAnnotate('prep_scripts/the_amazing_spider-man_tw.csv',saveFile=True)
# googleAnnotate('prep_scripts/thor_tw.csv',saveFile=True)

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens
0,narrator,first lines,"[{u'content': u'first lines', u'begin': 0, u's...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 1, u'begin': 0, u'pos': u'ADJ', u'..."
1,Jane Foster,Wait for it.,"[{u'content': u'Wait for it.', u'begin': 0, u'...","{u'score': 0, u'magnitude': 0}",[],"[{u'index': 0, u'begin': 0, u'pos': u'VERB', u..."
2,Darcy Lewis,Can I turn on the radio?,"[{u'content': u'Can I turn on the radio?', u'b...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 2, u'begin': 0, u'pos': u'VERB', u..."
3,Jane Foster,No!,"[{u'content': u'No!', u'begin': 0, u'score': -...","{u'score': -0.4, u'magnitude': 0.4}",[],"[{u'index': 0, u'begin': 0, u'pos': u'X', u'la..."
4,Erik Selvig,"Jane, you can't keep doing this.","[{u'content': u'Jane, you can't keep doing thi...","{u'score': -0.6, u'magnitude': 0.6}","[{u'salience': 1, u'meta': {u'mid': u'/m/05d4r...","[{u'index': 5, u'begin': 0, u'pos': u'NOUN', u..."
5,Jane Foster,The last seventeen occurrences had been predic...,[{u'content': u'The last seventeen occurrences...,"{u'score': -0.2, u'magnitude': 0.2}","[{u'salience': 0.583006, u'meta': {}, u'type':...","[{u'index': 3, u'begin': 0, u'pos': u'DET', u'..."
6,Erik Selvig,"Jane, you're an astrophysicist, not some storm...","[{u'content': u'Jane, you're an astrophysicist...","{u'score': 0.2, u'magnitude': 0.2}","[{u'salience': 0.8433054, u'meta': {u'mid': u'...","[{u'index': 3, u'begin': 0, u'pos': u'NOUN', u..."
7,Jane Foster,"I'm telling you, there's a connection between ...","[{u'content': u'I'm telling you, there's a con...","{u'score': 0, u'magnitude': 1.1}","[{u'salience': 0.29372552, u'meta': {}, u'type...","[{u'index': 2, u'begin': 0, u'pos': u'PRON', u..."
8,Erik Selvig,I thought you said it was a subtle aurora!,[{u'content': u'I thought you said it was a su...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'salience': 1, u'meta': {}, u'type': u'LOCA...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u..."
9,Jane Foster,[to Darcy] Go!,"[{u'content': u'[to Darcy] Go!', u'begin': 0, ...","{u'score': 0.2, u'magnitude': 0.2}","[{u'salience': 1, u'meta': {}, u'type': u'PERS...","[{u'index': 4, u'begin': 0, u'pos': u'PUNCT', ..."


## Loading Saved Parsed Scripts

In [5]:
# read annotated script
fileName = os.path.join(os.getcwd(), 'prep_scripts', 'x-men_imsdb'+'_gapi.csv')
df = pd.read_csv(fileName, index_col=0, quotechar='"')
print df.head(5)

# convert annnotation strings to lists and dicts
parse = lambda x: ast.literal_eval(x)

# convert annotation strings for sentences, tokens, entities to parse trees
df.sentences = df.sentences.apply(parse)
df.entities = df.entities.apply(parse)
df.tokens = df.tokens.apply(parse)
df.sentiment = df.sentiment.apply(parse)

print df.head(5)

print df.sentences[0][0]['content']
print df.tokens[0][0]['index']
print df.entities[0][0]['salience']

        speaker                                           dialogue  \
0      narrator  BLACK Sounds of a train rolling to a halt, a s...   
1  FRECKLED KID                   Man, what's the matter with you?   
2      narrator  His friend is on the toilet with his head in h...   
3  FRECKLED KID  Dude. Lighten up. She's just a girl. You just ...   
4         SCOTT             No, my eyes... my eyes are killing me.   

                                           sentences  \
0  [{'content': u'BLACK Sounds of a train rolling...   
1  [{'content': u"Man, what's the matter with you...   
2  [{'content': u'His friend is on the toilet wit...   
3  [{'content': u'Dude.', 'begin': 0, 'score': 0....   
4  [{'content': u'No, my eyes... my eyes are kill...   

                           sentiment  \
0    {'score': 0, 'magnitude': 36.3}   
1       {'score': 0, 'magnitude': 0}   
2     {'score': 0.3, 'magnitude': 1}   
3     {'score': 0, 'magnitude': 0.5}   
4  {'score': -0.4, 'magnitude': 0.4}   

 

In [226]:
print sum(df.entities.apply(len)), 'entities identified'

entList = df.entities.apply(lambda x: [ent['name'] for ent in x]).tolist()
entSet = set([val for x in entList for val in x])
print len(entSet), 'unique entities identified'

df['totalSent'] = df.sentiment.apply(lambda x: x['score']*x['magnitude'])

print('*'*50)
df_totalSent = df[['speaker', 'totalSent']].groupby('speaker').sum().reset_index()
print('good guys from the script:')
print(df_totalSent[df_totalSent.totalSent >= 0].sort_values(by='totalSent', ascending=False))

print('*'*50)
print('bad guys from the script:')
print(df_totalSent[df_totalSent.totalSent < 0].sort_values(by='totalSent'))

6127 entities identified
2331 unique entities identified
**************************************************
good guys from the script:
                   speaker  totalSent
34                narrator      26.40
33                  XAVIER       4.34
16                 MAGNETO       2.57
29                   STORM       1.79
32                   VOICE       0.49
3                    BEAST       0.46
9                  CYCLOPS       0.35
15                   LOGAN       0.31
4                   BRAD 1       0.30
19              NEWSCASTER       0.25
20     OPERATION COMMANDER       0.24
17                MYSTIQUE       0.12
30                    TOAD       0.08
21                    PYRO       0.07
6                   BRAD 3       0.07
7                   BRAD 4       0.05
1            AGENTíS VOICE       0.04
0           AGENT/MYSTIQUE       0.01
18  MYSTIQUE/AGENTíS VOICE       0.00
22       REPORTERS/VARIOUS       0.00
26   SECRET SERVICE MAN #3       0.00
27           SENATOR ROWEE   

## Models for Relationship Extraction

In [13]:
def simpleRE(tokens):
    relation = []
    nsubj = -1
    verb = -1
    for i, token in enumerate(tokens):
        if token['label'] == 'NSUBJ':
            nsubj = i
            verb = token['index']
        if 'OBJ' in token['label'] and token['index'] == verb:
            relation.append({'verb':tokens[verb]['content'], 'noun':tokens[nsubj]['content'], 'obj':tokens[i]['content']})
        
    if relation:
        return relation
    else:
        return None

#basic relationship extraction
df['relations'] = None
for i in range(len(df)):
    df.set_value(i, 'relations', simpleRE(df.loc[i, 'tokens']))
    
df.head(10)

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,totalSent,relations
0,narrator,first lines; Loki has allied with the alien ra...,[{u'content': u'first lines; Loki has allied w...,"{u'score': -0.1, u'magnitude': 0.1}","[{u'salience': 0.34743184, u'meta': {}, u'type...","[{u'index': 1, u'begin': 0, u'pos': u'ADJ', u'...",-0.01,"[{u'obj': u'Earth', u'verb': u'subjugate', u'n..."
1,The Other,[voice over] The Tesseract has awakened. It is...,[{u'content': u'[voice over] The Tesseract has...,"{u'score': 0.1, u'magnitude': 1.6}","[{u'salience': 0.31685632, u'meta': {}, u'type...","[{u'index': 1, u'begin': 0, u'pos': u'PUNCT', ...",0.16,"[{u'obj': u'power', u'verb': u'wield', u'noun'..."
2,narrator,Nick Fury and Maria Hill arrive at a remote re...,[{u'content': u'Nick Fury and Maria Hill arriv...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'salience': 0.5691925, u'meta': {u'mid': u'...","[{u'index': 1, u'begin': 0, u'pos': u'NOUN', u...",0.16,
3,Nick Fury,How bad is it?,"[{u'content': u'How bad is it?', u'begin': 0, ...","{u'score': -0.4, u'magnitude': 0.4}",[],"[{u'index': 1, u'begin': 0, u'pos': u'ADV', u'...",-0.16,
4,Agent Phil Coulson,"That's the problem, sir. We don't know.","[{u'content': u'That's the problem, sir.', u'b...","{u'score': 0, u'magnitude': 0}","[{u'salience': 0.85199714, u'meta': {}, u'type...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",0.0,
5,narrator,an elevator goes down,"[{u'content': u'an elevator goes down', u'begi...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",0.0,
6,Agent Phil Coulson,Dr. Selvig read an energy surge from the Tesse...,[{u'content': u'Dr. Selvig read an energy surg...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'salience': 0.6954503, u'meta': {}, u'type'...","[{u'index': 1, u'begin': 0, u'pos': u'NOUN', u...",0.01,"[{u'obj': u'surge', u'verb': u'read', u'noun':..."
7,Nick Fury,NASA didn't authorize Selvig to test the Tesse...,[{u'content': u'NASA didn't authorize Selvig t...,"{u'score': -0.8, u'magnitude': 0.8}","[{u'salience': 0.638119, u'meta': {u'mid': u'/...","[{u'index': 3, u'begin': 0, u'pos': u'NOUN', u...",-0.64,"[{u'obj': u'Selvig', u'verb': u'authorize', u'..."
8,Agent Phil Coulson,"He wasn't testing it, he wasn't even in the ro...","[{u'content': u'He wasn't testing it, he wasn'...","{u'score': -0.2, u'magnitude': 1}","[{u'salience': 0.5917055, u'meta': {}, u'type'...","[{u'index': 3, u'begin': 0, u'pos': u'PRON', u...",-0.2,"[{u'obj': u'it', u'verb': u'testing', u'noun':..."
9,Agent Maria Hill,It just turned itself on?,"[{u'content': u'It just turned itself on?', u'...","{u'score': -0.5, u'magnitude': 0.5}",[],"[{u'index': 2, u'begin': 0, u'pos': u'PRON', u...",-0.25,"[{u'obj': u'itself', u'verb': u'turned', u'nou..."


In [8]:
for i in range(50):
    t = df[df.relations.notnull()].sample(1)
    print 'dialogue:', t.dialogue
    #print len(t.relations)
    for relation in t.relations.values[0]:
        print relation

dialogue: 227    I told him.
Name: dialogue, dtype: object
{'obj': u'him', 'verb': u'told', 'noun': u'I'}
dialogue: 510    Why have you done this?
Name: dialogue, dtype: object
{'obj': u'this', 'verb': u'done', 'noun': u'you'}
dialogue: 173    Darcy hold her taser gun at him
Name: dialogue, dtype: object
{'obj': u'gun', 'verb': u'hold', 'noun': u'Darcy'}
dialogue: 340    Well, they just stole my entire life's work. I...
Name: dialogue, dtype: object
{'obj': u'work', 'verb': u'stole', 'noun': u'they'}
dialogue: 516    [shouting] I will not fight you, brother!
Name: dialogue, dtype: object
{'obj': u'you', 'verb': u'fight', 'noun': u'I'}
dialogue: 116    We will accept, your most gracious offer.
Name: dialogue, dtype: object
{'obj': u'offer', 'verb': u'accept', 'noun': u'We'}
dialogue: 365    No. I'm gonna fly out. [he leaves her] [after ...
Name: dialogue, dtype: object
{'obj': u'her', 'verb': u'leaves', 'noun': u'he'}
dialogue: 69    Thor gives him a knowing look
Name: dialogue, dtype: 

## Validating Extracted Relations

In [None]:
def REPrecision(df, num=50):
    goodRE = 0.0
    for i in range(num):
        t = df[df.relations.notnull()].sample(1)
        print 'DIALOGUE:', t.dialogue
        print 'RELATION:', random.choice(t.relations.values[0])
        x = raw_input('Is this a good relation?')
        if not x or x[0] == 'y':
            goodRE += 1
    print 'precision: ', goodRE/num
    
REPrecision(df)

DIALOGUE: 16    {voice over] Our armies drove the Frost Giants...
Name: dialogue, dtype: object
RELATION: {'obj': u'Giants', 'verb': u'drove', 'noun': u'armies'}
Is this a good relation?yes
DIALOGUE: 296    Yes. All the answers you seek will be yours, o...
Name: dialogue, dtype: object
RELATION: {'obj': u'Mj\xf6llnir', 'verb': u'reclaim', 'noun': u'I'}
Is this a good relation?yes
DIALOGUE: 50    He just broke your truce! They know you are vu...
Name: dialogue, dtype: object
RELATION: {'obj': u'truce', 'verb': u'broke', 'noun': u'He'}
Is this a good relation?yes
DIALOGUE: 464    Found you! [Thor looks up. Jane, Darcy and Eri...
Name: dialogue, dtype: object
RELATION: {'obj': u'cup', 'verb': u'drops', 'noun': u'Erik'}
