## Load Libraries & Data

In [16]:
import textacy

In [3]:
# IMPORTS
import re, spacy, textacy
import numpy as np, pandas as pd

# If needed
parentheticals = [ "\(laughter\)", "\(applause\)", "\(music\)",  
                  "\(video\)", "\(laughs\)", "\(applause ends\)", 
                  "\(audio\)", "\(singing\)", "\(music ends\)", 
                  "\(cheers\)", "\(cheering\)", "\(recording\)", 
                  "\(beatboxing\)", "\(audience\)", "\(guitar strum\)", 
                  "\(clicks metronome\)", "\(sighs\)", "\(guitar\)", 
                  "\(marimba sounds\)", "\(drum sounds\)" ]

def remove_parentheticals(text):
    global parentheticals
    new_text = text
    for rgx_match in parentheticals:
        new_text = re.sub(rgx_match, ' ', new_text.lower(), 
                          flags=re.IGNORECASE)
    return new_text

# Loading the Data in a gendered partitioned fashion: 
talks_m = pd.read_csv('talks_male.csv', index_col='Talk_ID')
talks_f = pd.read_csv('talks_female.csv', index_col='Talk_ID')
talks_nog = pd.read_csv('talks_nog.csv', index_col='Talk_ID')
talks_all = pd.concat([talks_m, talks_f, talks_nog])

# And then grabbing on the texts of the talks:
texts = talks_all.text.tolist()
texts_f = talks_f.text.tolist()
texts_m = talks_m.text.tolist()

print(f"From our {talks_all.shape[0]}x{talks_all.shape[1]} CSV, \
we have a list of {len(texts)} talks: {len(texts_f)} by women and \
{len(texts_m)} by men.")

From our 992x14 CSV, we have a list of 992 talks: 260 by women and 720 by men.


## Textacy

Textacy is fussy about the size of texts being fed it, responding with `ValueError`s for `nlp.maxlength`. The workaround here is to create a `docs` object which is a list of spaCy `doc`s. The preview below demonstrates that each item in the list has the characteristics of a spaCy doc.

In [36]:
# Load the Space pipeline to be used
nlp = spacy.load('en_core_web_lg')

# Use the pipe method to feed documents 
docs = list(nlp.pipe(texts_f))

In [27]:
# corpus = textacy.Corpus("en_core_web_sm", data=docs)

In [37]:
docs[0]._.preview

'Doc(3740 tokens: "  If you\'re here today — and I\'m very happy tha...")'

In [38]:
for token in docs[0][0:20]:
    print (token, token.tag_, token.pos_) # spacy.explain(token.tag_)

   _SP SPACE
If IN SCONJ
you PRP PRON
're VBP AUX
here RB ADV
today NN NOUN
— : PUNCT
and CC CCONJ
I PRP PRON
'm VBP AUX
very RB ADV
happy JJ ADJ
that IN SCONJ
you PRP PRON
are VBP AUX
— : PUNCT
you PRP PRON
've VBP AUX
all DT PRON
heard VBN VERB


In [39]:
for token in docs[0][0:20]:
    print (token.text, token.tag_, token.head.text, token.dep_)

   _SP heard dep
If IN 're mark
you PRP 're nsubj
're VBP heard advcl
here RB 're advmod
today NN 're npadvmod
— : 're punct
and CC 're cc
I PRP 'm nsubj
'm VBP 're conj
very RB happy advmod
happy JJ 'm acomp
that IN are mark
you PRP are nsubj
are VBP happy ccomp
— : heard punct
you PRP heard nsubj
've VBP heard aux
all DT heard dep
heard VBN heard ROOT


In [28]:
pattern = r'(<VBN>)'
verb_phrases = textacy.extract.matches.regex_matches(docs[0], pattern)

len(list(verb_phrases))
# for chunk in verb_phrases:
#     print(chunk.text)

0

In [25]:
patterns = [{"POS": {"IN": ["ADJ", "DET"]}, "OP": "+"}, {"ORTH": {"REGEX": "workers?"}}]
print(list(textacy.extract.token_matches(docs[0], patterns)))

[]


In [40]:
SVOs = list(textacy.extract.triples.subject_verb_object_triples(docs[0]))
len(SVOs)

146

In [41]:
for item in SVOs[0:10]:
    print(item)

SVOTriple(subject=[development], verb=[will, save], object=[us])
SVOTriple(subject=[She], verb=[turned], object=[to, be, a, much, bigger, dog, than, I, 'd, anticipated])
SVOTriple(subject=[part], verb=[handled], object=[percent])
SVOTriple(subject=[that], verb=[bring], object=[truck, trips])
SVOTriple(subject=[area], verb=[has], object=[one])
SVOTriple(subject=[I], verb=[was, contacted], object=[Parks, Department])
SVOTriple(subject=[I], verb=[mentioned], object=[that])
SVOTriple(subject=[she], verb=[pulled], object=[me])
SVOTriple(subject=[she], verb=[were], object=[dragging, me])
SVOTriple(subject=[I], verb=[wo, n't, mention], object=[that])


In [44]:
for item in SVOs[0:10]:
    print(item[0])

[development]
[She]
[part]
[that]
[area]
[I]
[I]
[she]
[she]
[I]


In [59]:
# Grab one of the triples
svo = SVOs[0]

# Access the first item, the subject of the svo
s = (svo[0])

# Print both
print(svo)
print(s)

SVOTriple(subject=[development], verb=[will, save], object=[us])
[will, save]


In [60]:
for item in SVOs[0:10]:
    if str(item[0]) == '[I]':
        print(item)

SVOTriple(subject=[I], verb=[was, contacted], object=[Parks, Department])
SVOTriple(subject=[I], verb=[mentioned], object=[that])
SVOTriple(subject=[I], verb=[wo, n't, mention], object=[that])


In [61]:
for item in SVOs:
    if str(item[0]) == '[I]':
        print(item)

SVOTriple(subject=[I], verb=[was, contacted], object=[Parks, Department])
SVOTriple(subject=[I], verb=[mentioned], object=[that])
SVOTriple(subject=[I], verb=[wo, n't, mention], object=[that])
SVOTriple(subject=[I], verb=['m, going], object=[to, exchange, marriage, vows, with, my, beloved])
SVOTriple(subject=[I], verb=[do], object=[which])
SVOTriple(subject=[I], verb=[watched], object=[half])
SVOTriple(subject=[I], verb=[told], object=[you])
SVOTriple(subject=[I], verb=[wrote], object=[dollar, transportation, grant])
SVOTriple(subject=[I], verb=[like], object=[that])
SVOTriple(subject=[I], verb=[have], object=[all])
SVOTriple(subject=[I], verb=[do, not, expect], object=[individuals, corporations, government])
SVOTriple(subject=[I], verb=['ll, tell], object=[you])
SVOTriple(subject=[I], verb=[like], object=[what])
SVOTriple(subject=[I], verb=[told], object=[you])
SVOTriple(subject=[I], verb=['ve, embraced], object=[capitalist])
SVOTriple(subject=[I], verb=[do, n't, have], object=[proble

The next step will be to locate the last item in the verb list.

In [65]:
for item in SVOs:
    if str(item[0]) == '[I]':
        print(item[1][-1:])

[contacted]
[mentioned]
[mention]
[going]
[do]
[watched]
[told]
[wrote]
[like]
[have]
[expect]
[tell]
[like]
[told]
[embraced]
[have]
[have]
[trying]
[have]
[asked]


In [66]:
for item in SVOs:
    if str(item[0]) == '[I]':
        print(item[1][-1:], item[2])

[contacted] [Parks, Department]
[mentioned] [that]
[mention] [that]
[going] [to, exchange, marriage, vows, with, my, beloved]
[do] [which]
[watched] [half]
[told] [you]
[wrote] [dollar, transportation, grant]
[like] [that]
[have] [all]
[expect] [individuals, corporations, government]
[tell] [you]
[like] [what]
[told] [you]
[embraced] [capitalist]
[have] [problem]
[have] [problem]
[trying] [to, build]
[have] [time]
[asked] [him]


In [74]:
for item in SVOs:
    if str(item[0]) == '[She]':
        print(item[1][-1:], item[2])
    if str(item[0]) == '[she]':
        print(item[1][-1:], item[2])

[turned] [to, be, a, much, bigger, dog, than, I, 'd, anticipated]
[pulled] [me]
[were] [dragging, me]
[kept] [dragging, me]
