## Imports & Functions

In [34]:
# =-=-=-=-=-=-=-=-=-=-=
# IMPORTS
# =-=-=-=-=-=-=-=-=-=-= 

import pandas as pd, spacy, textacy
from spacy import displacy

from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
from pathlib import Path # This is to export diSplacy SVGs to files.

In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# FUNCTIONS
# =-=-=-=-=-=-=-=-=-=-= 

def string_test(s):
    if s is None:
        return ''
    else:
        return str(s)

# SVO TRIPLES
# =-=-=-=-=-=

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]
ADJECTIVES = ["acomp", "advcl", "advmod", "amod", "appos", "nn", "nmod", "ccomp", "complm",
              "hmod", "infmod", "xcomp", "rcmod", "poss"," possessive"]
COMPOUNDS = ["compound"]
PREPOSITIONS = ["prep"]

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getAdjectives(toks):
    toks_with_adjectives = []
    for tok in toks:
        adjs = [left for left in tok.lefts if left.dep_ in ADJECTIVES]
        adjs.append(tok)
        adjs.extend([right for right in tok.rights if tok.dep_ in ADJECTIVES])
        tok_with_adj = " ".join([adj.lower_ for adj in adjs])
        toks_with_adjectives.extend(adjs)

    return toks_with_adjectives

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def getAllObjsWithAdjectives(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]

    if len(objs)== 0:
        objs = [tok for tok in rights if tok.dep_ in ADJECTIVES]

    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

def findSVAOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjsWithAdjectives(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    obj_desc_tokens = generate_left_right_adjectives(obj)
                    sub_compound = generate_sub_compound(sub)
                    svos.append((" ".join(tok.lower_ for tok in sub_compound), "!" + v.lower_ if verbNegated or objNegated else v.lower_, " ".join(tok.lower_ for tok in obj_desc_tokens)))
    return svos

def generate_sub_compound(sub):
    sub_compunds = []
    for tok in sub.lefts:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    sub_compunds.append(sub)
    for tok in sub.rights:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    return sub_compunds

def generate_left_right_adjectives(obj):
    obj_desc_tokens = []
    for tok in obj.lefts:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))
    obj_desc_tokens.append(obj)

    for tok in obj.rights:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))

    return obj_desc_tokens

## Load the Data; Create Two Groups

In [4]:
# Load the CSV as a dataframe
df = pandas.read_csv('./throughputs/clowns_3.csv')
# df.shape 
# df.head()
df.Origin.value_counts()

News Report     162
Social Media     18
Fiction           2
Name: Origin, dtype: int64

In [5]:
# Create two lists:
news = df[df["Origin"] == "News Report"].Text.tolist()
social = df[df["Origin"] == "Social Media"].Text.tolist()

In [6]:
# Clean up our two lists
news_strings = [string_test(i) for i in news]
social_strings = [string_test(i) for i in social]

In [7]:
# Establish which parser spacy is going to use
nlp = spacy.load('en_core_web_sm') # More common is "en_core_web_sm"

In [8]:
news_docs = [nlp(i) for i in news_strings]
social_docs = [nlp(i) for i in social_strings]

In [9]:
# print([token.text for token in docs[0]])

sentences = list(social_docs[0].sents) # spacy's .sents method creates a generator
print(sentences[3:10])



In [None]:
for sentence in sentences[4]:
#     doc = nlp(sentence)
    svg = displacy.render(sentence, style="dep", jupyter=False)
    file_name = 'sentence-' + ".svg"
    output_path = Path("images/" + file_name)
    output_path.open("w", encoding="utf-8").write(svg)

In [10]:
from spacy.lang.en import English
# parser = English()
parser = spacy.load('en') # disable=['ner','textcat']

In [11]:
parse = parser(social[0])
print(findSVOs(parse))

[('i', '!take', 'video'), ('friend', 'sent', 'these'), ('friend', 'sent', 'to'), ('person', 'playing', 'prank')]


In [14]:
social_SVOs = [ findSVOs(parser(item)) for item in social ]
print(social_SVOs[0])

[('i', '!take', 'video'), ('friend', 'sent', 'these'), ('friend', 'sent', 'to'), ('person', 'playing', 'prank')]


In [16]:
news_SVOs = [ findSVOs(parser(item)) for item in news_strings ]
print(news_SVOs[0])

[('district', 'closed', 'schools'), ('suspect', 'made', 'threat'), ('we', 'take', 'threats'), ('students', 'made', 'decision'), ('decision', 'close', 'school'), ('victim', 'smoking', 'cigarette'), ('clown', 'approached', 'her'), ('i', 'kill', 'you'), ('suspect', 'told', 'her'), ('he', 'wearing', 'outfit'), ('police', 'arrested', 'juvenile'), ('media', 'driving', 'trend'), ('threat', 'triggered', 'lockdown')]


In [21]:
type(news_SVOs)

list

In [32]:
# How to unpack the tuples above:
for item in news_SVOs[0:2]:
    for x, y, z in item:
        print(x)

district
suspect
we
students
decision
victim
clown
i
suspect
he
police
media
threat
officials
officials
clown
clown
police
witness
witness
man
bass
he
children
clown
clown
clown
woman
authorities
it
it


Alternatively, I thought I could simply dump the lot into a pandas dataframe:
```python
svos = pd.DataFrame(news_SVOs, columns =['Subject', 'Verb', 'Object']) 
```
But that returned:
```python
AssertionError: 3 columns passed, passed data had 85 columns
```
When I assessed the situation, I realized that what I have is a list of lists of tuples with one list for each string. Each string has a variable length like:
```
len(news_SVOs[0])
13
```

In [75]:
svo_df = pd.DataFrame(columns = ['Subject', 'Verb', 'Object'])
for item in news_SVOs:
    for x,y,z in item:
        temp_df = pd.DataFrame([x,y,z])
        svo_df.append(temp_df)

In [76]:
svo_df.head()

Unnamed: 0,Subject,Verb,Object
