## Imports & Functions

In [1]:
# =-=-=-=-=-=-=-=-=-=-=
# IMPORTS
# =-=-=-=-=-=-=-=-=-=-= 

import pandas, spacy, textacy
from spacy import displacy

from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
from pathlib import Path # This is to export diSplacy SVGs to files.

In [17]:
# =-=-=-=-=-=-=-=-=-=-=
# FUNCTIONS
# =-=-=-=-=-=-=-=-=-=-= 

def string_test(s):
    if s is None:
        return ''
    else:
        return str(s)

# SVO TRIPLES
# =-=-=-=-=-=

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]
ADJECTIVES = ["acomp", "advcl", "advmod", "amod", "appos", "nn", "nmod", "ccomp", "complm",
              "hmod", "infmod", "xcomp", "rcmod", "poss"," possessive"]
COMPOUNDS = ["compound"]
PREPOSITIONS = ["prep"]

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getAdjectives(toks):
    toks_with_adjectives = []
    for tok in toks:
        adjs = [left for left in tok.lefts if left.dep_ in ADJECTIVES]
        adjs.append(tok)
        adjs.extend([right for right in tok.rights if tok.dep_ in ADJECTIVES])
        tok_with_adj = " ".join([adj.lower_ for adj in adjs])
        toks_with_adjectives.extend(adjs)

    return toks_with_adjectives

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def getAllObjsWithAdjectives(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]

    if len(objs)== 0:
        objs = [tok for tok in rights if tok.dep_ in ADJECTIVES]

    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

def findSVAOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjsWithAdjectives(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    obj_desc_tokens = generate_left_right_adjectives(obj)
                    sub_compound = generate_sub_compound(sub)
                    svos.append((" ".join(tok.lower_ for tok in sub_compound), "!" + v.lower_ if verbNegated or objNegated else v.lower_, " ".join(tok.lower_ for tok in obj_desc_tokens)))
    return svos

def generate_sub_compound(sub):
    sub_compunds = []
    for tok in sub.lefts:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    sub_compunds.append(sub)
    for tok in sub.rights:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    return sub_compunds

def generate_left_right_adjectives(obj):
    obj_desc_tokens = []
    for tok in obj.lefts:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))
    obj_desc_tokens.append(obj)

    for tok in obj.rights:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))

    return obj_desc_tokens

## Load the Data; Create Two Groups

In [3]:
# Load the CSV as a dataframe
# colnames = ['Title' , 'Date', 'Author', 'Origin', 'URL', 'Text']
df = pandas.read_csv('./clowns_3.csv')
# df.shape 
# df.head()
df.Origin.value_counts()

News Report     162
Social Media     18
Fiction           2
Name: Origin, dtype: int64

In [11]:
# Create two lists:
news = df[df["Origin"] == "News Report"].Text.tolist()
social = df[df["Origin"] == "Social Media"].Text.tolist()

In [14]:
print(news[0][0:50])

An Ohio school district closed schools today after


In [15]:
# Clean up our two lists
news_strings = [string_test(i) for i in news]
social_strings = [string_test(i) for i in social]

In [None]:
# Eliminate carriage returns
# legends = []
# for string in strings:
#     string = string.replace(u'\xa0', u' ')
#     legends.append(string)

In [23]:
# Establish which parser spacy is going to use
nlp = spacy.load('en_core_web_sm') # More common is "en_core_web_sm"

In [24]:
news_docs = [nlp(i) for i in news_strings]
social_docs = [nlp(i) for i in social_strings]

In [38]:
# print([token.text for token in docs[0]])

sentences = list(social_docs[0].sents) # spacy's .sents method creates a generator
print(sentences[3:10])



In [42]:
displacy.render(sentences[4], style='dep')

In [53]:
Path.open("images/sentence-4.svg").write(svg)

AttributeError: 'str' object has no attribute '_closed'

In [52]:
for sentence in sentences[4]:
#     doc = nlp(sentence)
    svg = displacy.render(sentence, style="dep", jupyter=False)
    file_name = 'sentence-' + ".svg"
    output_path = Path("images/" + file_name)
    output_path.open("w", encoding="utf-8").write(svg)

TypeError: 'spacy.tokens.token.Token' object is not iterable

In [28]:
from spacy.lang.en import English
# parser = English()
parser = spacy.load('en', disable=['ner','textcat'])

In [30]:
parse = parser(social[0])
print(findSVOs(parse))

[('i', '!take', 'video'), ('friend', 'sent', 'these'), ('friend', 'sent', 'to'), ('person', 'playing', 'prank')]


In [34]:
for item in social:
    parse = parser(item)
    print(findSVOs(parse))

[('i', '!take', 'video'), ('friend', 'sent', 'these'), ('friend', 'sent', 'to'), ('person', 'playing', 'prank')]
[('you', 'scare', 'someone'), ('batman', 'had', 'enough'), ('he', 'chasing', 'them'), ('clowns', 'giving', 'kids'), ('clowns', 'giving', 'nights'), ('batman', 'taking', 'care')]
[('sightings', 'reported', 'atm')]
[('teenager', 'protect', 'age'), ('i', 'call', 'jake'), ('we', 'heard', 'noise'), ('we', 'saw', 'man'), ('we', 'saw', 'knife'), ('he', 'chasing', 'us'), ('clown', 'chasing', 'us'), ('he', 'followed', 'us'), ('i', 'feel', 'him'), ('he', '!see', 'us'), ('he', 'left', 'bush'), ('we', 'ran', 'pack'), ('we', '!tell', 'them'), ('i', 'said', 'goodbye')]
[('weapon', 'scare', 'clowns'), ('i', 'saw', 'clowns'), ('i', 'introduce', 'you'), ('guys', 'called', 'smith'), ('guys', 'called', 'goldreply'), ('bencoffee', 'get', 'shotgun'), ('you', 'shoot', 'them'), ('you', 'shoot', 'goldreply'), ('i', 'prefer', 'girl'), ('you', 'hold', 'clowns'), ('you', 'fucking', 'pussies')]
[('advi

[('i', 'remember', 'everything'), ('i', 'got', 'invitation'), ('i', 'spend', 'night'), ('i', 'made', 'sure'), ('i', 'head', 'road'), ('things', 'got', 'creepier'), ('he', 'gave', 'me'), ('he', 'gave', 'creepy'), ('friend', 'met', 'me'), ('stuff', 'happening', 'lot'), ('i', 'call', 'friend'), ('i', 'asked', 'him'), ('you', 'expecting', 'anyone'), ('plans', 'cancelled', 'all'), ('parents', 'called', 'me'), ('i', 'telling', 'you'), ('i', 'telling', 'i'), ('i', 'saw', 'thing'), ('sign', 'read', 'candy'), ('whoever', 'follows', 'me'), ('henry', 'saw', 'him'), ('he', 'called', 'parents'), ('parents', 'lock', 'door'), ('henry', 'hung', 'phone'), ('i', 'gave', 'him'), ('i', 'gave', 'nod'), ('me', 'clean', 'place'), ('day', 'helping', 'him'), ('i', 'saw', 'sign'), ('he', 'left', 'sign'), ('we', 'locked', 'everything'), ('thought', 'gave', 'us'), ('thought', 'gave', 'chills'), ('henry', 'asked', 'me'), ('he', 'thanked', 'me'), ('i', 'told', 'henry'), ('i', 'told', 'him'), ('i', 'asked', 'him'), 