In [1]:
import re
from collections import defaultdict

import pandas as pd
import spacy
from IPython.core.display import HTML
from bs4 import BeautifulSoup
from markdown import markdown
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
# One-time setup: uncomment and run to download the `en_core_web_lg` model
# that `spacy.load` needs further down in this notebook.
# spacy.cli.download('en_core_web_lg')

In [3]:
# Columns carried forward from the preprocessed contributions file, in display order.
kept_columns = [
    'author',
    'created_utc',
    'permalink',
    'retrieved_on',
    'rte_mode',
    'score',
    'subreddit',
    'subreddit_type',
    'contribution_type',
    'text',
    'fullname',
    'parent_fullname',
    'link_fullname',
    'preprocessed_text',
    'processed_text',
]

# Read only the first 1000 JSON-lines records, then narrow to the kept columns.
df = pd.read_json(
    '../data/interim/labeling_contributions_preprocessed_no_bot.jsonl',
    orient='records',
    nrows=1000,
    lines=True,
)[kept_columns]

In [4]:
# Matches a whole <pre>...</pre> or <code>...</code> element, including ones that
# span several lines (DOTALL) or carry attributes on the opening tag.
_CODE_ELEMENT_RE = re.compile(r'<(pre|code)\b[^>]*>.*?</\1>', flags=re.DOTALL)


def plain_text_without_quotes(txt):
    """Convert markdown to plain text, dropping code and masking quoted paragraphs.

    The markdown is rendered to HTML first because BeautifulSoup can then
    extract the text cleanly.

    Parameters
    ----------
    txt : str
        Markdown source of a single contribution.

    Returns
    -------
    str
        The text nodes of the rendered HTML joined with newlines. Code
        elements are replaced by a single space and every paragraph whose
        text starts with '>' is replaced by the marker '...[QUOTE]...'.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(txt)

    # remove code snippets (block-level <pre> as well as inline <code>)
    html = _CODE_ELEMENT_RE.sub(' ', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    for paragraph in soup.find_all('p'):
        # A paragraph still starting with '>' after rendering is treated as a quote.
        if paragraph.text.strip().startswith('>'):
            paragraph.replace_with('...[QUOTE]...')

    return '\n'.join(soup.find_all(string=True))


In [5]:
# Plain-text version of each contribution with code removed and quotes masked.
df['dequoted_text'] = df['text'].apply(plain_text_without_quotes)

In [9]:
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

# Labeling terms: the two-token sequence "conspiracy theorist" or the single
# token "conspiracist", both matched on lemmas.
labeling_patterns = [
    [{"lemma": "conspiracy"}, {"lemma": "theorist"}],
    [{"lemma": "conspiracist"}],
]
matcher.add("conspiracy_labeling", labeling_patterns)

# One single-token pattern per fine-grained tag of interest.
# The possessive-ending tag 'POS' is deliberately not included yet.
pronoun_patterns = [[{"TAG": tag}] for tag in ('PRP', 'PRP$', 'WP', 'WP$')]
# NOTE(review): 'DET' looks like a coarse part-of-speech label rather than a
# fine-grained tag, so this pattern may never match — confirm whether
# 'POS': 'DET' was intended.
pronoun_patterns.append([{"DEP": 'poss', 'TAG': 'DET'}])
matcher.add('pron_poss', pronoun_patterns)
# 18.	PRP	Personal pronoun
# 19.	PRP$	Possessive pronoun
# 34.	WP	Wh-pronoun
# 35.	WP$	Possessive wh-pronoun
# to consider:
# 17.	POS	Possessive ending
# 33.	WDT	Wh-determiner


cases:
- you conspiracy theorist
- your conspiracy theorist friends
- you are a conspiracy theorist
- are you a conspiracy theorist?
- I'll call you a conspiracy theorist
- you're not a conspiracy theorist but...
- they are conspiracy theorists like you

outside the pattern:
- Everyone who says otherwise is an extremist  conspiracy theorist.

In [10]:
# Hand-written sentences, one per pronoun/labeling configuration the matching
# strategies are meant to cover.
example_sentences = [
    "you conspiracy theorist",
    "your conspiracy theorist friends",
    "you are a conspiracy theorist",
    "are you a conspiracy theorist?",
    "I'll call you a conspiracy theorist",
    "you're not a conspiracy theorist but...",
    "they are conspiracy theorists like you",
]



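strategy (pp = personal pronoun, ct = conspiracy theorist):
1. a 1st/2nd-person pp and the labeling occur in the same sentence
2. ...and that pp is the closest one to the labeling by token index in the sentence
3. ...and that pp is the closest one to the labeling by dependency links
4. ...and the two are tied by a verb, or by a possessive (your ct friends), or ...(as a ct I, like a ct I)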

In [24]:
def _arcs_up_to(tok, target_i):
    """Count dependency arcs from `tok` up to the token at document index `target_i`.

    Returns 0 when `tok` is itself the target. If the target is not among
    `tok.ancestors`, the total number of ancestors is returned.
    """
    if tok.i == target_i:
        return 0
    arcs = 0
    for ancestor in tok.ancestors:
        arcs += 1
        if ancestor.i == target_i:
            break
    return arcs


# Each entry is (pronoun span, subtree of the lowest common ancestor of the
# pronoun root and the labeling root).
strategy_1 = []  # every 1st/2nd-person pronoun sharing a sentence with a labeling
strategy_2 = []  # per labeling: the such pronoun closest by token index
strategy_3 = []  # per labeling: the such pronoun closest by dependency path length


for txt in df.dequoted_text.head(100):
    doc = nlp(txt)
    matches = matcher(doc)

    # Group matched spans by the name of the rule that produced them.
    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        match_dict[string_id].append(span)

    for labeling_span in match_dict['conspiracy_labeling']:
        sent = labeling_span.sent
        # The LCA matrix depends only on the sentence, so build it at most once
        # per labeling and only if a qualifying pronoun actually shows up.
        lca_mat = None
        min_linear_distance_pron_span = None
        min_linear_distance_pron_subtree = None
        min_linear_distance = None
        min_dependency_distance_pron_span = None
        min_dependency_distance_pron_subtree = None
        min_dependency_distance = None
        for pron_span in match_dict['pron_poss']:
            if (sent == pron_span.sent) and (pron_span.root.morph.to_dict().get('Person', None) in {'1', '2'}):
                if lca_mat is None:
                    lca_mat = sent.get_lca_matrix()
                # Matrix indices are relative to the sentence start.
                lca_idx = lca_mat[pron_span.root.i-sent.start, labeling_span.root.i-sent.start]
                subtree = list(sent[lca_idx].subtree)
                distance_linear = abs(pron_span.root.i-labeling_span.root.i)
                # Path length through the tree: arcs from each root up to the
                # common ancestor, stopping there rather than at the sentence root.
                lca_doc_i = sent.start + lca_idx
                distance_dependency = (_arcs_up_to(pron_span.root, lca_doc_i)
                                       + _arcs_up_to(labeling_span.root, lca_doc_i))

                if (min_linear_distance is None) or (distance_linear<min_linear_distance):
                    min_linear_distance=distance_linear
                    min_linear_distance_pron_span=pron_span
                    min_linear_distance_pron_subtree=subtree

                if (min_dependency_distance is None) or (distance_dependency<min_dependency_distance):
                    min_dependency_distance=distance_dependency
                    min_dependency_distance_pron_span=pron_span
                    min_dependency_distance_pron_subtree=subtree

                strategy_1.append((pron_span, subtree))
        if min_linear_distance is not None:
            strategy_2.append((min_linear_distance_pron_span, min_linear_distance_pron_subtree))
        if min_dependency_distance is not None:
            strategy_3.append((min_dependency_distance_pron_span, min_dependency_distance_pron_subtree))

In [26]:
# One line per strategy_3 hit: pronoun, its tag, dependency label and person,
# followed by the text of the common-ancestor subtree.
for pron_span, subtree in strategy_3:
    pron_root = pron_span.root
    person = pron_root.morph.to_dict().get('Person', None)
    subtree_text = ''.join(i.text_with_ws for i in subtree).strip()
    print(f"{pron_span}:{pron_root.tag_}:{pron_root.dep_}:{person} {subtree_text}")


I:PRP:nsubj:1 I used to ignore conspiracy theorists but unfortunately, recently, I have started watching their videos and reading their articles.
I:PRP:nsubj:1 (I did create /r/fuckyouall)


Indifference to the opinions of others (except conspiracy theorists)


A tendency to argue (before you try to say I don't tend to argue, consider this: your statements are completely unconnected to reality)
I:PRP:nsubj:1 As an amateur art historian, conspiracy theorist and wanna-be-treasure-hunter I loved sinking my teeth into "The Da Vinci Code" and "Angels and Demons."
I:PRP:nsubj:1 Not that I am a conspiracy theorist, but my own ISP, Time Warner, has H-1Bs that I suspect of targeting me.
me:PRP:pobj:1 Yes, he is a conspiracy theorist, and yes the the trailer quotes "The Onion" but to me it just seems common-sense to invest more in commodities and skills with intrinsic value such as farmland, seeds and gold.
we:PRP:nsubj:1 From 
TwoChoice's history
 we know he's a conspiracy theorist and put on a

In [73]:
# Scratch inspection of the root token of `labeling_span`, a name left over
# from the matching loop; the trailing comma makes the cell value a 1-tuple.
labeling_span.root,

AttributeError: 'spacy.tokens.morphanalysis.MorphAnalysis' object has no attribute 'to_dict'

In [70]:
# Quick look at the noun chunks of a negated self-labeling sentence.
doc = nlp("I'm not a conspiracy theorist but I believe in ufos")
# `chunk` keeps its last value once the loop has finished.
for chunk in doc.noun_chunks:
    print(chunk)

{74: 95, 'PronType_prs': True}

In [75]:
# Show the sentence containing the current `chunk` next to its LCA matrix.
chunk_sentence = chunk.root.sent
print(chunk_sentence, chunk_sentence.get_lca_matrix())

'2.3.9'

In [40]:
# find the labeling span
# get the root of the span
# get the sentence
# find all pronouns (you)/det poss (your)/propn? (yours)
# find the common ancestor between pronouns* and labeling root
# check for verbs in the middle; look for negated

anyone
who
the real Alex jones
he
a conspiracy theorist
the main suspects
name
Alex jones
the father
a doomsday prepper
anyone
I


In [41]:
# Dependency plot of the sentence that falls outside the pronoun pattern.
options = dict(compact=True, bg="#09a3d5", color="white", font="Source Sans Pro")
counter_example_doc = nlp('Everyone who says otherwise is an extremist  conspiracy theorist.')
displacy.render(counter_example_doc, style="dep", options=options)


knows INTRANVERB
is INTRANVERB
's INTRANVERB
pointing INTRANVERB
is INTRANVERB
is INTRANVERB
does INTRANVERB
feel INTRANVERB
am INTRANVERB
being INTRANVERB


In [53]:
# Dependency plot of the sentence containing `span`, which must already be
# defined by an earlier cell.
options = dict(compact=True, bg="#09a3d5", color="white", font="Source Sans Pro")
span_sentence = span.sent
displacy.render(span_sentence, style="dep", options=options)


In [29]:
# Does the sentence containing `span` include the token "you" (case-insensitive)?
sent = span.sent
any(tok.text.lower() == "you" for tok in sent)