In [1]:
import re
from collections import defaultdict

import pandas as pd
import spacy
from IPython.core.display import HTML
from bs4 import BeautifulSoup
from markdown import markdown
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
# Read the first 1000 lines of the JSON-lines file and keep only the columns
# listed below.
df = pd.read_json(
    '../data/interim/labeling_contributions_preprocessed_no_bot.jsonl',
    orient='records',
    lines=True,
    nrows=1000,
)[[
    'author',
    'created_utc',
    'permalink',
    'retrieved_on',
    'rte_mode',
    'score',
    'subreddit',
    'subreddit_type',
    'contribution_type',
    'text',
    'fullname',
    'parent_fullname',
    'link_fullname',
    'preprocessed_text',
    'processed_text',
]]

In [3]:
def plain_text_without_quotes(txt):
    """Convert a markdown contribution to plain text without code and quotes.

    The markdown is rendered to HTML so that BeautifulSoup can extract the
    text. Code (``<pre>`` blocks and inline ``<code>`` elements) is replaced
    by a single space, and every paragraph whose text starts with ``>`` is
    replaced by the placeholder ``...[QUOTE]...``.

    :param txt: markdown source of the contribution
    :return: the remaining text nodes joined with newlines
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(txt)

    # remove code snippets; DOTALL is needed because code blocks usually span
    # several lines and '.' does not match newlines by default
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # the closing tag must be spelled '</code>' (no space) to ever match
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    for paragraph in soup.find_all('p'):
        if paragraph.text.strip().startswith('>'):
            paragraph.replace_with('...[QUOTE]...')

    return '\n'.join(soup.find_all(string=True))


In [4]:
# Strip code and quoted paragraphs from every contribution's raw text.
df['dequoted_text'] = df['text'].apply(plain_text_without_quotes)

In [5]:
# Load the en_core_web_lg pipeline; the matcher is built on its vocabulary.
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

# Rule "conspiracy_labeling": the labeling terms, matched on their lemmas.
matcher.add(
    "conspiracy_labeling",
    [
        [{"lemma": "conspiracy"}, {"lemma": "theorist"}],
        [{"lemma": "conspiracist"}],
    ],
)

# Rule "pron_poss": single tokens carrying one of the tags below, or the
# dependency label 'poss'.
# 17.	POS	Possessive ending
# 18.	PRP	Personal pronoun
# 19.	PRP$	Possessive pronoun
# 34.	WP	Wh-pronoun
# 35.	WP$	Possessive wh-pronoun
# to consider:
# 33.	WDT	Wh-determiner
matcher.add(
    'pron_poss',
    [[{"TAG": tag}] for tag in ('PRP', 'PRP$', 'POS', 'WP', 'WP$')]
    + [[{"DEP": 'poss'}]],
)


cases:
- you conspiracy theorist
- your conspiracy theorist friends
- you are a conspiracy theorist
- are you a conspiracy theorist?
- I'll call you a conspiracy theorist
- you're not a conspiracy theorist but...
- they are conspiracy theorists like you

outside the pattern:
- Everyone who says otherwise is an extremist  conspiracy theorist.

In [6]:
# Hand-written sentences in which a pronoun or possessive co-occurs with the
# "conspiracy theorist" label; they mirror the cases listed in the notes above.
example_sentences = [
    "you conspiracy theorist",
    "your conspiracy theorist friends",
    "you are a conspiracy theorist",
    "are you a conspiracy theorist?",
    "I'll call you a conspiracy theorist",
    "you're not a conspiracy theorist but...",
    "they are conspiracy theorists like you",
]



In [9]:
# For every contribution, print each pronoun/possessive match that shares a
# sentence with a "conspiracy_labeling" match.
# nlp.pipe streams the texts through the pipeline instead of calling nlp()
# once per text.
for doc in nlp.pipe(df.dequoted_text):
    matches = matcher(doc)

    # Group the matched spans by the name of the rule that produced them.
    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        match_dict[string_id].append(span)

    for labeling_span in match_dict['conspiracy_labeling']:
        sent = labeling_span.sent
        for pron_span in match_dict['pron_poss']:
            # Only pronouns in the same sentence as the label are of interest.
            # (The LCA matrix that used to be computed here was never read, so
            # it is no longer calculated for every pronoun/label pair.)
            if sent == pron_span.sent:
                print(f"pron:{pron_span}: {sent}")


pron:It: It sounds like this is what many of the conspiracy theorists are suggesting.  
pron:what: It sounds like this is what many of the conspiracy theorists are suggesting.  
pron:I: I used to ignore conspiracy theorists
pron:I: (I did create /r/fuckyouall)


Indifference to the opinions of others (except conspiracy theorists)


A tendency to argue (
pron:What: What is someone as an advocate of liberty and limited government to do in a political environment who's only allies include jackass conspiracy theorist religious fundamentalists who seem to hate homosexuals and the idea (edit for grammar) of a  secular country to do?
pron:who: What is someone as an advocate of liberty and limited government to do in a political environment who's only allies include jackass conspiracy theorist religious fundamentalists who seem to hate homosexuals and the idea (edit for grammar) of a  secular country to do?
pron:who: What is someone as an advocate of liberty and limited government to do in a p

KeyboardInterrupt: 

In [13]:
# Inspect the variables left bound by the most recent run of the matching
# loop (they are loop variables, not defined in this cell).
pron_span, labeling_span, sent

(who,
 conspiracy theorist,
 Everyone who says otherwise is an extremist  conspiracy theorist.)

In [14]:
# Root token of the labeling span left over from the matching loop. There is
# no trailing comma, so the cell displays the token itself rather than a
# one-element tuple.
labeling_span.root

theorist

In [45]:
# Print the noun chunks of a sample sentence. Note that this rebinds `doc`,
# and `chunk` stays bound to the last noun chunk once the loop has finished.
doc = nlp("I'm not a conspiracy theorist but I believe in ufos")
for chunk in doc.noun_chunks:
    print(chunk)

I
a conspiracy theorist
I
ufos


In [65]:
# Show the sentence that contains the root of `chunk`, followed by that
# sentence's lowest-common-ancestor matrix.
chunk_sentence = chunk.root.sent
print(chunk_sentence, chunk_sentence.get_lca_matrix())

but I believe in ufos [[0 2 2 2 2]
 [2 1 2 2 2]
 [2 2 2 2 2]
 [2 2 2 3 3]
 [2 2 2 3 4]]


In [None]:
# find the labeling span
# get the root of the span
# get the sentence
# find all pronouns (you)/det poss (your)/propn? (yours)
# find the common ancestor between pronouns* and labeling root
# check for verbs in the middle; look for negated

In [10]:
# Draw the dependency parse of the example sentence that falls outside the
# pattern.
options = dict(compact=True, bg="#09a3d5", color="white", font="Source Sans Pro")
displacy.render(
    nlp('Everyone who says otherwise is an extremist  conspiracy theorist.'),
    style="dep",
    options=options,
)


In [24]:
# Draw the dependency parse of the sentence containing `span`, the span left
# bound by the matching loop.
options = dict(compact=True, bg="#09a3d5", color="white", font="Source Sans Pro")
displacy.render(
    span.sent,
    style="dep",
    options=options,
)


In [29]:
# Does the sentence containing `span` have a token whose lower-cased text is
# "you"?
sent = span.sent
any(tok.text.lower() == "you" for tok in sent)