In [1]:
import re
from collections import defaultdict

import pandas as pd
import spacy
from IPython.core.display import HTML
from bs4 import BeautifulSoup
from markdown import markdown
from spacy import displacy
from spacy.matcher import Matcher

In [7]:
# spacy.cli.download('en_core_web_lg')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# df = pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl', orient='records', lines=True)[[ 'author', 'created_utc',
#        'permalink', 'retrieved_on', 'rte_mode', 'score',
#        'subreddit',
#        'subreddit_type',
#        'contribution_type', 'text', 'fullname', 'parent_fullname',
#        'link_fullname', 'preprocessed_text', 'processed_text']]
# df = df[df.contribution_type=='comment'] # limit to comments
with pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl', orient='records', lines=True, chunksize=500) as reader:
    df = pd.concat(chunk[chunk.contribution_type=='comment'][[ 'author', 'created_utc',
           'subreddit','text', 'fullname', 'parent_fullname',
           'link_fullname',]] for chunk in reader)

In [4]:
def plain_text_without_quotes(txt):
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(txt)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    for quote in filter(lambda p: p.text.strip().startswith('>'),
                        soup.find_all('p')):
        quote.replaceWith('...[QUOTE]...')

    return '\n'.join(soup.findAll(string=True))


In [5]:
df['dequoted_text'] = df.text.apply(plain_text_without_quotes)

In [8]:
nlp = spacy.load("en_core_web_lg")

matcher = Matcher(nlp.vocab)
matcher.add("conspiracy_labeling", [[{"lemma": "conspiracy"}, {"lemma": "theorist"}],
                           [{"lemma": "conspiracist"}],
                           ])

matcher.add('pron_poss', [[{"TAG":'PRP'}],
                          [{"TAG":'PRP$'}],
                          # [{"TAG":'POS'}],
                          [{"TAG":'WP'}],
                          [{"TAG":'WP$'}],
                          [{"DEP":'poss', 'TAG':'DET'}],
                          ])
# 18.	PRP	Personal pronoun
# 19.	PRP$	Possessive pronoun
# 34.	WP	Wh-pronoun
# 35.	WP$	Possessive wh-pronoun
# to consider:
# 17.	POS	Possessive ending
# 33.	WDT	Wh-determiner


cases:
- you conspiracy theorist
- your conspiracy theorist friends
- you are a conspiracy theorist
- are you a conspiracy theorist?
- I'll call you a conspiracy theorist
- you're not a conspiracy theorist but...
- they are conspiracy theorists like you

outside the pattern:
- Everyone who says otherwise is an extremist  conspiracy theorist.
- the conspiracy theorist in me
- Meanwhile, if any of you Reddit pharma conspiracy theorist know where I can find busty blondes handing out envelopes of cash in exchange for prescriptions let me know because apparently my attending forgot to tell me.

In [9]:
example_sentences = [
    "you conspiracy theorist",
    "your conspiracy theorist friends",
    "you are a conspiracy theorist",
    "are you a conspiracy theorist?",
    "I'll call you a conspiracy theorist",
    "you're not a conspiracy theorist but...",
    "they are conspiracy theorists like you",
]



In [10]:
len(df)

1132689

strategy:
1. 1st/2nd pp and labeling in same sentence
2. ...and pp is closest by sentence idx
3. ...and pp is closest by dep links
4. ...and are tied by verb, or poss (your ct friends), or ...(as a ct I, like a ct I)
5. coref group

In [32]:
strategy_1 = list()
strategy_2 = list()
strategy_3 = list()

sample = df[['dequoted_text', 'fullname']].sample(1000)
for _, row in sample.iterrows():
    doc = nlp(row.dequoted_text)
    matches = matcher(doc)

    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        match_dict[string_id].append(span)
    for labeling_span in match_dict['conspiracy_labeling']:
        sent = labeling_span.sent
        related_pron_spans = list()
        min_linear_distance_pron_span = None
        min_linear_distance_pron_subtree = None
        min_linear_distance = None
        min_dependency_distance_pron_span = None
        min_dependency_distance_pron_subtree = None
        min_dependency_distance = None
        for pron_span in match_dict['pron_poss']:
            if (sent == pron_span.sent) and (pron_span.root.morph.to_dict().get('Person', None) in {'1', '2'}):
                lca_mat = pron_span.root.sent.get_lca_matrix()
                lca_idx = lca_mat[pron_span.root.i-sent.start, labeling_span.root.i-sent.start]
                subtree = list(sent[lca_idx].subtree)
                distance_linear = abs(pron_span.root.i-labeling_span.root.i)
                distance_dependency = 0
                for tok in (pron_span.root, labeling_span.root):
                    for ancestor in tok.ancestors:
                        distance_dependency+=1
                        if tok.i-sent.start==lca_idx:
                            break
                if (min_linear_distance is None) or (distance_linear<min_linear_distance):
                    min_linear_distance=distance_linear
                    min_linear_distance_pron_span=pron_span
                    min_linear_distance_pron_subtree=subtree

                if (min_dependency_distance is None) or (distance_dependency<min_dependency_distance):
                    min_dependency_distance=distance_dependency
                    min_dependency_distance_pron_span=pron_span
                    min_dependency_distance_pron_subtree=subtree

                strategy_1.append((pron_span, subtree, row.fullname))
        if min_linear_distance is not None:
            strategy_2.append((min_linear_distance_pron_span, min_linear_distance_pron_subtree, row.fullname))
        if min_dependency_distance is not None:
            strategy_3.append((min_dependency_distance_pron_span, min_dependency_distance_pron_subtree, row.fullname))

In [33]:
fullnames_strategy_1 = set(i[2] for i in strategy_1 if any(tok.morph.to_dict().get('Person', None) =='2' for tok in i[0].sent))
fullnames_strategy_2 = set(i[2] for i in strategy_2 if i[0].root.morph.to_dict().get('Person', None) =='2')
fullnames_strategy_3 = set(i[2] for i in strategy_3 if i[0].root.morph.to_dict().get('Person', None) =='2')

In [57]:
all_othering = df[df.fullname.isin(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))].copy()

In [58]:
all_othering['linear_you'] = all_othering.fullname.isin(fullnames_strategy_2)
all_othering['dependency_you'] = all_othering.fullname.isin(fullnames_strategy_3)

In [59]:
def get_permalink(contribution):
    # print(contribution)
    return f'https://new.reddit.com/r/{contribution.subreddit}/comments/{contribution.link_fullname.split("_")[-1]}/comment/{contribution.fullname.split("_")[-1]}/'


all_othering['permalink'] = all_othering.apply(get_permalink, axis=1)
all_othering['othering'] = True

In [60]:
len(fullnames_strategy_1), len(fullnames_strategy_2), len(fullnames_strategy_3), len(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))

(237, 203, 196, 237)

In [61]:
non_othering = df[(~df.fullname.isin(fullnames_strategy_1))&((df.fullname.isin(set(sample.fullname))))].copy()
non_othering['permalink'] = non_othering.apply(get_permalink, axis=1)
non_othering['othering'] = False

In [63]:
to_annotate = pd.concat((all_othering[(~all_othering.dependency_you)&(~all_othering.linear_you)].sample(20),
           all_othering[(all_othering.dependency_you)&(~all_othering.linear_you)],
           all_othering[(~all_othering.dependency_you)&(all_othering.linear_you)],
           all_othering[(all_othering.dependency_you)&(all_othering.linear_you)].sample(20),
           non_othering.sample(20)
           ))[['permalink', 'othering', 'dependency_you', 'linear_you', 'subreddit', 'text']]

In [66]:
to_annotate.to_csv('../data/interim/labeling_sample_strategies1--3.csv')

In [34]:
for pron_span, subtree, fullname in strategy_3:
    # print(list(subtree))
    print(f"{pron_span}:{pron_span.root.tag_}:{pron_span.root.dep_}:{pron_span.root.morph.to_dict().get('Person', None)} {''.join(i.text_with_ws for i in subtree).strip()}")


yourself:PRP:pobj:2 When talking with others do not refer to yourself as a "conspiracy theorist."
me:PRP:dobj:1 why it always infuriates me whenever I see a 9/11 conspiracy theorist set up in a little tent with several obscure shots of the WTC/former WTC site with blurry pictures of supposed "Factual evidence"
I:PRP:nsubj:1 Im very blind when it comes to sepcific proven health risks of these, as all I ever see are conspiracy theorists yelling about Global depopulation of America.
I:PRP:nsubj:1 For the record, I am not a conspiracy theorist.
I:PRP:nsubj:1 One last thing, as I said before, I am not a conspiracy theorist.
we:PRP:nsubj:1 In 1993, the Israelis were responsible for the bombing of the World Trade Center and that kind of stuff..."


/r/Israel, do we have stealth antisemitic conspiracy theorist on our hands
us:PRP:dobj:1 label us as conspiracy theorists and traitors
I:PRP:nsubj:1 Edit
 I find this subreddit so interesting...while being superficial, it's also extremely 
honest
 

In [64]:
# find the labeling span
# get the root of the span
# get the sentence
# find all pronouns (you)/det poss (your)/propn? (yours)
# find the common ancestor between pronouns* and labeling root
# check for verbs in the middle; look for negated

In [65]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.render(pron_span.sent, style="dep", options=options)
