In [1]:
import re
from collections import defaultdict

import pandas as pd
import spacy
from IPython.core.display import HTML
from bs4 import BeautifulSoup
from markdown import markdown
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
# spacy.cli.download('en_core_web_lg')

In [None]:
df = pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl', orient='records', lines=True)[[ 'author', 'created_utc',
       'permalink', 'retrieved_on', 'rte_mode', 'score',
       'subreddit',
       'subreddit_type',
       'contribution_type', 'text', 'fullname', 'parent_fullname',
       'link_fullname', 'preprocessed_text', 'processed_text']]

In [None]:
df = df[df.contribution_type=='comment'] # limit to comments

In [None]:
def plain_text_without_quotes(txt):
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(txt)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    for quote in filter(lambda p: p.text.strip().startswith('>'),
                        soup.find_all('p')):
        quote.replaceWith('...[QUOTE]...')

    return '\n'.join(soup.findAll(string=True))


In [None]:
df['dequoted_text'] = df.text.apply(plain_text_without_quotes)

In [None]:
nlp = spacy.load("en_core_web_lg")

matcher = Matcher(nlp.vocab)
matcher.add("conspiracy_labeling", [[{"lemma": "conspiracy"}, {"lemma": "theorist"}],
                           [{"lemma": "conspiracist"}],
                           ])

matcher.add('pron_poss', [[{"TAG":'PRP'}],
                          [{"TAG":'PRP$'}],
                          # [{"TAG":'POS'}],
                          [{"TAG":'WP'}],
                          [{"TAG":'WP$'}],
                          [{"DEP":'poss', 'TAG':'DET'}],
                          ])
# 18.	PRP	Personal pronoun
# 19.	PRP$	Possessive pronoun
# 34.	WP	Wh-pronoun
# 35.	WP$	Possessive wh-pronoun
# to consider:
# 17.	POS	Possessive ending
# 33.	WDT	Wh-determiner


cases:
- you conspiracy theorist
- your conspiracy theorist friends
- you are a conspiracy theorist
- are you a conspiracy theorist?
- I'll call you a conspiracy theorist
- you're not a conspiracy theorist but...
- they are conspiracy theorists like you

outside the pattern:
- Everyone who says otherwise is an extremist  conspiracy theorist.
- the conspiracy theorist in me
- Meanwhile, if any of you Reddit pharma conspiracy theorist know where I can find busty blondes handing out envelopes of cash in exchange for prescriptions let me know because apparently my attending forgot to tell me.

In [None]:
example_sentences = [
    "you conspiracy theorist",
    "your conspiracy theorist friends",
    "you are a conspiracy theorist",
    "are you a conspiracy theorist?",
    "I'll call you a conspiracy theorist",
    "you're not a conspiracy theorist but...",
    "they are conspiracy theorists like you",
]



strategy:
1. 1st/2nd pp and labeling in same sentence
2. ...and pp is closest by sentence idx
3. ...and pp is closest by dep links
4. ...and are tied by verb, or poss (your ct friends), or ...(as a ct I, like a ct I)
5. coref group

In [79]:
strategy_1 = list()
strategy_2 = list()
strategy_3 = list()


for _, row in df[['dequoted_text', 'fullname']].iterrows():
    doc = nlp(row.dequoted_text)
    matches = matcher(doc)

    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        match_dict[string_id].append(span)
    for labeling_span in match_dict['conspiracy_labeling']:
        sent = labeling_span.sent
        related_pron_spans = list()
        min_linear_distance_pron_span = None
        min_linear_distance_pron_subtree = None
        min_linear_distance = None
        min_dependency_distance_pron_span = None
        min_dependency_distance_pron_subtree = None
        min_dependency_distance = None
        for pron_span in match_dict['pron_poss']:
            if (sent == pron_span.sent) and (pron_span.root.morph.to_dict().get('Person', None) in {'1', '2'}):
                lca_mat = pron_span.root.sent.get_lca_matrix()
                lca_idx = lca_mat[pron_span.root.i-sent.start, labeling_span.root.i-sent.start]
                subtree = list(sent[lca_idx].subtree)
                distance_linear = abs(pron_span.root.i-labeling_span.root.i)
                distance_dependency = 0
                for tok in (pron_span.root, labeling_span.root):
                    for ancestor in tok.ancestors:
                        distance_dependency+=1
                        if tok.i-sent.start==lca_idx:
                            break
                if (min_linear_distance is None) or (distance_linear<min_linear_distance):
                    min_linear_distance=distance_linear
                    min_linear_distance_pron_span=pron_span
                    min_linear_distance_pron_subtree=subtree

                if (min_dependency_distance is None) or (distance_dependency<min_dependency_distance):
                    min_dependency_distance=distance_dependency
                    min_dependency_distance_pron_span=pron_span
                    min_dependency_distance_pron_subtree=subtree

                strategy_1.append((pron_span, subtree, row.fullname))
        if min_linear_distance is not None:
            strategy_2.append((min_linear_distance_pron_span, min_linear_distance_pron_subtree, row.fullname))
        if min_dependency_distance is not None:
            strategy_3.append((min_dependency_distance_pron_span, min_dependency_distance_pron_subtree, row.fullname))

In [94]:
fullnames_strategy_1 = set(i[2] for i in strategy_1 if any(tok.morph.to_dict().get('Person', None) =='2' for tok in i[0].sent))
fullnames_strategy_2 = set(i[2] for i in strategy_2 if i[0].root.morph.to_dict().get('Person', None) =='2')
fullnames_strategy_3 = set(i[2] for i in strategy_3 if i[0].root.morph.to_dict().get('Person', None) =='2')

In [100]:
all_othering = df[df.fullname.isin(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))].copy()

In [101]:
all_othering['linear_you'] = all_othering.fullname.isin(strategy_2)
all_othering['dependency_you'] = all_othering.fullname.isin(strategy_3)

In [102]:
def get_permalink(contribution):
    if 'permalink' in contribution:
        return 'https://reddit.com'+contribution['permalink']
    elif contribution['contribution_type']=='comment':
        return f'https://new.reddit.com/r/{contribution["subreddit"]}/comments/{contribution["link_fullname"].split("_")[-1]}/comment/{contribution["fullname"].split("_")[-1]}/'
    else:
        return f'https://new.reddit.com/r/{contribution["subreddit"]}/comments/{contribution["fullname"].split("_")[-1]}/'


all_othering['permalink'] = all_othering.permalink.apply(lambda x:'https://reddit.com'+x)

In [None]:
all_othering[['permalink', 'dependency_you', 'linear_you', 'contribution_type', 'subreddit', 'text']]

Unnamed: 0,permalink,dependency_you,linear_you,contribution_type,subreddit,text
0,https://reddit.com/r/AskReddit/comments/8w5ah/...,False,False,selftext_submission,AskReddit,"Am I a paranoid schizophrenic, or is the NWO p..."
1,https://reddit.com/r/entertainment/comments/9l...,False,False,selftext_submission,entertainment,"Hey, Dan Brown, Call Me\nYes, I'm one of the m..."
5,https://reddit.com/r/IAmA/comments/afjtr/i_am_...,False,False,selftext_submission,IAmA,I am a blogger being investigated by the FBI. ...
7,https://reddit.com/r/Libertarian/comments/ag77...,False,False,selftext_submission,Libertarian,"Libertarians of Reddit, what did you think abo..."
8,https://reddit.com/r/reddit.com/comments/ahb9i...,False,False,selftext_submission,reddit.com,FAKE SMOKING GUN: FBI admitted no such fakery ...
...,...,...,...,...,...,...
990,https://reddit.com/r/FaithInFireMC/comments/1z...,False,False,selftext_submission,FaithInFireMC,Let's get to know each other!\nSup guys Hanzo ...
991,https://reddit.com/r/subredditoftheday/comment...,False,False,selftext_submission,subredditoftheday,"March 8th, 2014 - /r/Conspiracy. Darkness cann..."
992,https://reddit.com/r/privacy/comments/1zy9ix/a...,False,False,selftext_submission,privacy,Advise me on forming university org. that exis...
994,https://reddit.com/r/conspiracy/comments/1zz38...,False,False,selftext_submission,conspiracy,"""First, please be assured that the NSA does no..."


In [None]:
def get_permalink(contribution):
    if 'permalink' in contribution:
        return 'https://reddit.com'+contribution['permalink']
    elif contribution['contribution_type']=='comment':
        return f'https://new.reddit.com/r/{contribution["subreddit"]}/comments/{contribution["link_fullname"].split("_")[-1]}/comment/{contribution["fullname"].split("_")[-1]}/'
    else:
        return f'https://new.reddit.com/r/{contribution["subreddit"]}/comments/{contribution["fullname"].split("_")[-1]}/'
        contributions.append(dict(url=get_permalink(contribution),
                                  text=contribution['text'],
                                  contribution_type=contribution['contribution_type'],
                                  subreddit=contribution.get('subreddit', None)))


In [83]:
len(fullnames_strategy_1), len(fullnames_strategy_2), len(fullnames_strategy_3), len(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))

(161, 124, 102, 161)

In [75]:
print('\n*************************************\n'.join(df[df.fullname.isin(fullnames_strategy_3.symmetric_difference(fullnames_strategy_1))].dequoted_text))

A serious discussion
Ladies and gents, I'm sure we've all had our own personal experiences with conspiracy theorists at one time or another and, like me, I'm sure you've noticed the growing conspiracy-related narrative within society.


Things like the NSA scandal and such only serve to apparently serve their own world view. This we must admit.


I've watched as good friends of mine turned into a bunch of deluded halfwits, dragged in by shit like Zeitgeist and such. I've even had a friend of mine disappear from my life completely because a mass difference of opinions. (More his opinions and the rest of the world's facts...)


But I must propose the question; what are the long term effects of buying into this crap for our societies? And in addition to this, what possible outcomes do you see from the increasing number of loonies?
*************************************
What are your views on /r/conspiratard?
I post on /r/conspiratard and became interested in conspiracist thinking when my m

In [34]:
for pron_span, subtree, fullname in strategy_3:
    # print(list(subtree))
    print(f"{pron_span}:{pron_span.root.tag_}:{pron_span.root.dep_}:{pron_span.root.morph.to_dict().get('Person', None)} {''.join(i.text_with_ws for i in subtree).strip()}")


yourself:PRP:pobj:2 When talking with others do not refer to yourself as a "conspiracy theorist."
me:PRP:dobj:1 why it always infuriates me whenever I see a 9/11 conspiracy theorist set up in a little tent with several obscure shots of the WTC/former WTC site with blurry pictures of supposed "Factual evidence"
I:PRP:nsubj:1 Im very blind when it comes to sepcific proven health risks of these, as all I ever see are conspiracy theorists yelling about Global depopulation of America.
I:PRP:nsubj:1 For the record, I am not a conspiracy theorist.
I:PRP:nsubj:1 One last thing, as I said before, I am not a conspiracy theorist.
we:PRP:nsubj:1 In 1993, the Israelis were responsible for the bombing of the World Trade Center and that kind of stuff..."


/r/Israel, do we have stealth antisemitic conspiracy theorist on our hands
us:PRP:dobj:1 label us as conspiracy theorists and traitors
I:PRP:nsubj:1 Edit
 I find this subreddit so interesting...while being superficial, it's also extremely 
honest
 

In [28]:
df.head()

Unnamed: 0,author,created_utc,permalink,retrieved_on,rte_mode,score,subreddit,subreddit_type,contribution_type,text,fullname,parent_fullname,link_fullname,preprocessed_text,processed_text,dequoted_text
0,ithkuil,1246093429,/r/AskReddit/comments/8w5ah/am_i_a_paranoid_sc...,1522804000.0,markdown,9,AskReddit,public,selftext_submission,"Am I a paranoid schizophrenic, or is the NWO p...",t3_8w5ah,,t3_8w5ah,"Am I a paranoid schizophrenic, or is the NWO p...",i paranoid schizophrenic nwo prepare million p...,"Am I a paranoid schizophrenic, or is the NWO p..."
1,LarkinVolpatt,1253124031,/r/entertainment/comments/9l6xe/hey_dan_brown_...,1522822000.0,markdown,0,entertainment,public,selftext_submission,"Hey, Dan Brown, Call Me\nYes, I'm one of the m...",t3_9l6xe,,t3_9l6xe,"Hey, Dan Brown, Call Me Yes, I'm one of the mi...",hey dan brown me yes i million eager sucker di...,"Hey, Dan Brown, Call Me\nYes, I'm one of the m..."
2,cojoco,1253496526,/r/conspiracy/comments/9mgv8/whats_with_all_th...,1522823000.0,markdown,3,conspiracy,public,selftext_submission,What's with all the YouTube videos? Can't any...,t3_9mgv8,,t3_9mgv8,What's with all the YouTube videos? Can't any ...,youtube video you guy read i absolutely loathe...,What's with all the YouTube videos? Can't any...
3,YGOfvn,1254117557,/r/Indiekidslaff/comments/9opit/should_i_be_sc...,1522824000.0,markdown,1,Indiekidslaff,restricted,selftext_submission,Should I be scared of DARPA?\nhttp://en.wikipe...,t3_9opit,,t3_9opit,Should I be scared of DARPA? I was reading up ...,should i scare darpa i read darpa yesterday i ...,Should I be scared of DARPA?\nhttp://en.wikipe...
4,Biff_Bifferson,1260843119,/r/politics/comments/aepvd/what_is_a_conservat...,1522867000.0,markdown,0,politics,public,selftext_submission,What is a conservative atheist to do?\nI reali...,t3_aepvd,,t3_aepvd,What is a conservative atheist to do? I realiz...,conservative atheist i realize reddit liberal ...,What is a conservative atheist to do?\nI reali...


In [70]:
doc = nlp("I'm not a conspiracy theorist but I believe in ufos")
for chunk in doc.noun_chunks:
    print(chunk)

{74: 95, 'PronType_prs': True}

In [75]:
print(chunk.root.sent, chunk.root.sent.get_lca_matrix())

'2.3.9'

In [40]:
# find the labeling span
# get the root of the span
# get the sentence
# find all pronouns (you)/det poss (your)/propn? (yours)
# find the common ancestor between pronouns* and labeling root
# check for verbs in the middle; look for negated

anyone
who
the real Alex jones
he
a conspiracy theorist
the main suspects
name
Alex jones
the father
a doomsday prepper
anyone
I


In [41]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.render(nlp('Everyone who says otherwise is an extremist  conspiracy theorist.'), style="dep", options=options)


knows INTRANVERB
is INTRANVERB
's INTRANVERB
pointing INTRANVERB
is INTRANVERB
is INTRANVERB
does INTRANVERB
feel INTRANVERB
am INTRANVERB
being INTRANVERB


In [53]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.render(span.sent, style="dep", options=options)


In [29]:
sent = span.sent
"you" in {tok.text.lower() for tok in sent}