In [1]:
import json
import re
from collections import defaultdict

import pandas as pd
import spacy
from IPython.core.display import HTML
from bs4 import BeautifulSoup
from markdown import markdown
from spacy import displacy
from spacy.matcher import Matcher
from textacy.spacier.utils import get_subjects_of_verb, \
    get_objects_of_verb

from src.features.perspective import parse_summary_scores

In [2]:
def get_permalink(contribution):
    # print(contribution)
    return f'https://new.reddit.com/r/{contribution.subreddit}/comments/{contribution.link_fullname.split("_")[-1]}/comment/{contribution.fullname.split("_")[-1]}/'



In [3]:
# spacy.cli.download('en_core_web_lg')

In [4]:
# df = pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl', orient='records', lines=True)[[ 'author', 'created_utc',
#        'permalink', 'retrieved_on', 'rte_mode', 'score',
#        'subreddit',
#        'subreddit_type',
#        'contribution_type', 'text', 'fullname', 'parent_fullname',
#        'link_fullname', 'preprocessed_text', 'processed_text']]
# df = df[df.contribution_type=='comment'] # limit to comments
with pd.read_json('../data/interim/labeling_contributions_preprocessed_no_bot.jsonl', orient='records', lines=True, chunksize=500) as reader:
    df = pd.concat(chunk[chunk.contribution_type=='comment'][[ 'author', 'created_utc',
           'subreddit','text', 'fullname', 'parent_fullname',
           'link_fullname',]] for chunk in reader)

In [5]:
def plain_text_without_quotes(txt):
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(txt)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    for quote in filter(lambda p: p.text.strip().startswith('>'),
                        soup.find_all('p')):
        quote.replaceWith('...[QUOTE]...')

    return '\n'.join(soup.findAll(string=True))


In [6]:
df['dequoted_text'] = df.text.apply(plain_text_without_quotes)

In [7]:
with open('../data/interim/perspective/labeling_contributions_preprocessed_no_bot_perspective.jsonl', encoding='utf8') as f:
    perspectives = dict()
    for obj in map(json.loads, f):
        k, v = list(obj.items())[0]
        perspectives[k] = parse_summary_scores(v)
perspective_df = pd.DataFrame(perspectives).T


with open('../data/interim/liwc/labeling_contributions_preprocessed_no_bot_liwc.jsonl', encoding='utf8') as f:
    liwcs = dict()
    for obj in map(json.loads, f):
        liwcs.update(obj)
liwc_df = pd.DataFrame(liwcs).T
with open('../data/interim/social_dimensions/labeling_contributions_preprocessed_no_bot_social_dimensions.jsonl', encoding='utf8') as f:
    social_dimensions = dict()
    for obj in map(json.loads, f):
        social_dimensions.update(obj)
social_dimensions_df = pd.DataFrame(social_dimensions).T


In [8]:
df = pd.merge(df, perspective_df, how='left', left_on='fullname', right_index=True)
df = pd.merge(df, social_dimensions_df, how='left', left_on='fullname', right_index=True)
df = pd.merge(df, liwc_df, how='left', left_on='fullname', right_index=True)

In [33]:
correlate_df = pd.read_csv('../data/interim/sampling_features/sampling_features.csv')
correlate_df.head()

Unnamed: 0,fullname,attack,toxic,bonding,rational,status
0,t1_c2608,0.336252,0.025763,-1.628081,0.918836,-0.304072
1,t1_c6146,-0.385578,-0.461117,-1.004141,-0.046233,5.74743
2,t1_c17zz,-0.462917,-0.543348,5.070771,2.95226,6.803712
3,t1_c1bq8,0.192129,-0.22403,-0.678638,-1.072955,-2.297167
4,t1_c3xes,-0.119972,-0.51201,-1.940889,-2.064472,-2.300674


In [34]:
correlate_df.set_index('fullname').corr()

Unnamed: 0,attack,toxic,bonding,rational,status
attack,1.0,0.049203,0.150347,-0.092376,-0.155971
toxic,0.049203,1.0,0.123159,-0.016257,-0.009546
bonding,0.150347,0.123159,1.0,0.20525,0.44968
rational,-0.092376,-0.016257,0.20525,1.0,0.220593
status,-0.155971,-0.009546,0.44968,0.220593,1.0


In [35]:
df = pd.merge(df, correlate_df, how='left', left_on='fullname', right_on='fullname')

positive indicators of prev_conspiracist:
- othering
- toxicity
- threat
- likely_to_reject
- insult
- attack_on_commenter
- attack_on_author
-
- social
- you
- social_support
- respect
- sexually_explicit
- death
- hear
- flirtation
- compare


negative indicators:
- conflict
- unsubstantial
- incoherent
- I
-
- differ
- power
- negemo
- space
- relativ
- cogproc
- knowledge

In [9]:
# parent_fullnames = set(df.parent_fullname)
# parents = list()
# for chunk in pd.read_json('../data/interim/labeling_subthread_all_filtered_preprocessed_no_bot.jsonl.zip', compression='zip', chunksize=1000, lines=True):
#     parents.append(chunk[chunk.fullname.isin(parent_fullnames)])

In [10]:
# parent_df = pd.concat(parents)
# len(parent_df)
# to_annotate = pd.merge(df, parent_df[['text', 'fullname']], left_on='parent_fullname', right_on='fullname', suffixes = ['', '_parent'])[['fullname', 'parent_fullname', 'othering', 'dependency_you', 'linear_you', 'subreddit','permalink', 'text', 'text_parent', ]]

In [11]:
# ntile = 4
# df[pd.qcut(df.Social, ntile, labels=range(ntile))==3].head(10).apply(get_permalink, axis=1).tail()

In [12]:
nlp = spacy.load("en_core_web_lg")

matcher = Matcher(nlp.vocab)
matcher.add("conspiracy_labeling", [[{"lemma": "conspiracy"}, {"lemma": "theorist"}],
                           [{"lemma": "conspiracist"}],
                           ])

matcher.add('pron_poss', [[{"TAG":'PRP'}],
                          [{"TAG":'PRP$'}],
                          # [{"TAG":'POS'}],
                          [{"TAG":'WP'}],
                          [{"TAG":'WP$'}],
                          [{"DEP":'poss', 'TAG':'DET'}],
                          ])
# 18.	PRP	Personal pronoun
# 19.	PRP$	Possessive pronoun
# 34.	WP	Wh-pronoun
# 35.	WP$	Possessive wh-pronoun
# to consider:
# 17.	POS	Possessive ending
# 33.	WDT	Wh-determiner


cases:
- you conspiracy theorist
- your conspiracy theorist friends
- you are a conspiracy theorist
- are you a conspiracy theorist?
- I'll call you a conspiracy theorist
- you're not a conspiracy theorist but...
- they are conspiracy theorists like you

outside the pattern:
- Everyone who says otherwise is an extremist  conspiracy theorist.
- the conspiracy theorist in me
- Meanwhile, if any of you Reddit pharma conspiracy theorist know where I can find busty blondes handing out envelopes of cash in exchange for prescriptions let me know because apparently my attending forgot to tell me.

In [13]:
example_sentences = [
    "you conspiracy theorist",
    "your conspiracy theorist friends",
    "you are a conspiracy theorist",
    "are you a conspiracy theorist?",
    "I'll call you a conspiracy theorist",
    "you're not a conspiracy theorist but...",
    "they are conspiracy theorists like you",
]



In [14]:
len(df)

1132689

In [79]:
df['has_quote'] = df.dequoted_text.apply(lambda x: '...[QUOTE]...' in x)

strategy:
1. 1st/2nd pp and labeling in same sentence
2. ...and pp is closest by sentence idx
3. ...and pp is closest by dep links
4. you (part of subj group)... to be (lemma verb) ... (cop) conspiracist
5. ...and are tied by verb, or poss (your ct friends), or ...(as a ct I, like a ct I)
6. coref group
7. you and not I

In [57]:
def build_match_dict(matcher, sent):
    matches = matcher(sent)
    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = sent[start:end]  # The matched span
        match_dict[string_id].append(span)
    return match_dict

In [15]:


strategy_4 = list()
for _, row in df.iterrows():
    doc = nlp(row.dequoted_text)

# for doc in map(nlp, example_sentences):
    for sent in doc.sents:
        # print(sent)
        # print(sent.root.lemma_=='be')
        # print(get_subjects_of_verb(sent.root))
        # print(get_objects_of_verb(sent.root))
        if sent.root.lemma_=='be':
            match_dict = build_match_dict(matcher, sent)
            noun_chunks = defaultdict(set)
            noun_chunks.update(
                {i: n for n in doc.noun_chunks for i in range(n.start, n.end)})
            you_subj = False
            for pron_span in match_dict['pron_poss']:
                if you_subj: break
                if (pron_span.root.morph.to_dict().get('Person', None) == '2'):
                    # print(pron_span, sent)
                    for subj in get_subjects_of_verb(sent.root):
                        subj_noun_chunk = noun_chunks[subj.i]
                        if pron_span.root in subj_noun_chunk:
                            # print('great success', subj_noun_chunk, sent)
                            you_subj=True
                            break
            if not you_subj: continue
            conspiracy_obj = False
            for labeling_span in match_dict['conspiracy_labeling']:
                if conspiracy_obj: break
                for obj in get_objects_of_verb(sent.root):
                    obj_noun_chunk = noun_chunks[obj.i]
                    if labeling_span.root in obj_noun_chunk:
                        # print('wowzies', obj_noun_chunk)
                        strategy_4.append((subj_noun_chunk, sent, row.fullname))
                        conspiracy_obj = True
                        break
            if conspiracy_obj:
                break


In [16]:
len(strategy_4)

6956

In [72]:
def check_conspiracy_morph(doc, matcher=matcher):

    for sent in doc.sents:
        match_dict = build_match_dict(matcher, sent)
        for labeling_span in match_dict['conspiracy_labeling']:
            span_morph = labeling_span[0].morph.to_dict()
            if "Number" in span_morph:
                if span_morph.get('Number', None)=='Sing':
                    return True
            elif labeling_span[-1].text[-1]!='s':
                # print("ok:", labeling_span)
                return True
    # print("not ok:", doc)
    return False
strategy_4_singular = [i for i in strategy_4 if check_conspiracy_morph(i[-2])]

In [73]:
len(strategy_4_singular)

6943

In [118]:
def check_negated_verb(doc, matcher=matcher):
    for sent in doc.sents:
        match_dict = build_match_dict(matcher, sent)
        for labeling_span in match_dict['conspiracy_labeling']:
            root_verb = labeling_span.sent.root
            if root_verb.lemma_=='be':
                if any(map(lambda x: x.dep_=='neg', root_verb.children)):
                    # print(sent)
                    return True

    return False

In [119]:
strategy_4_singular_not_negated = [i for i in strategy_4_singular if not check_negated_verb(i[-2])]

In [120]:
len(strategy_4_singular_not_negated)

6560

In [121]:
strategy_4_df =df[df.fullname.isin(i[-1] for i in strategy_4_singular_not_negated)]
ntile=4


In [130]:
strategy_4_df.sample(100)[['dequoted_text', 'fullname', 'attack', 'toxic', 'bonding', 'rational', 'status']]

Unnamed: 0,dequoted_text,fullname,attack,toxic,bonding,rational,status
258909,You're quite the conspiracy theorist,t1_d8hsbji,0.418672,0.128050,-1.304470,-1.079517,-2.299046
1064604,You're a conspiracy theorist.,t1_ia7kjw9,0.527308,0.258990,-1.303766,-2.074492,-2.298545
1058370,"""If you disagree with me, you're a conspiracy ...",t1_i8opn25,0.641701,0.044014,0.271339,-1.067166,-1.244225
174425,Lol now you all are the conspiracy theorists,t1_dvmjmc7,0.218504,0.053315,-1.304771,-1.084834,-2.300603
252934,Your friend is a conspiracy theorist. There is...,t1_eqsqgnk,0.545408,-0.147588,-0.678676,-2.064361,-2.184377
...,...,...,...,...,...,...,...
807607,Oh you're a conspiracy theorist too. I bet you...,t1_ewx9mzx,0.459887,0.137038,1.219727,-2.074479,-1.302372
298586,"Mate, you're just a paranoid conspiracy theori...",t1_fsmhyta,0.426194,0.334656,2.171381,-2.048995,-2.242527
203098,If you don't like something Monsanto is doing ...,t1_cqmfe4z,0.512655,0.352367,0.271314,-1.070574,-1.244226
93500,"Oh, so on top of being a Muslim 'scholar', you...",t1_d4bw5t1,0.715528,0.798277,-1.302478,-2.062289,-2.242528


In [122]:
strategy_4_df[(pd.qcut(strategy_4_df.attack, ntile, labels=range(ntile))==3)][['dequoted_text', 'fullname', 'attack']].sample(100)

Unnamed: 0,dequoted_text,fullname,attack
888784,Do we really need a new post about this every ...,t1_i6cqlcy,0.751779
476800,Or you are a conspiracy theorist.\n\n\nYou can...,t1_ho6zbdl,0.684076
819638,...[QUOTE]...\n\n\nIf you disagree with the Go...,t1_hgfpg56,0.598882
271585,Your source is a crackpot conspiracy theorist ...,t1_f1di98h,0.671875
525837,And you still are a conspiracy theorist.\n\n\n...,t1_ic3iy94,0.662274
...,...,...,...
632400,So you are basically a conspiracy theorist. Yo...,t1_etewbu0,0.588707
732807,Within the realm of possibility that this is a...,t1_gikfwps,0.725723
107976,"no, now you are the conspiracy theorist for de...",t1_d7umc3x,0.582371
637329,So you're a conspiracy theorist. 👍,t1_ev60gab,0.623355


In [123]:
strategy_4_df[(pd.qcut(strategy_4_df.toxic, ntile, labels=range(ntile))==3)][['dequoted_text', 'fullname', 'toxic']].sample(100)

Unnamed: 0,dequoted_text,fullname,toxic
125808,If you call them out on their bullshit you are...,t1_dekoodu,0.643071
958093,You're a nutbag conspiracy theorist.,t1_gdso9o6,0.608782
922598,"Damn, you're a conspiracy theorist, too?",t1_j6dpkue,0.520222
434840,"You're a weird conspiracist, vaccinated or not...",t1_h3kk2pd,0.789066
113275,"So, you're a parasitic conspiracy theorist? c...",t1_c5nlyfo,0.597926
...,...,...,...
929416,"You're not an adult though, you're a conspirac...",t1_fpyfhke,0.818369
1085299,You're a ridiculous conspiracy theorist who cl...,t1_hvhakho,0.490527
767948,You and everyone you’re citing are just conspi...,t1_gwe2hft,0.781651
947087,You guys are conspiracy theorists looking for ...,t1_gbo5g2u,0.589631


In [132]:
strategy_4_df[(pd.qcut(strategy_4_df.bonding, ntile, labels=range(ntile))==3)][['dequoted_text', 'fullname', 'bonding']].sample(100)

Unnamed: 0,dequoted_text,fullname,bonding
1081451,"...[QUOTE]...\n\n\nNo, you are a conspiracy th...",t1_im5erxc,20.824612
602121,Now you're a conspiracy theorist... I do not r...,t1_eg706xt,2.482450
837972,If you believe that our government is capable ...,t1_fiowy0k,1.845192
724932,You're a conspiracy theorist. And you know not...,t1_gae4lvp,2.482836
823442,"Well, if you don't believe the MSM, then you a...",t1_hhargig,8.424035
...,...,...,...
542291,"No, you are a conspiracy theorist who is doing...",t1_iorgsjg,1.221092
1099201,Either you're a crazy conspiracy theorists who...,t1_hyfcdx0,1.532683
577944,"...[QUOTE]...\n\n\nAh, so you're literally a c...",t1_e8p1kib,6.945836
1072245,"At this point, you're the conspiracy theorist ...",t1_ijrax6m,2.483302


In [143]:
strategy_4_df[(pd.qcut(strategy_4_df.rational, ntile, labels=range(ntile))==0)][['dequoted_text', 'fullname', 'rational']].sample(100)

Unnamed: 0,dequoted_text,fullname,rational
965210,You guys who still worship Pencil Neck Schiff ...,t1_g11kohq,-2.079209
441647,Ok so you're an antisemitic conspiracy theorists,t1_hc8u2aj,-2.075781
724631,You are another RussiaGate conspiracy theorist...,t1_gabmvwe,-2.077758
143566,"Hey, you're the conspiracy theorist here. I'm...",t1_c8fs3o0,-2.074683
36379,"Ha, you are a bigger conspiracy theorist than ...",t1_c16sjfp,-2.072325
...,...,...,...
421385,You're still a conspiracy theorist.,t1_h0y43tj,-2.076005
1019011,Oh God you're just another dumb right-wing con...,t1_h8j3bj9,-2.072391
1047781,You are a walking conspiracy theorist. And cle...,t1_hltko6k,-2.072596
436383,You are a terrible conspiracy theorist,t1_hbb1tk4,-2.077225


In [148]:
strategy_4_df[(pd.qcut(strategy_4_df.status, ntile, labels=range(ntile))==3)][['dequoted_text', 'fullname', 'status']].sample(100)

Unnamed: 0,dequoted_text,fullname,status
170493,Wow you are a conspiracy theorist. Wilson ran ...,t1_co2tkl9,-1.125538
402228,You are all conspiracy theorists. I have a mod...,t1_czlqr9s,-1.244226
360654,Your comment is basically a conspiracy theoris...,t1_djsoggb,-1.186075
718008,"If you believe his birth certificate is fake, ...",t1_g8q9jwx,-1.186075
720217,"Liberals: ""I'm skeptical, you're a conspiracy ...",t1_g98bl3q,0.694247
...,...,...,...
1010679,Holy shit you really are a fucking conspiracy ...,t1_goriv9c,-1.244221
1087944,Ok you're just a conspiracy theorist who has b...,t1_hvzb8e3,-1.244225
140994,You are quite the conspiracy theorist. I don't...,t1_djdu84w,-0.304074
135446,"So, you're a conspiracy theorist, but \nI'm\n ...",t1_dibwbn4,-0.245925


In [151]:
strategy_4_df[strategy_4_df.rational<0][['dequoted_text', 'fullname']]

Unnamed: 0,dequoted_text,fullname
554,If you think we are leaving the region you are...,t1_c2opia
695,You are my new favourite conspiracy theorist.,t1_c02nlxe
1030,You sir are clearly a wild eyed conspiracy the...,t1_c03kn4z
1091,"Keep voting me down, you are all conspiracy th...",t1_c02aamn
1242,You're a Paul supporter \nand\n a conspiracy t...,t1_c02wap4
...,...,...
1132148,You're just a conspiracy theorist. We dismiss ...,t1_ja8zfpk
1132231,"You're a conspiracy theorist, best of luck.",t1_ja9fong
1132799,You are the conspiracy theorist you hate,t1_jade3be
1132985,You're a pro-government conspiracy theorist?? ...,t1_jaeobiv


In [152]:
strategy_4_df[strategy_4_df.rational>0][['dequoted_text', 'fullname']]

Unnamed: 0,dequoted_text,fullname
13817,When you use the word of one individual agains...,t1_c3by8qk
15293,If you believe in any version of the 9/11 stor...,t1_c40nh2u
16132,You're a overly-sensitive conspiracist bullshi...,t1_c45ixjq
18575,When you're refuting all evidence and trying t...,t1_c65gzzv
19706,I've been told again and again by you people o...,t1_c6e39e2
...,...,...
1123859,If you don't believe that he tied a sheet arou...,t1_j8doecl
1126571,If you still believe that Rebekah Jones lady y...,t1_j8y91ve
1126687,But if you dare discuss such things while not ...,t1_j8yy389
1131046,bro you are the absolute worst conspiracist i’...,t1_ja2yidj


In [154]:
strategy_4_df[strategy_4_df.status<0][['dequoted_text', 'fullname']]

Unnamed: 0,dequoted_text,fullname
554,If you think we are leaving the region you are...,t1_c2opia
695,You are my new favourite conspiracy theorist.,t1_c02nlxe
1030,You sir are clearly a wild eyed conspiracy the...,t1_c03kn4z
1091,"Keep voting me down, you are all conspiracy th...",t1_c02aamn
1242,You're a Paul supporter \nand\n a conspiracy t...,t1_c02wap4
...,...,...
1132231,"You're a conspiracy theorist, best of luck.",t1_ja9fong
1132272,But you repeat qualitatively questionable info...,t1_ja9n8nk
1132799,You are the conspiracy theorist you hate,t1_jade3be
1132985,You're a pro-government conspiracy theorist?? ...,t1_jaeobiv


In [162]:
strategy_4_df[(strategy_4_df.status<0)&(strategy_4_df.rational<0)&(strategy_4_df.bonding<0)&(strategy_4_df.attack>0)&(strategy_4_df.toxic>0)][['dequoted_text', 'fullname']].sample(20)

Unnamed: 0,dequoted_text,fullname
1047801,"So you're a conspiracy theorist, then?",t1_hltpcqi
444192,So you are a conspiracy theorist. Thanks for c...,t1_hcnhplz
624382,You guys are the biggest conspiracy theorists ...,t1_ekit048
583438,You're a Seth Rich conspiracy theorist. Get out.,t1_dhwgm6x
98477,You're definitely a conspiracy theorist.,t1_cd38t1i
693228,"Wow, you are a hateful conspiracy theorist, th...",t1_foyr34c
927426,"Ah, and you are the fact resistant conspiracy ...",t1_fpodenm
268257,You are a conspiracy theorist.,t1_ezxr8xa
47079,you are a conspiracy theorist .come on don't d...,t1_cikstdm
76735,Then you're anti-Israeli conspiracy theorist. ...,t1_c6rse88


In [163]:
strategy_4_df.sample(100)[['dequoted_text', 'fullname']].sample(20)


Unnamed: 0,dequoted_text,fullname
402305,"Man, if you can't back your claims, then you'r...",t1_czmijkq
812194,"Oh, you're a conspiracy theorist..\n\n\nJust k...",t1_eyoqfhb
103859,And your evidence is an alt-right conspiracy t...,t1_d7a0yq6
418418,You're a conspiracy theorist covid denying ant...,t1_h0ds2ix
370823,When your expert witness is a JFK conspiracy t...,t1_cwuyn9p
638857,now you’re just being a conspiracy theorist. W...,t1_f27sslp
543619,And you're a crackhead conspiracy theorist. I ...,t1_ip5nc2c
545080,so you're a conspiracy theorist,t1_ipk0rwb
768248,You are the conspiracy theorist because you ar...,t1_gwfvohp
235505,You're a crazy conspiracy theorist if you thin...,t1_d5q43nw


In [199]:
n_samples = 20
sample_4, sample_4_stricter =strategy_4_df.sample(n_samples), strategy_4_df[(strategy_4_df.status<0)&(strategy_4_df.rational<0)&(strategy_4_df.bonding<0)&(strategy_4_df.attack>0)&(strategy_4_df.toxic>0)].sample(n_samples)
sample_4['sample_strategy'] = 'strategy_4'
sample_4_stricter['sample_strategy'] = 'strategy_4_stricter'


In [215]:
to_annotate = pd.concat((sample_4, sample_4_stricter))

In [200]:
parent_fullnames = set(to_annotate.parent_fullname)
parents = list()
for chunk in pd.read_json('../data/interim/labeling_subthread_all_filtered_preprocessed_no_bot.jsonl.zip', compression='zip', chunksize=1000, lines=True):
    parents.append(chunk[chunk.fullname.isin(parent_fullnames)])
parent_df = pd.concat(parents)
len(parent_df)

40

In [201]:
parent_df.head()

Unnamed: 0,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,...,subcaption,quarantined,associated_award,collapsed_because_crowd_control,comment_type,collapsed_reason_code,author_is_blocked,editable,unrepliable_reason,ad_promoted_user_posts
102227,0.0,Mugatu101116,,,[],,,text,,True,...,,,,,,,,,,
206709,,lessig,,mozilla,,Larry Lessig,,,,,...,,,,,,,,,,
327918,True,sporks49,,,,,,,,,...,,,,,,,,,,
442238,False,iDovke,,,[],,,text,,False,...,,,,,,,,,,
476579,True,jcm267,,,,,,,,,...,,,,,,,,,,


In [202]:
to_annotate.head()

Unnamed: 0,author,created_utc,subreddit,text,fullname,parent_fullname,link_fullname,dequoted_text,ATTACK_ON_AUTHOR,ATTACK_ON_COMMENTER,...,Money,home,Filler,attack,toxic,bonding,rational,status,has_quote,sample_strategy
1094160,xChainfirex,1645118525,WayOfTheBern,"JFC, you are a full-blown anti-vaxxer conspira...",t1_hxby6hf,t1_hxbwocd,t3_sunhdy,"JFC, you are a full-blown anti-vaxxer conspira...",0.809171,0.972669,...,0.0,0.0,0.0,0.773211,0.361773,3.744725,-2.046898,-1.302365,False,strategy_4
1103968,SusanRosenberg,1667698296,Conservative,"According to the MSM, you're a conspiracy theo...",t1_iv88pik,t1_iv5v1o2,t3_yms3zx,"According to the MSM, you're a conspiracy theo...",0.144896,0.898354,...,0.0,0.0,0.0,0.381498,-0.032953,-0.039715,-2.067808,-2.300656,False,strategy_4
1027102,TheSurge305,1629606279,Wrasslin,You are a conspiracy theorist I see. Yeah Vinc...,t1_h9vg7bu,t1_h9vfj5e,t3_p8qiav,You are a conspiracy theorist I see. Yeah Vinc...,0.419108,0.942327,...,1.0,0.0,0.0,0.55477,0.059772,0.272009,-2.053185,-0.129596,False,strategy_4
475798,ee4m,1639134487,JordanPeterson,You are a conspiracy theorist. They arent conn...,t1_hnz2zeg,t1_hnz2jzn,t3_rd2i5m,You are a conspiracy theorist. They arent conn...,0.104536,0.943463,...,0.0,0.0,0.0,0.400892,-0.058353,-0.990291,-2.059104,-2.300676,False,strategy_4
93433,Atlas26,1466082497,news,Unless you're a conspiracy theorist...,t1_d4bin7b,t1_d4b6xgv,t3_4o9f3x,Unless you're a conspiracy theorist...,0.145185,0.911894,...,0.0,0.0,0.0,0.393426,-0.048003,-1.304014,-1.077601,-2.298677,False,strategy_4


In [216]:
to_annotate['permalink'] = to_annotate.apply(get_permalink, axis=1)

In [217]:

to_annotate = pd.merge(to_annotate, parent_df[['text', 'fullname']], left_on='parent_fullname', right_on='fullname', suffixes = ['', '_parent'])[['fullname', 'parent_fullname', 'subreddit','text', 'text_parent', 'sample_strategy','permalink' ]]


In [218]:

to_annotate.to_csv('../data/interim/labeling_sample_strategies4--4s.csv')

In [219]:
to_annotate

Unnamed: 0,fullname,parent_fullname,subreddit,text,text_parent,sample_strategy,permalink
0,t1_hxby6hf,t1_hxbwocd,WayOfTheBern,"JFC, you are a full-blown anti-vaxxer conspira...","It's not safe or effective, you moron. As a ra...",strategy_4,https://new.reddit.com/r/WayOfTheBern/comments...
1,t1_iv88pik,t1_iv5v1o2,Conservative,"According to the MSM, you're a conspiracy theo...",This story gets dumber every day.,strategy_4,https://new.reddit.com/r/Conservative/comments...
2,t1_h9vg7bu,t1_h9vfj5e,Wrasslin,You are a conspiracy theorist I see. Yeah Vinc...,It was a one hour online show. They just HAPPE...,strategy_4,https://new.reddit.com/r/Wrasslin/comments/p8q...
3,t1_hnz2zeg,t1_hnz2jzn,JordanPeterson,You are a conspiracy theorist. They arent conn...,"Funding them, and taking it upon himself to se...",strategy_4,https://new.reddit.com/r/JordanPeterson/commen...
4,t1_d4bin7b,t1_d4b6xgv,news,Unless you're a conspiracy theorist...,It happened with TWA 800\n\nThough there was a...,strategy_4,https://new.reddit.com/r/news/comments/4o9f3x/...
5,t1_c9fvog1,t1_c9fvijm,conspiracy,Now **YOU** are being a conspiracy theorist by...,Before anyone jumps on the conspiracy bandwago...,strategy_4,https://new.reddit.com/r/conspiracy/comments/1...
6,t1_h6vb8zz,t1_h6uf47u,conspiracy,How can you be a conspiracy theorist and have ...,I posted it on my Facebook and zero comments o...,strategy_4,https://new.reddit.com/r/conspiracy/comments/o...
7,t1_h66vs19,t1_h66v53v,walkaway,First you were a conspiracy theorist if you th...,This almost looks made up wtf lol,strategy_4,https://new.reddit.com/r/walkaway/comments/opq...
8,t1_g7irl9e,t1_g7f9ws9,Coronavirus,&gt;...I still firmly believe are just window ...,&gt; They did in fact do enough to get their R...,strategy_4,https://new.reddit.com/r/Coronavirus/comments/...
9,t1_j0nxf1t,t1_j0npo6u,BikiniBottomTwitter,According to your profile you are a crazy cons...,I really like this sub let’s not change that w...,strategy_4,https://new.reddit.com/r/BikiniBottomTwitter/c...


In [35]:
strategy_1 = list()
strategy_2 = list()
strategy_3 = list()
ntile=4
sample = df[pd.qcut(df.INCOHERENT, ntile, labels=range(ntile))==0][['text', 'fullname']].sample(1000)


# sample = df[['dequoted_text', 'fullname']].sample(1000)
for _, row in sample.iterrows():
    # doc = nlp(row.dequoted_text)

    doc = nlp(row.text)
    matches = matcher(doc)

    match_dict = defaultdict(list)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        match_dict[string_id].append(span)
    for labeling_span in match_dict['conspiracy_labeling']:
        sent = labeling_span.sent
        related_pron_spans = list()
        min_linear_distance_pron_span = None
        min_linear_distance_pron_subtree = None
        min_linear_distance = None
        min_dependency_distance_pron_span = None
        min_dependency_distance_pron_subtree = None
        min_dependency_distance = None
        for pron_span in match_dict['pron_poss']:
            if (sent == pron_span.sent) and (pron_span.root.morph.to_dict().get('Person', None) in {'1', '2'}):
                lca_mat = pron_span.root.sent.get_lca_matrix()
                lca_idx = lca_mat[pron_span.root.i-sent.start, labeling_span.root.i-sent.start]
                subtree = list(sent[lca_idx].subtree)
                distance_linear = abs(pron_span.root.i-labeling_span.root.i)
                distance_dependency = 0
                for tok in (pron_span.root, labeling_span.root):
                    for ancestor in tok.ancestors:
                        distance_dependency+=1
                        if tok.i-sent.start==lca_idx:
                            break
                if (min_linear_distance is None) or (distance_linear<min_linear_distance):
                    min_linear_distance=distance_linear
                    min_linear_distance_pron_span=pron_span
                    min_linear_distance_pron_subtree=subtree

                if (min_dependency_distance is None) or (distance_dependency<min_dependency_distance):
                    min_dependency_distance=distance_dependency
                    min_dependency_distance_pron_span=pron_span
                    min_dependency_distance_pron_subtree=subtree

                strategy_1.append((pron_span, subtree, row.fullname))
        if min_linear_distance is not None:
            strategy_2.append((min_linear_distance_pron_span, min_linear_distance_pron_subtree, row.fullname))
        if min_dependency_distance is not None:
            strategy_3.append((min_dependency_distance_pron_span, min_dependency_distance_pron_subtree, row.fullname))

In [36]:
fullnames_strategy_1 = set(i[2] for i in strategy_1 if any(tok.morph.to_dict().get('Person', None) =='2' for tok in i[0].sent))
fullnames_strategy_2 = set(i[2] for i in strategy_2 if i[0].root.morph.to_dict().get('Person', None) =='2')
fullnames_strategy_3 = set(i[2] for i in strategy_3 if i[0].root.morph.to_dict().get('Person', None) =='2')

In [37]:
all_othering = df[df.fullname.isin(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))].copy()

In [38]:
all_othering['linear_you'] = all_othering.fullname.isin(fullnames_strategy_2)
all_othering['dependency_you'] = all_othering.fullname.isin(fullnames_strategy_3)

In [39]:

all_othering['permalink'] = all_othering.apply(get_permalink, axis=1)
all_othering['othering'] = True

In [40]:
all_othering[all_othering.dependency_you].permalink.head(10)

18368    https://new.reddit.com/r/politics/comments/6f9...
20208    https://new.reddit.com/r/politics/comments/76b...
21133    https://new.reddit.com/r/politics/comments/9j4...
27630    https://new.reddit.com/r/politics/comments/g0e...
31286    https://new.reddit.com/r/Minecraft/comments/kq...
32169    https://new.reddit.com/r/legaladvice/comments/...
33705    https://new.reddit.com/r/WTF/comments/dyq5o/co...
49417    https://new.reddit.com/r/IAmA/comments/1lhtlx/...
50191    https://new.reddit.com/r/BigBrother/comments/1...
51635    https://new.reddit.com/r/gifs/comments/1moczo/...
Name: permalink, dtype: object

In [22]:
len(fullnames_strategy_1), len(fullnames_strategy_2), len(fullnames_strategy_3), len(fullnames_strategy_3.union(fullnames_strategy_2).union(fullnames_strategy_1))

(217, 197, 180, 217)

In [23]:
non_othering = df[(~df.fullname.isin(fullnames_strategy_1))&((df.fullname.isin(set(sample.fullname))))].copy()
non_othering['permalink'] = non_othering.apply(get_permalink, axis=1)
non_othering['othering'] = False

In [24]:
to_annotate = pd.concat((all_othering[(~all_othering.dependency_you)&(~all_othering.linear_you)].sample(20),
           all_othering[(all_othering.dependency_you)&(~all_othering.linear_you)],
           all_othering[(~all_othering.dependency_you)&(all_othering.linear_you)],
           all_othering[(all_othering.dependency_you)&(all_othering.linear_you)].sample(20),
           non_othering.sample(20)
           ))[['permalink', 'othering', 'dependency_you', 'linear_you', 'subreddit', 'text', 'fullname', 'parent_fullname']]

ValueError: Cannot take a larger sample than population when 'replace=False'

In [173]:
parent_fullnames = set(to_annotate.parent_fullname)
parents = list()
for chunk in pd.read_json('../data/interim/labeling_subthread_all_filtered_preprocessed_no_bot.jsonl.zip', compression='zip', chunksize=1000, lines=True):
    parents.append(chunk[chunk.fullname.isin(parent_fullnames)])
parent_df = pd.concat(parents)
len(parent_df)


In [None]:
to_annotate = pd.merge(to_annotate, parent_df[['text', 'fullname']], left_on='parent_fullname', right_on='fullname', suffixes = ['', '_parent'])[['fullname', 'parent_fullname', 'othering', 'dependency_you', 'linear_you', 'subreddit','permalink', 'text', 'text_parent', ]]

In [85]:
to_annotate.to_csv('../data/interim/labeling_sample_strategies1--3.csv')

In [34]:
for pron_span, subtree, fullname in strategy_3:
    # print(list(subtree))
    print(f"{pron_span}:{pron_span.root.tag_}:{pron_span.root.dep_}:{pron_span.root.morph.to_dict().get('Person', None)} {''.join(i.text_with_ws for i in subtree).strip()}")


yourself:PRP:pobj:2 When talking with others do not refer to yourself as a "conspiracy theorist."
me:PRP:dobj:1 why it always infuriates me whenever I see a 9/11 conspiracy theorist set up in a little tent with several obscure shots of the WTC/former WTC site with blurry pictures of supposed "Factual evidence"
I:PRP:nsubj:1 Im very blind when it comes to sepcific proven health risks of these, as all I ever see are conspiracy theorists yelling about Global depopulation of America.
I:PRP:nsubj:1 For the record, I am not a conspiracy theorist.
I:PRP:nsubj:1 One last thing, as I said before, I am not a conspiracy theorist.
we:PRP:nsubj:1 In 1993, the Israelis were responsible for the bombing of the World Trade Center and that kind of stuff..."


/r/Israel, do we have stealth antisemitic conspiracy theorist on our hands
us:PRP:dobj:1 label us as conspiracy theorists and traitors
I:PRP:nsubj:1 Edit
 I find this subreddit so interesting...while being superficial, it's also extremely 
honest
 

In [64]:
# find the labeling span
# get the root of the span
# get the sentence
# find all pronouns (you)/det poss (your)/propn? (yours)
# find the common ancestor between pronouns* and labeling root
# check for verbs in the middle; look for negated

In [65]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.render(pron_span.sent, style="dep", options=options)
