In [7]:
import pandas as pd
import numpy as np
import os
import json

# Root directory handed to read_data(); its directory entries are treated as event folders.
path_1 = 'data/PHEME_veracity/all-rnr-annotated-threads'
def _visible_entries(directory):
    """Return the sorted entry names of `directory` that do not start with a dot."""
    return sorted(name for name in os.listdir(directory) if not name.startswith('.'))


def _read_json(path):
    """Parse the whole UTF-8 file at `path` as one JSON document."""
    with open(path, encoding='utf-8') as fh:
        # json.load reads the complete file, so pretty-printed (multi-line) JSON
        # parses as well as single-line JSON.
        return json.load(fh)


def read_data(path_1, rnr):
    """Load every thread of one class from an event/class/thread directory tree.

    Layout read by this function::

        <path_1>/<event>/<rnr>/<thread_id>/annotation.json
        <path_1>/<event>/<rnr>/<thread_id>/structure.json
        <path_1>/<event>/<rnr>/<thread_id>/source-tweets/<thread_id>.json
        <path_1>/<event>/<rnr>/<thread_id>/reactions/<reaction>.json

    Entries whose name starts with '.' (e.g. '.DS_Store', '._*') are skipped at
    every level, and entries are visited in sorted order so the result does not
    depend on the file system's listing order.

    Parameters
    ----------
    path_1 : str
        Root directory whose entries are the event folders.
    rnr : str
        Name of the class sub-folder inside each event folder.

    Returns
    -------
    list of dict
        One dict per thread with the keys 'tweet_id' (folder name),
        'annotation', 'structure', 'source_tweet' (parsed JSON), 'reactions'
        (list of parsed reaction JSON documents) and 'events' (event folder name).

    Raises
    ------
    OSError
        If an expected directory or file is missing.
    json.JSONDecodeError
        If one of the files is not valid JSON.
    """
    rumor_data = []
    for event in _visible_entries(path_1):
        class_dir = os.path.join(path_1, event, rnr)
        for rid in _visible_entries(class_dir):
            thread_dir = os.path.join(class_dir, rid)
            reactions_dir = os.path.join(thread_dir, 'reactions')
            rumor = {
                'tweet_id': rid,
                'annotation': _read_json(os.path.join(thread_dir, 'annotation.json')),
                'structure': _read_json(os.path.join(thread_dir, 'structure.json')),
                'source_tweet': _read_json(
                    os.path.join(thread_dir, 'source-tweets', rid + '.json')),
                'reactions': [
                    _read_json(os.path.join(reactions_dir, re_id))
                    for re_id in _visible_entries(reactions_dir)
                ],
                'events': event,
            }
            rumor_data.append(rumor)
    return rumor_data

In [10]:
def ct(a):
    """Return the 'true' entry of annotation `a` as an int, or 0 when it is unavailable.

    Parameters
    ----------
    a : mapping
        Annotation record; its 'true' entry may be missing.

    Returns
    -------
    int
        int(a['true']), or 0 when `a` has no 'true' key or is not subscriptable.

    Raises
    ------
    ValueError, TypeError
        If the stored value cannot be converted by int().
    """
    try:
        t = a['true']
    except (KeyError, TypeError):
        # Only "no such entry" falls back to 0; any other exception
        # (including KeyboardInterrupt) propagates instead of being hidden.
        t = 0
    return int(t)
# Build one row per 'rumours' thread and derive label and source-tweet features.
# `text_len` needs the spaCy pipeline; load it in this cell so the cell does not
# depend on a later cell having been executed first (Restart & Run All safe).
import spacy
nlp = spacy.load('en_core_web_sm')

rumor_records = read_data(path_1, 'rumours')
rumor_data = pd.DataFrame(rumor_records)
source_tweets = rumor_data['source_tweet']

# Keep only the part of the event folder name before the first '-'.
rumor_data['events'] = rumor_data['events'].apply(lambda e: e.split('-')[0])

# Labels taken from the annotation record.
rumor_data['true'] = rumor_data['annotation'].apply(ct)
rumor_data['fake'] = rumor_data['annotation'].apply(lambda a: int(a['misinformation']))

# Text features of the source tweet; text_len is the length of the spaCy Doc.
rumor_data['text'] = source_tweets.apply(lambda t: t['text'])
rumor_data['text_len'] = source_tweets.apply(lambda t: len(nlp(t['text'])))
rumor_data['have_htag'] = source_tweets.apply(lambda t: int(len(t['entities']['hashtags']) > 0))
rumor_data['have_url'] = source_tweets.apply(lambda t: int(len(t['entities']['urls']) > 0))

# Engagement and author fields copied from the source tweet.
rumor_data['retweet_count'] = source_tweets.apply(lambda t: t['retweet_count'])
rumor_data['uid'] = source_tweets.apply(lambda t: t['user']['id'])
rumor_data['followers_count'] = source_tweets.apply(lambda t: t['user']['followers_count'])
rumor_data['friends_count'] = source_tweets.apply(lambda t: t['user']['friends_count'])
rumor_data['created_at'] = source_tweets.apply(lambda t: t['created_at'])

In [11]:
import re
# 1 when the tweet text contains an '@', else 0 (the group after '@' may match nothing).
at_pattern = re.compile('@(.*)', re.M|re.I)
rumor_data['have_at'] = rumor_data['text'].apply(lambda t: int(at_pattern.search(t) is not None))

In [12]:
# Copy the `verified` value out of the nested user record of each source tweet.
rumor_data['verified'] = rumor_data['source_tweet'].map(lambda tweet: tweet['user']['verified'])

In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Every [have_date, have_loc] pair produced by id_ent(), in call order.
date_loc = []
def id_ent(text):
    """Flag whether spaCy finds a DATE and/or a GPE entity in `text`.

    Parameters
    ----------
    text : str
        Text to run through the module-level `nlp` pipeline.

    Returns
    -------
    list of int
        [have_date, have_loc]; each is 1 when an entity with label 'DATE'
        (respectively 'GPE') occurs in the document, else 0.

    Side effects
    ------------
    The same pair is appended to the module-level list `date_loc`.
    """
    labels = {ent.label_ for ent in nlp(text).ents}
    flags = [int('DATE' in labels), int('GPE' in labels)]
    date_loc.append(flags)
    return flags

# Build the two columns from the returned pairs and give them rumor_data's own
# index, so each pair lands on the row it was computed from whatever that index is.
ent_flags = rumor_data['text'].apply(id_ent)
rumor_data[['have_date', 'have_loc']] = pd.DataFrame(
    ent_flags.tolist(), index=rumor_data.index, columns=['have_date', 'have_loc'])

# 计算情绪分布 (Compute the emotion distribution)

In [14]:
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')
# Read the lexicon file line by line; each line is split on tabs into
# the three fields named word / emotion / label below.
with open('data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', 'r') as lexicon_file:
    lexicon_rows = [raw_line.strip('\n').split('\t') for raw_line in lexicon_file]
emo_dict = pd.DataFrame(lexicon_rows, columns=['word', 'emotion', 'label'])

# Distinct lexicon words, in order of first appearance.
emo_words = emo_dict['word'].unique()

# Run all distinct words through spaCy as one space-joined document and
# collect the set of tags assigned to its tokens.
doc_words = nlp(' '.join(emo_words))
tags = {token.tag_ for token in doc_words}

In [15]:
# Preview the first ten distinct lexicon words.
emo_words[:10]

array(['aback', 'abacus', 'abandon', 'abandoned', 'abandonment', 'abate',
       'abatement', 'abba', 'abbot', 'abbreviate'], dtype=object)

In [16]:
emos = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'positive', 'negative']

# Holds the lexicon frame the lookup was built from and the lookup itself, so the
# lookup is rebuilt only when `emo_dict` is rebound to a different object.
_emo_lookup_cache = {}


def _emotion_lookup():
    """Return {word: [emotion, ...]} for the `emo_dict` rows whose label is '1'."""
    if _emo_lookup_cache.get('source') is not emo_dict:
        associated = emo_dict[emo_dict['label'] == '1']
        lookup = {}
        for word, emotion in zip(associated['word'], associated['emotion']):
            lookup.setdefault(word, []).append(emotion)
        _emo_lookup_cache['source'] = emo_dict
        _emo_lookup_cache['lookup'] = lookup
    return _emo_lookup_cache['lookup']


def emo_dist(text):
    """Count, per emotion category, the lexicon words occurring in `text`.

    Steps: remove every entity span found by the module-level `nlp` pipeline,
    replace all characters outside a-z/A-Z with spaces, lower-case, and look up
    each remaining token in the lexicon rows of `emo_dict` labelled '1'.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    dict
        One key per entry of `emos` (in that order) mapped to the number of
        (token, emotion) lexicon hits; a token that occurs twice counts twice.

    Raises
    ------
    KeyError
        If the lexicon contains an emotion name that is not listed in `emos`.

    Notes
    -----
    The word -> emotions lookup is built once per `emo_dict` object; in-place
    edits to `emo_dict` after the first call are not picked up.
    """
    emos_value = dict.fromkeys(emos, 0)
    # Collect the entity texts first, then blank them out of the raw string.
    for ent_text in [ent.text for ent in nlp(text).ents]:
        text = text.replace(ent_text, " ")
    text = re.sub("[^a-zA-Z]", " ", text)
    lookup = _emotion_lookup()
    for word in text.lower().split():
        for e in lookup.get(word, ()):
            emos_value[e] += 1
    return emos_value

In [17]:
# Store, for every tweet text, the dict of per-category counts returned by emo_dist().
rumor_data['emo_words'] = rumor_data['text'].map(emo_dist)

In [20]:
# The entries of `emos` without 'positive' and 'negative'.
special_emos = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

In [24]:
# Unpack the per-tweet count dicts into one column per category listed in `emos`.
for emo in emos:
    rumor_data[emo] = rumor_data['emo_words'].map(lambda counts, key=emo: counts[key])

In [25]:
special_emos = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
sub_spec_emos = ['anger', 'disgust', 'joy', 'sadness', 'fear']
# Row-wise totals over each group of emotion count columns.
rumor_data['special_emos_sum'] = rumor_data.loc[:, special_emos].sum(axis='columns')
rumor_data['sub_spec_emos_sum'] = rumor_data.loc[:, sub_spec_emos].sum(axis='columns')
rumor_data.head()

Unnamed: 0,annotation,events,reactions,source_tweet,structure,tweet_id,true,fake,text,text_len,...,disgust,fear,joy,sadness,surprise,trust,positive,negative,special_emos_sum,sub_spec_emos_sum
0,"{'is_rumour': 'rumour', 'category': '(At least...",charliehebdo,"[{'contributors': None, 'truncated': False, 't...","{'contributors': None, 'truncated': False, 'te...",{'552783238415265792': {'552787794503143424': ...,552783238415265792,1,0,"Breaking: At least 10 dead, 5 injured after tO...",27,...,0,2,0,1,0,0,0,1,3,3
1,"{'is_rumour': 'rumour', 'category': '(At least...",charliehebdo,"[{'contributors': None, 'truncated': False, 't...","{'contributors': None, 'truncated': False, 'te...",{'552783667052167168': {'552785374507175936': ...,552783667052167168,1,0,France: 10 people dead after shooting at HQ of...,20,...,0,1,0,0,0,0,0,1,2,2
2,"{'is_rumour': 'rumour', 'category': '(At least...",charliehebdo,"[{'contributors': None, 'truncated': False, 't...","{'contributors': None, 'truncated': False, 'te...",{'552783745565347840': {'552785157959479296': ...,552783745565347840,1,0,Ten killed in shooting at headquarters of Fren...,20,...,0,1,0,0,0,0,0,1,2,2
3,"{'is_rumour': 'rumour', 'category': '(At least...",charliehebdo,"[{'contributors': None, 'truncated': False, 't...","{'contributors': None, 'truncated': False, 'te...",{'552784168849907712': {'552784554650767361': ...,552784168849907712,1,0,BREAKING: 10 dead in shooting at headquarters ...,22,...,0,1,0,0,0,0,0,1,2,2
4,"{'is_rumour': 'rumour', 'category': '(At least...",charliehebdo,"[{'contributors': None, 'truncated': False, 't...","{'contributors': None, 'truncated': False, 'te...",{'552784526955806720': {'552784701107486720': ...,552784526955806720,1,0,Reuters: 10 people shot dead at headquarters o...,23,...,0,1,0,1,1,0,0,1,4,3


In [26]:
special_emos_dist = ['anger_d', 'anticipation_d', 'disgust_d', 'fear_d', 'joy_d', 'sadness_d', 'surprise_d', 'trust_d']
# Keep only the rows with a non-zero `special_emos_sum`, so the division below
# never divides by zero. `.copy()` makes the result an independent frame, so the
# column assignments that follow do not write into a slice of the previous frame.
rumor_data = rumor_data[rumor_data['special_emos_sum']!=0].copy()

# Each *_d column is the matching count column divided by the row's total over
# `special_emos`, i.e. the eight *_d values of a row sum to 1.
emo_totals = rumor_data[special_emos].sum(axis=1)
for count_col, share_col in zip(special_emos, special_emos_dist):
    rumor_data[share_col] = rumor_data[count_col] / emo_totals

# Transposed view of the shares, kept under its existing name for any cell that reads it.
trans = rumor_data[special_emos_dist].T

In [27]:
# Per-row difference between the anger share and the joy share.
rumor_data['anger_joy_d'] = rumor_data['anger_d'].sub(rumor_data['joy_d'])

In [28]:
# Names of all share columns plus the anger-minus-joy column.
cols = [*special_emos_dist, 'anger_joy_d']

In [49]:
import pandas as pd
# From here on `rumor_data` is rebound to the contents of this CSV.
# NOTE(review): no cell shown in this notebook writes the file, so how it relates to
# the frame built above cannot be told from here — confirm where it is produced.
rumor_data = pd.read_csv('data/PHEME_veracity/fake_news_reg.csv')

In [54]:
# Emotion columns summarised in the cells below.
special_emos = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

In [56]:
# Summary statistics of the emotion columns over the rows with fake == 1.
rumor_data.loc[rumor_data['fake'] == 1, special_emos].describe()

Unnamed: 0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
count,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0
mean,0.111943,0.086683,0.029926,0.227414,0.059101,0.071342,0.097929,0.315663
std,0.160513,0.23089,0.093152,0.203439,0.115739,0.146016,0.161664,0.34937
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25
75%,0.2,0.0,0.0,0.4,0.0,0.125,0.2,0.5
max,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
# Summary statistics of the emotion columns over the rows with fake == 1 and
# retweet_count < 10. Both conditions are combined into one mask on the full frame,
# so no mask has to be reindexed against an already filtered frame.
fake_low_retweet = (rumor_data['fake'] == 1) & (rumor_data['retweet_count'] < 10)
rumor_data.loc[fake_low_retweet, special_emos].describe()

Unnamed: 0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
count,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0
mean,0.020024,0.059972,0.034473,0.116199,0.117552,0.063553,0.139703,0.448525
std,0.073175,0.169725,0.146241,0.146501,0.136819,0.161161,0.200749,0.389446
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25
75%,0.0,0.0,0.0,0.25,0.25,0.0,0.25,1.0
max,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0


In [59]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test on `anger` among the rows with fake == 1:
# retweet_count >= 10 versus retweet_count < 10. Each group is selected with one
# combined mask on the full frame instead of chaining a second, misaligned mask.
is_fake = rumor_data['fake'] == 1
many_retweets = rumor_data['retweet_count'] >= 10
few_retweets = rumor_data['retweet_count'] < 10
ks_2samp(rumor_data.loc[is_fake & many_retweets, 'anger'], rumor_data.loc[is_fake & few_retweets, 'anger'])

Ks_2sampResult(statistic=0.4159149560811611, pvalue=3.042011087472929e-14)