In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import tldextract
import glob
import json
import os
import operator
import numpy as np
import matplotlib.pyplot as plt

def extract_domain(url):
    """Return the ``registered_domain`` that tldextract reports for ``url``."""
    parts = tldextract.extract(url)
    return parts.registered_domain

BASE_DIR = '/data/spirits-backup/savvas/news-propagation/'

In [2]:
# Load the Gab and 4chan hit files from BASE_DIR; both are read as tab-separated text.
gabdf = pd.read_csv(BASE_DIR + 'gab_hits_news_propagation.txt', delimiter='\t')
chandf = pd.read_csv(BASE_DIR + '4chan_news_propagation_hits.txt', delimiter='\t')
 
def load_reddit(directory):
    """Parse the tab-separated Reddit hits file into a DataFrame.

    Parameters
    ----------
    directory : str
        Path of the text file to read (despite the name, this is a file path,
        it is passed straight to ``open``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed line; every value is kept as the raw string.
        Comment rows (9 or more fields) carry ``link_id``, ``gilded`` and
        ``controversiality``; submission rows (exactly 7 fields) carry
        ``num_comments``. Columns missing for a row kind are NaN.

    Notes
    -----
    Lines with any other field count (blank lines, truncated records, 8-field
    lines) cannot be mapped to either layout. They used to raise IndexError;
    they are now skipped and the number skipped is printed.
    """
    comment_fields = ('url', 'author', 'created_at', 'subreddit', 'score',
                      'id', 'link_id', 'gilded', 'controversiality')
    submission_fields = ('url', 'author', 'created_at', 'subreddit', 'num_comments',
                         'score', 'id')
    rows = []
    skipped = 0
    with open(directory, 'r') as f:
        for line in f:
            els = line.rstrip('\n').split('\t')
            if len(els) >= len(comment_fields):
                # comment line; any extra trailing fields are ignored
                fields = comment_fields
            elif len(els) == len(submission_fields):
                fields = submission_fields
            else:
                skipped += 1
                continue
            rows.append(dict(zip(fields, els)))
    if skipped:
        print("load_reddit: skipped %d malformed line(s)" % skipped)
    return pd.DataFrame(rows)
# Parse the Reddit hits file, then drop rows that are exact duplicates across all columns.
redditdf = load_reddit(BASE_DIR + 'reddit_news_propagation_hits.txt')
redditdf = redditdf.drop_duplicates()

In [3]:
# newsguard_dict maps a news domain to its score (it is later used to map the
# `domain` column to `newsguard_score`). Read it with a context manager so the
# file handle is closed instead of being left to the garbage collector.
with open(BASE_DIR + 'list_of_top_30000_newswebsites_on_newsguard.json', 'r') as f:
    newsguard_dict = json.load(f)

# A domain counts as "mainstream" when its score is at least 60; the rest are "alternative".
count_mainstream = sum(1 for score in newsguard_dict.values() if score >= 60.0)
count_alternative = len(newsguard_dict) - count_mainstream
print("Domains that are alternative = %d. Domains that are mainstream = %d" %(count_alternative, count_mainstream))

Domains that are alternative = 37. Domains that are mainstream = 1036


In [4]:


def read_data_twitter(domains,
                      files_glob="/data/savvas/scylla_fastdata/news-propagation/historic_news_tweets_gardenhose/*.txt"):
    """Collect tweets that link to one of ``domains`` from line-delimited JSON files.

    Parameters
    ----------
    domains : iterable of str
        Registered domains to keep (compared against ``extract_domain(url)``).
    files_glob : str, optional
        Glob pattern of the input files. Defaults to the location the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per (tweet, matching URL) with the columns ``created_utc``
        (``timestamp_ms`` converted to seconds), ``url``, ``tweet_id``, ``user``,
        ``user_id``, ``user_followers``, ``user_friends``,
        ``created_utc_friendly`` and ``lang``.

    Notes
    -----
    Processing is best-effort per line: a line that is not valid JSON, or a
    record lacking one of the fields read below, is skipped. Only those
    data-shape errors are swallowed; anything else (e.g. a broken environment)
    now propagates instead of silently producing an empty frame. Rows already
    appended for earlier URLs of a tweet are kept if a later URL fails, as before.
    """
    wanted = frozenset(domains)  # one-off copy; constant-time membership for any iterable
    data = []
    # sorted() makes the row order reproducible; glob order is otherwise arbitrary
    for inp_file in sorted(glob.glob(files_glob)):
        with open(inp_file, 'r') as inp:
            for line in inp:
                try:
                    tweet = json.loads(line)
                    for url_entity in tweet['entities']['urls']:
                        url = url_entity['expanded_url']
                        if extract_domain(url) not in wanted:
                            continue
                        user = tweet['user']
                        data.append({
                            'created_utc': int(tweet['timestamp_ms']) / 1000.0,
                            'url': url,
                            'tweet_id': tweet['id_str'],
                            'user': user['screen_name'],
                            'user_id': user['id_str'],
                            'user_followers': user['followers_count'],
                            'user_friends': user['friends_count'],
                            'created_utc_friendly': tweet['created_at'],
                            'lang': tweet['lang'],
                        })
                except (ValueError, KeyError, TypeError, AttributeError):
                    # malformed JSON or a record without the expected fields: skip it
                    continue
    return pd.DataFrame(data)
                          
# Keep only tweets whose URL domain is a key of newsguard_dict.
twitterdf = read_data_twitter(newsguard_dict.keys())

In [6]:
# Add a UTC `datetime` column to every platform frame.
# Reddit, 4chan and Twitter timestamps are parsed as epoch seconds (unit='s');
# Gab's `published_at` is parsed without a unit.
redditdf['datetime'] = pd.to_datetime(redditdf['created_at'], unit='s', utc=True)
gabdf['datetime'] = pd.to_datetime(gabdf['published_at'], utc=True)
chandf['datetime'] = pd.to_datetime(chandf['published_at'], unit='s', utc=True)
twitterdf['datetime'] = pd.to_datetime(twitterdf['created_utc'], unit='s', utc=True)

In [17]:
# NOTE(review): this cell carries execution count 17 and is repeated verbatim at the top of
# the following cell (count 8) — it looks like it was re-run out of order to refresh `tddf`
# after more columns were added to `redditdf`. Confirm and consolidate.
tddf = redditdf[redditdf.subreddit=='The_Donald']

In [8]:
# /r/The_Donald is analysed as its own community. `.copy()` makes `tddf` an independent
# frame, so adding columns below is a plain assignment rather than a write to a filtered
# slice (which only passed quietly because warnings are globally ignored).
tddf = redditdf[redditdf.subreddit=='The_Donald'].copy()

dfs = [gabdf, chandf, redditdf, twitterdf, tddf]
for df in dfs:
    # registered domain of each shared URL ...
    df['domain'] = df['url'].map(extract_domain)
    # ... and that domain's score; domains missing from newsguard_dict map to NaN
    df['newsguard_score'] = df['domain'].map(newsguard_dict)

In [9]:
# Parallel lists: dfs[i] is displayed as dfs_names[i], and keys[i] names the column of
# dfs[i] that holds the post identifier.
dfs = [twitterdf, redditdf, tddf, chandf, gabdf]
dfs_names = ['Twitter', 'Reddit', 'The_Donald', '4chan', 'Gab' ]
keys = ['tweet_id', 'id', 'id', 'post_id', 'post_id']
def generate_dataset_table(dfs, dfs_names, id_keys=None, cutoff='2018-11-01', total_domains=1073):
    """Summarise, per platform, how many posts/URLs are mainstream vs. alternative.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One frame per platform with the columns ``datetime``, ``url``,
        ``domain``, ``newsguard_score`` and the platform's id column.
    dfs_names : list of str
        Display name of each frame in ``dfs`` (same order).
    id_keys : list of str, optional
        Id column of each frame. When omitted, the module-level ``keys`` list
        is used, which is what the function originally relied on.
    cutoff : str, optional
        Only rows with ``datetime`` strictly before this value are counted.
    total_domains : int, optional
        Denominator of the "Mainstream domains Percentage" line. The default
        1073 is the originally hard-coded value.

    Returns
    -------
    (pandas.DataFrame, list)
        The per-platform summary table, and the list of distinct URLs seen on
        any platform before ``cutoff`` (in no particular order).

    Notes
    -----
    A score of 60 or more is "mainstream", below 60 is "alternative"; rows
    without a score are in neither group. The printed counts are row counts,
    whereas the table reports distinct post ids / URLs. A platform with no
    scored rows prints ``nan`` percentages instead of raising ZeroDivisionError.
    """
    if id_keys is None:
        id_keys = keys
    out = []
    # Running sets replace the original "collect huge lists, flatten, then set()" approach;
    # the resulting distinct counts are the same.
    seen_ids, seen_urls = set(), set()
    seen_mainstream_urls, seen_alternative_urls = set(), set()
    seen_mainstream_ids, seen_alternative_ids = set(), set()
    for i, df in enumerate(dfs):
        platform = dfs_names[i]
        id_col = id_keys[i]
        df = df[df.datetime < cutoff]
        seen_ids.update(df[id_col].tolist())
        seen_urls.update(df['url'].tolist())

        mainstreamdf = df[df.newsguard_score >= 60]
        alternativedf = df[df.newsguard_score < 60]
        n_mainstream = mainstreamdf.shape[0]
        n_alternative = alternativedf.shape[0]
        n_scored = n_mainstream + n_alternative
        # guard the empty case: a platform may have no scored rows at all
        pct_mainstream = n_mainstream / n_scored * 100 if n_scored else float('nan')
        pct_alternative = n_alternative / n_scored * 100 if n_scored else float('nan')
        print("Platform = %s Mainstream Count = %d Mainstream Percentage = %f Alternative Count = %d Alternative Percentage = %f"
              % (platform, n_mainstream, pct_mainstream, n_alternative, pct_alternative))
        print("Mainstream domains Percentage = %f"
              % (len(set(mainstreamdf['domain'].tolist())) / total_domains * 100))

        mainstream_ids = set(mainstreamdf[id_col].tolist())
        alternative_ids = set(alternativedf[id_col].tolist())
        mainstream_urls = set(mainstreamdf['url'].tolist())
        alternative_urls = set(alternativedf['url'].tolist())
        seen_mainstream_ids |= mainstream_ids
        seen_alternative_ids |= alternative_ids
        seen_mainstream_urls |= mainstream_urls
        seen_alternative_urls |= alternative_urls

        out.append({'Web community': platform,
                    '# of posts (mainstream)': len(mainstream_ids),
                    '# of posts (alternative)': len(alternative_ids),
                    '# unique URLs (mainstream)': len(mainstream_urls),
                    '# of unique URLs (alternative)': len(alternative_urls)})

    print("All posts from all platforms = %d" % len(seen_ids))
    print("All URLs from all platform = %d" % len(seen_urls))
    print("All mainstream URLs = %d" % len(seen_mainstream_urls))
    print("All alternative URLs = %d" % len(seen_alternative_urls))
    print("All mainstream posts = %d" % len(seen_mainstream_ids))
    print("All alternative posts = %d" % len(seen_alternative_ids))
    return pd.DataFrame(out), list(seen_urls)

# Build the dataset overview and keep the list of all distinct URLs for later filtering.
dataset_table, all_urls_all_platforms = generate_dataset_table(dfs, dfs_names)
# Fix the column order for display.
dataset_table = dataset_table[['Web community', '# of posts (mainstream)', '# of posts (alternative)',
                              '# unique URLs (mainstream)','# of unique URLs (alternative)' ]]
dataset_table

Platform = Twitter Mainstream Count = 7275627 Mainstream Percentage = 91.296670 Alternative Count = 693587 Alternative Percentage = 8.703330
Mainstream domains Percentage = 95.992544
Platform = Reddit Mainstream Count = 26232779 Mainstream Percentage = 94.969592 Alternative Count = 1389514 Alternative Percentage = 5.030408
Mainstream domains Percentage = 96.085741
Platform = The_Donald Mainstream Count = 570968 Mainstream Percentage = 74.274096 Alternative Count = 197763 Alternative Percentage = 25.725904
Mainstream domains Percentage = 95.526561
Platform = 4chan Mainstream Count = 591568 Mainstream Percentage = 87.395127 Alternative Count = 85321 Alternative Percentage = 12.604873
Mainstream domains Percentage = 94.874185
Platform = Gab Mainstream Count = 2382626 Mainstream Percentage = 51.211372 Alternative Count = 2269907 Alternative Percentage = 48.788628
Mainstream domains Percentage = 95.899348
All posts from all platforms = 37822312
All URLs from all platform = 15621263
All main

Unnamed: 0,Web community,# of posts (mainstream),# of posts (alternative),# unique URLs (mainstream),# of unique URLs (alternative)
0,Twitter,7123715,686497,3893357,291354
1,Reddit,23605406,1342429,11170005,612213
2,The_Donald,528142,190742,385384,122204
3,4chan,458431,75705,275422,37472
4,Gab,2369149,2265336,749547,385317


In [10]:
# Entity labels that are dropped when loading entity files.
entities_types_blacklist = ['TIME', 'DATE', 'PERCENT', 'QUANTITY', 'ORDINAL', 'CARDINAL']
def load_entity_mapping_multifile(file_path, id_key):
    """Load a post-id -> entity-texts mapping from every file in a directory.

    Parameters
    ----------
    file_path : str
        Directory containing line-delimited JSON files. A trailing separator
        is optional (paths are built with ``os.path.join``; plain string
        concatenation used to fail without it).
    id_key : str
        Key of each JSON record that holds the post identifier.

    Returns
    -------
    dict
        ``{post_id: [entity_text, ...]}``. An entity is kept when its text is
        longer than one character, its label is not in
        ``entities_types_blacklist`` and its text does not start with ``'RT'``.

    Notes
    -----
    Files are read in sorted name order, so when a post id occurs more than
    once the surviving entry is deterministic (last file / last line wins).
    Sub-directories and blank lines are skipped.
    """
    d = {}
    for filename in sorted(os.listdir(file_path)):
        full_path = os.path.join(file_path, filename)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                post_id = data[id_key]
                d[post_id] = [
                    ent['entity_text']
                    for ent in data['entities']
                    if len(ent['entity_text']) > 1
                    and ent['entity_label'] not in entities_types_blacklist
                    and not ent['entity_text'].startswith('RT')
                ]
    return d


def load_entity_mapping(filename, id_key):
    """Load a post-id -> entity-texts mapping from one line-delimited JSON file.

    Each line is a JSON record holding the post identifier under ``id_key``
    and a list under ``'entities'``. An entity text is kept when it is longer
    than one character, its label is not in ``entities_types_blacklist``, it
    does not start with ``'RT'`` and it is not ``'LRB'`` / ``'RRB'``.
    A later line with the same post id replaces the earlier entry.
    """
    ignored_texts = ['LRB', 'RRB']

    def keep(entity):
        # Guard clauses evaluated in the same order as the original combined condition.
        text = entity['entity_text']
        if len(text) <= 1:
            return False
        if entity['entity_label'] in entities_types_blacklist:
            return False
        if text.startswith('RT'):
            return False
        return text not in ignored_texts

    mapping = {}
    with open(filename, 'r') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            post_id = record[id_key]
            mapping[post_id] = [e['entity_text'] for e in record['entities'] if keep(e)]
    return mapping
            

# Post-id -> entity-list mappings per platform. Gab comes from a single file;
# 4chan and Reddit are each spread over a directory of files.
gab_entities_dict = load_entity_mapping(BASE_DIR + 'gab_entity_detection_output_truecase_mp.txt', 'post_id')
chan_entities_dict = load_entity_mapping_multifile(BASE_DIR + 'entities_results/4chan/4chan_entities/', 'post_id')
reddit_entities_dict = load_entity_mapping_multifile(BASE_DIR + 'entities_results/reddit_specific/reddit_discussions_entities_specific/', 'post_id')

In [11]:
def map_entities_to_post(post_id, data_dict):
    """Return the entity list stored for ``post_id``, or a new empty list if there is none."""
    return data_dict.get(post_id, [])

# Attach each post's entity list; posts absent from the mapping get an empty list.
redditdf['entities_list'] = redditdf['id'].apply(map_entities_to_post, args=(reddit_entities_dict,))
chandf['entities_list'] = chandf['post_id'].apply(map_entities_to_post, args=(chan_entities_dict,))
gabdf['entities_list'] = gabdf['post_id'].apply(map_entities_to_post, args=(gab_entities_dict,))

In [12]:
# Same for Twitter. The tweet ids are cast to int before the lookup —
# presumably the entity file stores `post_id` as a JSON number; verify against the file.
twitter_entities_dict = load_entity_mapping('/data/savvas/scylla_fastdata/news-propagation/all_tweets_entities.txt', 'post_id')
twitterdf['entities_list'] = twitterdf['tweet_id'].astype(int).apply(map_entities_to_post, args=(twitter_entities_dict,))

In [13]:
# Constant-time membership index over every URL seen on any platform (the value 1 is a placeholder).
all_urls_dict = dict.fromkeys(all_urls_all_platforms, 1)

def url_exists_in_platforms(url):
    """Return True when ``url`` is a key of the module-level ``all_urls_dict``, else False."""
    return url in all_urls_dict




In [14]:
import os
# Directory holding the entity files extracted from article text.
article_entities_dir = '/data/savvas/scylla_fastdata/news-propagation/articles_text_entities/'
# Same label blacklist as the one defined for the post-entity loaders (re-declared here).
entities_types_blacklist = ['TIME', 'DATE', 'PERCENT', 'QUANTITY', 'ORDINAL', 'CARDINAL']
def load_article_entities_to_dict(directory):
    """Load an article-URL -> distinct-entity-texts mapping from a directory of files.

    Parameters
    ----------
    directory : str
        Directory of line-delimited JSON files; each record has an
        ``'article_url'`` and an ``'entities'`` list. A trailing separator is
        optional (paths are built with ``os.path.join``; plain string
        concatenation used to fail without it).

    Returns
    -------
    dict
        ``{article_url: [entity_text, ...]}`` restricted to URLs for which
        ``url_exists_in_platforms`` is true. Each list holds distinct texts in
        no particular order. Entities whose label is in
        ``entities_types_blacklist``, and the stray token ``'’m'``, are dropped.

    Notes
    -----
    Files are read in sorted name order so that, when a URL occurs more than
    once, the surviving entry is deterministic (last file / last line wins).
    Sub-directories and blank lines are skipped.
    """
    output_dict = {}
    for filename in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, filename)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                url = data['article_url']
                if not url_exists_in_platforms(url):
                    continue
                distinct_entities = {
                    ent['entity_text']
                    for ent in data['entities']
                    if ent['entity_label'] not in entities_types_blacklist
                    and ent['entity_text'] != '’m'
                }
                output_dict[url] = list(distinct_entities)
    return output_dict

articles_entities_dict = load_article_entities_to_dict(article_entities_dir)

In [15]:
from collections import Counter
def find_top_entities_from_articles(articles_dict, topn):
    """Rank the ``topn`` most frequent entities across all articles.

    ``articles_dict`` maps an article key to its list of entities. The result
    is a DataFrame with the columns ``'Entity'`` and ``'Percentage (%)'``,
    where the percentage is the entity's total count divided by the number of
    articles, times 100.
    """
    n_articles = len(articles_dict)
    mentions = []
    for article_entities in articles_dict.values():
        mentions.extend(article_entities)
    rows = [
        {'Entity': entity, 'Percentage (%)': hits / n_articles * 100}
        for entity, hits in Counter(mentions).most_common(topn)
    ]
    return pd.DataFrame(rows)

In [16]:
def top_entities_articles_mainstream_alternative(articles_dict, topn):
    """Rank the top entities of mainstream and of alternative articles side by side.

    Parameters
    ----------
    articles_dict : dict
        Maps an article URL to its list of entity texts.
    topn : int
        Number of entities to keep per group.

    Returns
    -------
    pandas.DataFrame
        Two ``'Entity'`` / ``'Percentage (%)'`` column pairs concatenated
        column-wise: mainstream first, alternative second. The percentage is
        the share of the group's articles that mention the entity at least
        once. A group with no articles contributes no columns.

    Notes
    -----
    An article is "alternative" when its domain's score in ``newsguard_dict``
    is below 60.0, otherwise "mainstream"; articles whose domain has no score
    are ignored. URLs and entities are processed in insertion order (entities
    de-duplicated per article with order preserved), so ties between equally
    frequent entities are broken reproducibly instead of by ``set`` iteration
    order.
    """
    def _ranking(urls):
        """Top ``topn`` entities among ``urls`` with the % of those articles mentioning each."""
        tally = Counter()
        for u in urls:
            # count each entity once per article, keeping first-seen order
            tally.update(list(dict.fromkeys(articles_dict[u])))
        return pd.DataFrame([
            {'Entity': entity, 'Percentage (%)': hits / len(urls) * 100}
            for entity, hits in tally.most_common(topn)
        ])

    mainstream_urls = []
    alternative_urls = []
    # dict keys are already unique, so no de-duplication pass is needed
    for url in articles_dict:
        domain = extract_domain(url)
        if domain not in newsguard_dict:
            continue  # domain has no score
        if newsguard_dict[domain] < 60.0:
            alternative_urls.append(url)
        else:
            mainstream_urls.append(url)

    return pd.concat([_ranking(mainstream_urls), _ranking(alternative_urls)], axis=1)

# Top 20 entities in mainstream vs. alternative articles (mainstream columns first).
top_entities_articles_m_a = top_entities_articles_mainstream_alternative(articles_entities_dict, 20)
top_entities_articles_m_a

Unnamed: 0,Entity,Percentage (%),Entity.1,Percentage (%).1
0,Trump,17.945819,Trump,27.524544
1,U.S.,15.533495,US,18.740361
2,American,11.156335,Donald Trump,18.301195
3,Donald Trump,11.086959,American,15.548007
4,the United States,10.135122,U.S.,13.767639
5,Republican,9.033982,Russia,13.399152
6,Washington,8.441719,the United States,13.145026
7,America,7.242032,America,11.54024
8,New York,6.87939,Russian,11.145287
9,Americans,6.759449,Obama,10.206797


In [18]:
# Re-derive the The_Donald slice from the current `redditdf`. The earlier `tddf` was cut
# before `entities_list` was added to `redditdf`, so on a top-to-bottom run it lacks that
# column and the loop below would raise KeyError. `.copy()` keeps it an independent frame.
tddf = redditdf[redditdf.subreddit=='The_Donald'].copy()

dfs = [twitterdf, redditdf, tddf, chandf, gabdf]
dfs_names = ['Twitter', 'Reddit', 'The_Donald', '4chan', 'Gab' ]
unique_ids_keys = ['tweet_id', 'id', 'id', 'post_id', 'post_id']

# Number of entities per post, stored on each frame and collected per platform.
entities_lengths = []
for df in dfs:
    df['entities_len'] = df['entities_list'].map(len)
    entities_lengths.append(df['entities_len'].tolist())

In [19]:
# Parallel lists (repeated from the previous cell): unique_ids_keys[i] names the id column
# of dfs[i]; find_top_entities reads unique_ids_keys as a module-level name.
dfs = [twitterdf, redditdf, tddf, chandf, gabdf]
dfs_names = ['Twitter', 'Reddit', 'The_Donald', '4chan', 'Gab' ]
unique_ids_keys = ['tweet_id', 'id', 'id', 'post_id', 'post_id']
def find_top_entities(dfs, dfs_names, topn):
    """Rank the ``topn`` most mentioned entities on each platform.

    For every frame in ``dfs`` the entity lists of all posts are pooled, noise
    tokens are removed, and the most frequent entities are reported with their
    mention count divided by the number of distinct post ids (the id column is
    taken from the module-level ``unique_ids_keys``), formatted as a percentage
    string.

    Returns a tuple ``(table, shared)``: ``table`` is the per-platform rankings
    concatenated column-wise, and ``shared`` lists the entities (drawn from the
    10 most frequent across the platform rankings) that appear in the ranking
    of more than 3 platforms.
    """
    noise_entities = ['LRB', 'RSB', 'RRB', 'Prev', 'PREV >', 'Pres', 'the Best Tl;Dr']
    noise_prefixes = ('####', 'compose?to')
    platform_tables = []
    leaders_by_platform = []
    for idx, frame in enumerate(dfs):
        id_column = unique_ids_keys[idx]
        label = dfs_names[idx]
        n_posts = len(set(frame[id_column].tolist()))

        mentions = []
        for post_entities in frame['entities_list'].tolist():
            mentions.extend(post_entities)
        kept = [m for m in mentions
                if not (m in noise_entities or m.startswith(noise_prefixes))]
        ranked = Counter(kept).most_common(topn)

        platform_tables.append(pd.DataFrame([
            {'Entity ' + '(' + label + ')': entity,
             'Percentage (%)': "%.2f%%" % (hits / n_posts * 100)}
            for entity, hits in ranked
        ]))
        leaders_by_platform.append([entity for entity, _ in ranked])

    leader_pool = [entity for leaders in leaders_by_platform for entity in leaders]
    shared = [entity
              for entity, n_platforms in Counter(leader_pool).most_common(10)
              if n_platforms > 3]
    return pd.concat(platform_tables, axis=1), shared
        
# Top 20 entities per platform, plus the entities that rank in more than 3 platforms' lists.
top_entities_table, common_entities = find_top_entities(dfs, dfs_names, 20)

In [20]:
# Display the per-platform top-entity table.
top_entities_table

Unnamed: 0,Entity (Twitter),Percentage (%),Entity (Reddit),Percentage (%).1,Entity (The_Donald),Percentage (%).2,Entity (4chan),Percentage (%).3,Entity (Gab),Percentage (%).4
0,Trump,7.67%,US,11.04%,Trump,9.46%,Trump,13.28%,Trump,2.98%
1,US,1.46%,Trump,9.03%,Clinton,8.70%,US,10.45%,US,2.16%
2,U.S.,1.26%,Russia,8.28%,Obama,8.05%,UK,8.36%,Obama,2.00%
3,Donald Trump,1.19%,Russian,6.27%,Hillary,7.29%,Israel,8.13%,FBI,1.89%
4,Russia,1.02%,U.S.,5.47%,US,6.66%,Russia,7.75%,Democrats,1.64%
5,Obama,1.02%,China,4.03%,CNN,6.50%,EU,6.62%,America,1.38%
6,Clinton,0.95%,Clinton,3.88%,FBI,5.09%,U.S.,5.76%,CNN,1.34%
7,GOP,0.91%,Obama,3.47%,Russia,5.01%,Russian,5.60%,U.S.,1.21%
8,UK,0.79%,FBI,3.44%,Muslim,4.62%,Donald J,5.42%,Russia,1.15%
9,CNN,0.68%,CNN,3.30%,Muslims,4.55%,TRUMPTV,5.39%,American,1.12%
