## FakeNewsNet (Exploratory Data Analysis)

### Imports

In [None]:
# python imports
import os
import zipfile
import json
import re

from collections import Counter

# external library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql import DataFrame
from pyspark.rdd import RDD

from wordcloud import WordCloud, STOPWORDS
from efficient_apriori import apriori
from plotnine import *

### Datasets

In [None]:
# ----------------------------------------
# download and unzip dataset if missing
# ----------------------------------------
def get_or_download_dataset():
    # Ensure ./dataset exists, downloading and unpacking the four source files on first use.
    # Returns the dataset root path ("./dataset") in both the download and the skip case.
    # NOTE: the lines starting with "!" are IPython shell escapes, so this only runs in a notebook.
    path_dataset       = "./dataset"
    path_group_by_user = "./dataset/group_by_user"
    path_unique_users  = "./dataset/unique_users"
    # The existence of ./dataset is the only "already downloaded" check; its contents are not inspected.
    if not os.path.exists(path_dataset):
        print("creating directories here...")
        !pwd
        os.mkdir(path_dataset)
        os.mkdir(path_group_by_user)
        os.mkdir(path_unique_users)
        print("downloading dataset...")
        # https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99
        # NOTE(review): file_link_*, file_id_*, file_name_3 and file_name_4 are never read below;
        # the wget commands hard-code the same ids and output names.
        file_link_1 = 'https://drive.google.com/file/d/1UBFC0m5F4sln-YSP3zkq-5__ZJVzycme/view?usp=sharing'
        file_link_2 = 'https://drive.google.com/file/d/1gXmSAoH-gT7fAcq0g16tnb-fx1hTDv5M/view?usp=sharing'
        file_link_3 = 'https://drive.google.com/file/d/1KpTogf6HIgicXDjrEANjAO4Bg-5zQ2Iq/view?usp=sharing'
        file_link_4 = 'https://drive.google.com/file/d/1C09l3Mq7SrzJ1hYFK6Y2kbRbZsmko3Ew/view?usp=sharing'
        file_id_1   = '1UBFC0m5F4sln-YSP3zkq-5__ZJVzycme'
        file_id_2   = '1gXmSAoH-gT7fAcq0g16tnb-fx1hTDv5M'
        file_id_3   = '1KpTogf6HIgicXDjrEANjAO4Bg-5zQ2Iq'
        file_id_4   = '1C09l3Mq7SrzJ1hYFK6Y2kbRbZsmko3Ew'
        file_name_1 = 'gossipcop.zip'
        file_name_2 = 'politifact.zip'
        file_name_3 = 'gc_all.csv'
        file_name_4 = 'pf_all.csv'
        # Each command makes two requests: the inner wget saves cookies and sed pulls the
        # "confirm" token out of the response; the outer wget replays it to fetch the file.
        # The cookie file is removed after every download.
        !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UBFC0m5F4sln-YSP3zkq-5__ZJVzycme' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1UBFC0m5F4sln-YSP3zkq-5__ZJVzycme" -O gossipcop.zip  && rm -rf /tmp/cookies.txt
        !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1gXmSAoH-gT7fAcq0g16tnb-fx1hTDv5M' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1gXmSAoH-gT7fAcq0g16tnb-fx1hTDv5M" -O politifact.zip && rm -rf /tmp/cookies.txt
        !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1KpTogf6HIgicXDjrEANjAO4Bg-5zQ2Iq' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1KpTogf6HIgicXDjrEANjAO4Bg-5zQ2Iq" -O gc_all.csv     && rm -rf /tmp/cookies.txt
        !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1C09l3Mq7SrzJ1hYFK6Y2kbRbZsmko3Ew' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1C09l3Mq7SrzJ1hYFK6Y2kbRbZsmko3Ew" -O pf_all.csv     && rm -rf /tmp/cookies.txt
        !ls -l
        print("extracting dataset...")
        # Both archives are unpacked into ./dataset/group_by_user.
        # NOTE(review): each zip is removed while its ZipFile handle is still open.
        with zipfile.ZipFile(file_name_1, 'r') as zip_ref:
            zip_ref.extractall(path_group_by_user)
            !rm 'gossipcop.zip'
        with zipfile.ZipFile(file_name_2, 'r') as zip_ref:
            zip_ref.extractall(path_group_by_user)
            !rm 'politifact.zip'
        # The two CSV files are moved into ./dataset/unique_users.
        !mv gc_all.csv ./dataset/unique_users
        !mv pf_all.csv ./dataset/unique_users
        print("dataset successfully downloaded and extracted!")
    else:
        print("dataset path exists! skipping...")
    return path_dataset
# ----------------------------------------
# call sites
# ----------------------------------------
# Runs when the cell executes; the returned dataset root is reused by the loading cell below.
path_dataset = get_or_download_dataset()

### Dataframes

In [None]:
# ----------------------------------------
# load tweet json files into a spark dataframe
# ----------------------------------------
def to_rdd_spark(sqlcontext, path, head_len = None):
    """Load tweet JSON files found under *path* through ``sqlcontext.read.json``.

    Args:
        sqlcontext: object exposing ``read.json(...)`` (the notebook passes a
            Spark ``SQLContext``).
        path: directory of one dataset split.
        head_len: when ``None`` every file matching ``<path>/union/*.json`` is
            read. Otherwise only the first ``head_len`` files found under
            ``<path>/<directory>/tweets`` are read.

    Returns:
        Whatever ``sqlcontext.read.json`` returns for the selected files.

    Raises:
        AssertionError: if the loaded row count differs from the number of
            files selected in ``head_len`` mode.
    """
    if head_len is None:
        print("reading " + path + "...")
        return sqlcontext.read.json(path + "/union/*.json")

    # Sorted so that "first N files" means the same files on every run;
    # os.listdir makes no ordering promise.
    directories = sorted(os.listdir(path))
    print("reading first " + str(head_len) + " json files in " + str(len(directories)) + " directories in " + path + "...")

    json_files = []
    for directory in directories:
        path_prefix = path + "/" + directory + "/tweets"
        # Entries without a tweets folder (e.g. the "union" folder read above)
        # are skipped instead of aborting the scan.
        if not os.path.isdir(path_prefix):
            continue
        json_files.extend(path_prefix + "/" + file for file in sorted(os.listdir(path_prefix)))
        if len(json_files) >= head_len:
            break

    selected = json_files[:head_len]
    df_all = sqlcontext.read.json(selected)
    # Sanity check kept from the original; presumably each file holds exactly
    # one JSON record - confirm against the dataset. Note it forces a count.
    assert len(selected) == df_all.count()
    return df_all
# ----------------------------------------
# call sites
# ----------------------------------------
# One Spark session for the notebook; the SQLContext built on its SparkContext
# supplies the JSON reader that to_rdd_spark uses.
sparksession = SparkSession.builder.appName('FakeNewsNet').getOrCreate()
sparkcontext = sparksession.sparkContext
sqlcontext   = SQLContext(sparkcontext)

# head_len is omitted, so every call reads <path>/union/*.json.
# Suffix _1 is gossipcop, suffix _2 is politifact.
print("reading RDDs...")
dd_fake_1 = to_rdd_spark(sqlcontext, path_dataset + "/group_by_user" + "/gossipcop"  + "/fake")
dd_real_1 = to_rdd_spark(sqlcontext, path_dataset + "/group_by_user" + "/gossipcop"  + "/real")
dd_fake_2 = to_rdd_spark(sqlcontext, path_dataset + "/group_by_user" + "/politifact" + "/fake")
dd_real_2 = to_rdd_spark(sqlcontext, path_dataset + "/group_by_user" + "/politifact" + "/real")

# pandas copies of the four frames above, used by the matplotlib cells.
print("reading Dataframes...")
df_fake_1 = dd_fake_1.toPandas()
df_real_1 = dd_real_1.toPandas()
df_fake_2 = dd_fake_2.toPandas()
df_real_2 = dd_real_2.toPandas()

# The CSV files are read from ./dataset/unique_users, where the download cell moves them.
print("reading CSVs...")
df_uniq_1 = pd.read_csv("./dataset/unique_users/gc_all.csv")
df_uniq_2 = pd.read_csv("./dataset/unique_users/pf_all.csv")

print("reading done!")

### Print Datasets

In [None]:
# ----------------------------------------
# print raw data
# ----------------------------------------
def print_dataset(dataset):
    """Print a short overview of every item in *dataset*.

    Args:
        dataset: iterable whose items may be a Spark ``RDD``, a Spark
            ``DataFrame`` or a pandas ``DataFrame``. Items of any other type
            are ignored.

    Returns:
        None. Everything is written to standard output.
    """
    for data in dataset:
        if isinstance(data, RDD):
            # take() returns a plain Python list, so the rows are printed directly.
            for row in data.take(20):
                print(row)
        elif isinstance(data, DataFrame):
            print("(" + str(data.count()) + "," + str(len(data.columns)) + ")")
            print(data.columns)
            # show() prints the table itself and returns None, so it is not wrapped in print().
            data.summary().show()
        elif isinstance(data, pd.DataFrame):
            print(data.shape)
            # info() writes its report itself and returns None, so it is not wrapped in print().
            data.info()
            print(data.columns)
            print(data.describe())
# ----------------------------------------
# call sites
# ----------------------------------------
# dd_* are the objects returned by to_rdd_spark, df_* their pandas copies,
# and df_uniq_* the frames read from the CSV files.
print("========== RDDs ==========")
print_dataset([dd_fake_1, dd_real_1, dd_fake_2, dd_real_2])
print("========== DFs ==========")
print_dataset([df_fake_1, df_real_1, df_fake_2, df_real_2])
print("========== CSVs ==========")
print_dataset([df_uniq_1, df_uniq_2])

### Pie Charts

In [None]:
# ----------------------------------------
# visualize pie chart
# ----------------------------------------
def visualize_pie_chart(df_fake, df_real, column_name, labels):
    """Show a two-slice pie comparing the column total of the fake and real frames.

    Args:
        df_fake: frame providing the first slice.
        df_real: frame providing the second slice.
        column_name: column summed in each frame.
        labels: slice labels, in (fake, real) order.
    """
    totals = np.array([frame[column_name].sum() for frame in (df_fake, df_real)])
    plt.pie(totals, labels=labels, autopct='%1.1f%%')
    plt.show()
# ----------------------------------------
# call sites
# ----------------------------------------
# One pie per dataset for each of the three columns; the first label belongs to the fake frame.
print("========== num_of_tweets ==========")
visualize_pie_chart(df_fake_1, df_real_1, "num_of_tweets", ["Gossipcop Fake", "Gossipcop Real"])
visualize_pie_chart(df_fake_2, df_real_2, "num_of_tweets", ["Politifact Fake", "Politifact Real"])

print("========== total_favorite_count ==========")
visualize_pie_chart(df_fake_1, df_real_1, "total_favorite_count", ["Gossipcop Fake", "Gossipcop Real"])
visualize_pie_chart(df_fake_2, df_real_2, "total_favorite_count", ["Politifact Fake", "Politifact Real"])

print("========== total_retweet_count ==========")
visualize_pie_chart(df_fake_1, df_real_1, "total_retweet_count", ["Gossipcop Fake", "Gossipcop Real"])
visualize_pie_chart(df_fake_2, df_real_2, "total_retweet_count", ["Politifact Fake", "Politifact Real"])

### Histogram Bins of Follower Count

In [None]:
# ----------------------------------------
# visualize histogram bin
# ----------------------------------------
def visualize_histogram_bin(df_uniq, column_name, title):
    """Show a bar chart of how many rows fall into each order-of-magnitude bin.

    Args:
        df_uniq: pandas frame holding the column to bin.
        column_name: numeric column to bin; missing values are ignored.
        title: chart title.
    """
    # Interior boundaries. The first bin is open below and the last bin is
    # open above, so values of one million or more are counted instead of
    # falling into an empty [1m, 1m) range.
    boundaries = [100, 1000, 10000, 100000, 1000000]
    bin_names  = ["<100", "100-1k", "1k-10k", "10k-100k", "100k-1m", ">=1m"]
    x_values   = np.arange(len(bin_names))

    values = df_uniq[column_name].dropna()
    edges = [-np.inf, *boundaries, np.inf]
    # Each bin is half-open: lower edge included, upper edge excluded.
    y_values = [
        int(((values >= lower) & (values < upper)).sum())
        for lower, upper in zip(edges[:-1], edges[1:])
    ]

    plt.bar(x_values, y_values, 0.2, label=column_name)
    plt.xticks(x_values, bin_names)
    plt.xlabel(column_name)
    plt.title(title)
    plt.legend()
    plt.show()
# ----------------------------------------
# call sites
# ----------------------------------------
# One follower-count chart per CSV frame (df_uniq_1 is gossipcop, df_uniq_2 is politifact).
visualize_histogram_bin(df_uniq_1, "followers_count", "Histogram of Followers Count (Gossipcop)")
visualize_histogram_bin(df_uniq_2, "followers_count", "Histogram of Followers Count (Politifact)")

### Favorites vs Retweets Charts

In [None]:
# ----------------------------------------
# Favorites vs Retweets
# ----------------------------------------
def favorite_vs_retweet(df, title):
    """Scatter-plot ``favourites_count`` against ``total_retweet_count`` for *df*.

    Args:
        df: frame containing both columns.
        title: chart title.
    """
    x_column = 'favourites_count'
    y_column = 'total_retweet_count'
    # Both columns are looked up before any figure is created.
    xs, ys = df[x_column], df[y_column]

    plt.figure()
    plt.tick_params(labelsize=14)
    plt.scatter(xs, ys)
    plt.title(title, fontsize=14)
    plt.xlabel(x_column, fontsize=12)
    plt.ylabel(y_column, fontsize=12)
    plt.tight_layout()
    plt.show()
# ----------------------------------------
# call sites
# ----------------------------------------
# One scatter plot per pandas frame; the second argument is only the chart title.
favorite_vs_retweet(df_fake_1, "gossipcop fake")
favorite_vs_retweet(df_real_1, "gossipcop real")
favorite_vs_retweet(df_fake_2, "politifact fake")
favorite_vs_retweet(df_real_2, "politifact real")

### Retweets Fake vs Real Charts

In [None]:
# ----------------------------------------
# Retweets Fake vs Real Chart
# ----------------------------------------
def retweets_fake_vs_real(rdd_fake, rdd_real, ceiling = None):
    """Build a per-user table of retweet totals for the fake and the real split.

    Args:
        rdd_fake: frame supporting ``groupBy(...).sum(...).toPandas()`` for the fake split.
        rdd_real: same, for the real split.
        ceiling: when given, users whose total in a split exceeds it are
            removed from that split before the two are merged.

    Returns:
        pandas frame with ``user_id``, ``sum(total_retweet_count)_x`` (fake),
        ``sum(total_retweet_count)_y`` (real) and ``total_retweets``, where a
        missing side counts as 0 in the total.
    """
    summed = "sum(total_retweet_count)"

    def _totals_per_user(frame):
        # Aggregate per user, then bring the result over to pandas.
        return frame.groupBy("user_id").sum("total_retweet_count").toPandas()

    def _without_heavy_users(totals):
        # Drop the rows whose total is above the ceiling.
        return totals.drop(totals.index[totals[summed] > ceiling])

    fake_totals = _totals_per_user(rdd_fake)
    real_totals = _totals_per_user(rdd_real)
    if ceiling is not None:
        fake_totals = _without_heavy_users(fake_totals)
        real_totals = _without_heavy_users(real_totals)

    # The outer join keeps users that appear in only one of the two splits.
    merged = pd.merge(fake_totals, real_totals, on='user_id', how='outer')
    fake_part = merged[summed + '_x'].fillna(0)
    real_part = merged[summed + '_y'].fillna(0)
    merged['total_retweets'] = fake_part + real_part
    return merged
# ----------------------------------------
# call sites
# ----------------------------------------
# must call in separate cells for chart to render

In [None]:
print("========== Gossipcop ==========")
# The _x column holds the fake-split total and the _y column the real-split total.
df_fake_vs_real_gc = retweets_fake_vs_real(dd_fake_1, dd_real_1)
p = ggplot(aes(x='sum(total_retweet_count)_x', y='sum(total_retweet_count)_y'), df_fake_vs_real_gc)
# Left as the last expression of the cell so the notebook displays the plot.
p + geom_point()

In [None]:
print("========== Gossipcop Zoom In ==========")
# Same plot with ceiling=50: users above 50 retweets in a split are removed from that split.
df_fake_vs_real_gc_zoom = retweets_fake_vs_real(dd_fake_1, dd_real_1, 50)
p = ggplot(aes(x='sum(total_retweet_count)_x', y='sum(total_retweet_count)_y'), df_fake_vs_real_gc_zoom)
# Left as the last expression of the cell so the notebook displays the plot.
p + geom_point()

In [None]:
print("========== Politifact ==========")
# The _x column holds the fake-split total and the _y column the real-split total.
df_fake_vs_real_pf = retweets_fake_vs_real(dd_fake_2, dd_real_2)
p = ggplot(aes(x='sum(total_retweet_count)_x', y='sum(total_retweet_count)_y'), df_fake_vs_real_pf)
# Left as the last expression of the cell so the notebook displays the plot.
p + geom_point()

In [None]:
print("========== Politifact Zoom In ==========")
# Same plot with ceiling=50: users above 50 retweets in a split are removed from that split.
df_fake_vs_real_pf_zoom = retweets_fake_vs_real(dd_fake_2, dd_real_2, 50)
p = ggplot(aes(x='sum(total_retweet_count)_x', y='sum(total_retweet_count)_y'), df_fake_vs_real_pf_zoom)
# Left as the last expression of the cell so the notebook displays the plot.
p + geom_point()

### Retweets Fake vs Real Histograms

In [None]:
# ----------------------------------------
# visualize top retweeters distribution
# ----------------------------------------
def visualize_retweeters_topk(df_fake_real, dataset_name, topk):
    """Show grouped bars of fake and real retweet totals for the most active users.

    Args:
        df_fake_real: pandas frame with ``total_retweets``,
            ``sum(total_retweet_count)_x`` (fake) and
            ``sum(total_retweet_count)_y`` (real) columns.
        dataset_name: name used in the chart title.
        topk: maximum number of users to show, ranked by ``total_retweets``.
    """
    df_topk = df_fake_real.sort_values("total_retweets", ascending = False).head(topk)

    # Positions follow the rows actually returned: with fewer than topk users,
    # topk positions would not match the bar heights and plt.bar would fail.
    x_values  = np.arange(len(df_topk))
    bin_names = x_values + 1  # rank labels 1..n
    topk_fake = df_topk["sum(total_retweet_count)_x"]
    topk_real = df_topk["sum(total_retweet_count)_y"]

    # Two 0.4-wide bars per user, centred on the rank tick.
    plt.bar(x_values - 0.2, topk_fake, 0.4, label = 'Fake')
    plt.bar(x_values + 0.2, topk_real, 0.4, label = 'Real')
    plt.xticks(x_values, bin_names)
    plt.ylabel("Number of retweets")
    plt.title(dataset_name + " Top " + str(topk) + " users by number of retweets")
    plt.legend()
    plt.show()
# ----------------------------------------
# call sites
# ----------------------------------------
# Uses the unfiltered (no ceiling) per-user tables built in the scatter-plot cells above.
visualize_retweeters_topk(df_fake_vs_real_gc, "Gossipcop", 20)
visualize_retweeters_topk(df_fake_vs_real_pf, "Politifact", 20)

### Tweet Word Counter

In [None]:
# ----------------------------------------
# count frequency of words in tweets
# ----------------------------------------
def tweet_word_counter(df, sample_percent, collect_tokens=False):
    """Count lower-cased, whitespace-separated words in the tweets of *df*.

    Args:
        df: pandas frame with a ``list_of_tweets`` column; each cell is an
            iterable of raw tweet strings (or ``None``).
        sample_percent: stop once more than this percentage of the rows has
            been processed (checked every 100 rows); ``None`` processes
            every row.
        collect_tokens: when true, also collect the unique words of every
            tweet. Off by default, so the second return value is empty.

    Returns:
        ``(word_freq, tweet_tokenlist)``: a ``Counter`` of word frequencies
        and a list with one tuple of unique words per tweet (empty unless
        ``collect_tokens`` is true).
    """
    # The tweet text is whatever sits between ,"text":" and ","truncated".
    text_pattern = re.compile(r"^.*?,\"text\":\"(.*)\",\"truncated\".*$")
    col_value = df['list_of_tweets']
    row_count = len(col_value)
    word_freq = Counter()
    tweet_tokenlist = []

    for index, tweets in enumerate(col_value):
        for curr_tweet in (tweets if tweets is not None else ()):
            match = text_pattern.search(curr_tweet)
            if match is None:
                # A tweet without a recognisable text field is skipped
                # instead of aborting the whole count.
                continue
            words = [word.lower() for word in match.group(1).split()]
            # In-place update; adding Counters would rebuild the whole
            # vocabulary for every tweet.
            word_freq.update(words)
            if collect_tokens:
                tweet_tokenlist.append(tuple(set(words)))
        if index % 100 == 0:
            # Progress is recomputed, not accumulated, so it is the real
            # share of rows seen and the sample cut-off fires where intended.
            complete_percent = round((index / row_count) * 100, 2)
            print(str(complete_percent) + "% of dataset")
            if sample_percent is not None and complete_percent > sample_percent:
                break
    return word_freq, tweet_tokenlist
# ----------------------------------------
# call sites
# ----------------------------------------
# Every frame is sampled with sample_percent=5; the token lists are only
# referenced by the commented-out apriori cell at the end of the notebook.
print("processing Gossipcop Fake...")
counter_fake_1, tokenlist_fake_1 = tweet_word_counter(df_fake_1, 5)
print("processing Gossipcop Real...")
counter_real_1, tokenlist_real_1 = tweet_word_counter(df_real_1, 5)
print("processing Politifact Fake...")
counter_fake_2, tokenlist_fake_2 = tweet_word_counter(df_fake_2, 5)
print("processing Politifact Real...")
counter_real_2, tokenlist_real_2 = tweet_word_counter(df_real_2, 5)
print("processing done!")

### Stopwords

In [None]:
# ----------------------------------------
# additional stopwords to remove
# ----------------------------------------
# Extra tokens removed on top of wordcloud's STOPWORDS (merged in the call
# sites below). All entries are lower case, matching the lower-cased keys
# produced by tweet_word_counter.
additional_stopwords = {
    'to', 'a', 'the', 'and', 'on', '-', 'is', 'with',
    'i', 'of', 'you', '&amp;', 'for', 'out', 'air',
    'win', 'in', 'her', 'tv', 'at', 'after', 'best',
    'your', 'first', 'new', 'about', 'found', 'as',
    'how', 'via', 'my', 'by', 'if', 'one', 'four',
    'that', 'have', 'was', 'this', 'are', 'be', 'it',
    'has', 'from', 'all', 'but', 'just', 'not', 'u.s.',
    'only', 'two', 'more', 'will', 'an', 'me', 'had',
    'like', 'we', 'so', 'been', 'our', 'or', 'three',
    "i'm", 'years', 'see', 'top', '#eonline', '2018',
    'says', 'us', 'https', 'news', 't', 'co', 's',
    'https://t.co/g79yrlmstd', 'https://t.co/pas8l48opb'
    }
def filter_stopwords(counter, stopwords):
    """Remove every stopword key from *counter* in place.

    Args:
        counter: mutable mapping of word to count (a ``Counter`` in this notebook).
        stopwords: iterable of words to remove; words not present are ignored.

    Returns:
        None. *counter* is modified in place.
    """
    for stopword in stopwords:
        # pop with a default removes the key once and tolerates missing keys
        # on a plain dict as well as on a Counter.
        counter.pop(stopword, None)
def show_top_words(counter, topk = 20):
    """Print the vocabulary size of *counter* followed by its most frequent words.

    Args:
        counter: ``Counter`` of word frequencies.
        topk: number of (word, count) pairs to print, one per line.
    """
    print("number of unique words: " + str(len(counter)))
    topk_words = counter.most_common(topk)
    print("most common words:")
    print(*topk_words, sep = "\n")
# ----------------------------------------
# call sites
# ----------------------------------------
# Copy wordcloud's STOPWORDS before extending it, so the library constant stays untouched.
stopwords  = set(STOPWORDS)
stopwords |= additional_stopwords

# The counters are filtered in place; the word-cloud cell reuses them afterwards.
filter_stopwords(counter_fake_1, stopwords)
filter_stopwords(counter_real_1, stopwords)
filter_stopwords(counter_fake_2, stopwords)
filter_stopwords(counter_real_2, stopwords)

print("========== Gossipcop Fake ==========")
show_top_words(counter_fake_1)
print("========== Gossipcop Real ==========")
show_top_words(counter_real_1)
print("========== Politifact Fake ==========")
show_top_words(counter_fake_2)
print("========== Politifact Real ==========")
show_top_words(counter_real_2)

### Word Clouds

In [None]:
# ----------------------------------------
# visualize using word cloud
# ----------------------------------------
def visualize_word_cloud(counter, stopwords):
    """Render an 800x800 word cloud from the words held in *counter*.

    Args:
        counter: ``Counter`` of word frequencies; each word is repeated as
            often as its count in the text handed to the cloud generator.
        stopwords: words passed to ``WordCloud`` as stopwords.
    """
    text = " ".join(counter.elements())
    cloud_settings = dict(
        width=800,
        height=800,
        background_color='white',
        collocations=False,
        stopwords=stopwords,
        min_font_size=10,
    )
    cloud = WordCloud(**cloud_settings).generate(text)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# ----------------------------------------
# call sites
# ----------------------------------------
# One cloud per counter, using the merged stopword set built in the stopwords cell.
print("========== Gossipcop Fake ==========")
visualize_word_cloud(counter_fake_1, stopwords)
print("========== Gossipcop Real ==========")
visualize_word_cloud(counter_real_1, stopwords)
print("========== Politifact Fake ==========")
visualize_word_cloud(counter_fake_2, stopwords)
print("========== Politifact Real ==========")
visualize_word_cloud(counter_real_2, stopwords)

### Association Rules with Top Keywords

In [None]:
# ----------------------------------------
# Skipped - takes too long
# ----------------------------------------
# itemsets, rules = apriori(tokenlist_fake_1, min_support=0.9, min_confidence=0.9)
# itemsets, rules = apriori(tokenlist_real_1, min_support=0.9, min_confidence=0.9)
# itemsets, rules = apriori(tokenlist_fake_2, min_support=0.9, min_confidence=0.9)
# itemsets, rules = apriori(tokenlist_real_2, min_support=0.9, min_confidence=0.9)