## FakeNewsNet

### Imports

In [None]:
# python imports
import os.path
import urllib.request
import shutil
import zipfile
import json
import re
from collections import Counter

# external library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.rdd import RDD
from wordcloud import WordCloud, STOPWORDS
from efficient_apriori import apriori

# project imports


### PySpark contexts

In [None]:
# Build the notebook's Spark session under the application name 'FakeNewsNet';
# getOrCreate() hands back an already-running session when there is one.
spark = SparkSession.builder.appName('FakeNewsNet').getOrCreate()
# SparkContext that backs the session.
sparkcontext = spark.sparkContext
# SQLContext built on that SparkContext; the loader functions below call
# sqlcontext.read.json(...) to parse the tweet files.
# NOTE(review): SQLContext is marked deprecated in recent PySpark releases in
# favour of SparkSession (spark.read) — confirm before migrating.
sqlcontext   = SQLContext(sparkcontext)

### Dataset

In [None]:
# ----------------------------------------
# download or reference the dataset
# ----------------------------------------
def get_or_download_dataset():
    """Return the local tweets directory, downloading the archive on first use.

    When ``./dataset`` already exists nothing is fetched and the path is
    returned unchanged; the data is then expected to have been placed there
    by hand. Otherwise the directory ``./dataset/tweets`` is created, the
    archive is saved as ``./dataset/tweets/results.zip`` and extracted into
    that same directory.

    Returns:
        str: ``"./dataset/tweets"``.

    Raises:
        urllib.error.URLError: if the download cannot be performed.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    path = "./dataset/tweets"
    if not os.path.exists("./dataset"):
        print("downloading tweets...")
        # The archive is stored inside ``path``, so ``path`` has to be a
        # directory (it used to be opened as the output file itself).
        os.makedirs(path, exist_ok=True)
        # lionel: no public access to download the dataset directly. manually download and place in the stated directory
        url = "https://nusu-my.sharepoint.com/:u:/g/personal/e0809358_u_nus_edu/ETPkp1-0GbBHgB__wyeCS_QBE7_SFluzSCtocU0mUr3Jng"
        archive_path = path + "/results.zip"
        with urllib.request.urlopen(url) as response, open(archive_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # Extract only after the archive file has been closed (and flushed).
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(path)
    return path
# ----------------------------------------
# convert to pandas dataframe
# ----------------------------------------
def to_dataframe_pandas(path, head_len = None):
    """Load tweet JSON files below ``path`` into one flattened pandas DataFrame.

    The expected layout is ``<path>/<directory>/tweets/<file>``, where every
    file holds a single JSON document. Nested objects are flattened by
    ``pd.json_normalize`` (dotted column names).

    Args:
        path: root directory whose sub-directories each contain a ``tweets``
            folder.
        head_len: optional cap on the number of JSON documents to load;
            ``None`` loads everything.

    Returns:
        pandas.DataFrame: one row per loaded JSON document.
    """
    directories = os.listdir(path)
    json_all = []
    how_many = "all" if head_len is None else "first " + str(head_len)
    print("reading " + how_many + " json files in " + str(len(directories)) + " directories in " + path + "...")
    for directory in directories:
        path_prefix = path + "/" + directory + "/tweets"
        for file in os.listdir(path_prefix):
            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            with open(path_prefix + "/" + file, 'r', encoding='utf-8') as json_file:
                json_all.append(json.load(json_file))
        # Stop walking directories once enough documents have been collected.
        if head_len is not None and len(json_all) >= head_len:
            break
    if head_len is not None:
        json_all = json_all[:head_len]
    # lionel: columns are mismatched among json files, so the records are
    # normalized in a single call instead of concatenating per-directory frames.
    return pd.json_normalize(json_all)
# ----------------------------------------
# convert to spark dataframe
# ----------------------------------------
def to_dataframe_spark(path, head_len = None):
    """Load tweet JSON files below ``path`` into a Spark DataFrame.

    The expected layout is ``<path>/<directory>/tweets/<file>``.

    Args:
        path: root directory whose sub-directories each contain a ``tweets``
            folder.
        head_len: optional cap on the number of files to read. ``None`` reads
            every file matching ``<path>/*/tweets/*.json`` in one call.

    Returns:
        pyspark.sql.DataFrame: the parsed tweets.

    Raises:
        AssertionError: if, with ``head_len`` set, the number of rows read
            differs from the number of files selected.
    """
    if head_len is None:
        # lionel: out-of-memory
        return sqlcontext.read.json(path + "/*/tweets/*.json")
    directories = os.listdir(path)
    json_files = []
    print("reading first " + str(head_len) + " json files in " + str(len(directories)) + " directories in " + path + "...")
    for directory in directories:
        path_prefix = path + "/" + directory + "/tweets"
        json_files.extend(path_prefix + "/" + file for file in os.listdir(path_prefix))
        # Stop walking directories once enough file names have been collected.
        if len(json_files) >= head_len:
            break
    # lionel: columns are mismatched among json files, so all selected files
    # are read in a single call rather than unioned directory by directory.
    selected_files = json_files[:head_len]
    df_all = sqlcontext.read.json(selected_files)
    # Sanity check: one row per file. Note that count() runs a Spark job.
    assert len(selected_files) == df_all.count()
    return df_all
# ----------------------------------------
# print helper function
# ----------------------------------------
def print_df(dfs):
    """Print a short overview of every DataFrame in ``dfs``.

    pandas DataFrames get their shape, ``info()``, columns and ``describe()``;
    Spark DataFrames get "(rows,columns)", their columns and ``summary()``.
    Objects of any other type only produce the separator line.

    Args:
        dfs: iterable of pandas and/or Spark DataFrames.
    """
    for df in dfs:
        print("==========")
        if isinstance(df, pd.DataFrame):
            print(df.shape)
            # info() writes its report itself and returns None, so it must
            # not be wrapped in print().
            df.info()
            print(df.columns)
            print(df.describe())
        elif isinstance(df, DataFrame):
            print("(" + str(df.count()) + "," + str(len(df.columns)) + ")")
            print(df.columns)
            # show() prints the table itself and returns None.
            df.summary().show()
    print("==========")
# ----------------------------------------
# call sites
# ----------------------------------------
# Notebook shell escapes (IPython "!" syntax): print the working directory and
# list its contents, which is where the relative "./dataset" paths resolve.
!pwd
!ls -l
# Disabled by the constant-false guard: loads the raw tweet files for each
# source/label pair through Spark and converts every result to pandas.
if False:
    path_dataset = get_or_download_dataset()
    # Only this call passes a head_len (100000); the other three pass none.
    df_fake_1 = to_dataframe_spark(path_dataset + "/gossipcop"  + "/fake", 100000).toPandas()
    print("df_fake_1 done")
    df_real_1 = to_dataframe_spark(path_dataset + "/gossipcop"  + "/real").toPandas()
    print("df_real_1 done")
    df_fake_2 = to_dataframe_spark(path_dataset + "/politifact" + "/fake").toPandas()
    print("df_fake_2 done")
    df_real_2 = to_dataframe_spark(path_dataset + "/politifact" + "/real").toPandas()
    print("df_real_2 done")
    # Print an overview of all four pandas frames.
    print_df([df_fake_1, df_real_1, df_fake_2, df_real_2])
# lionel: weirdly, pandas' and spark's json APIs yield different number of columns!

In [None]:
# ----------------------------------------
# download or reference the dataset
# ----------------------------------------
def get_or_download_dataset():
    """Return the relative path of the "group_by_user" dataset directory.

    Nothing is downloaded or checked here; the function only names the
    location. It replaces the same-named function from the earlier cell.

    Returns:
        str: ``"./dataset/group_by_user"``.
    """
    # Kept for reference only; this function never fetches it:
    # url = "https://nusu-my.sharepoint.com/personal/e0809358_u_nus_edu/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fe0809358%5Fu%5Fnus%5Fedu%2FDocuments%2FCS5344&ga=1"
    return "./dataset/group_by_user"
# ----------------------------------------
# convert to spark dataframe
# ----------------------------------------
def to_rdd_spark(path, head_len = None):
    """Load JSON files below ``path`` into a Spark DataFrame.

    Despite its name, this function returns what ``sqlcontext.read.json``
    returns, i.e. a Spark DataFrame, not an RDD.

    Args:
        path: dataset directory.
        head_len: optional cap on the number of files to read. ``None`` reads
            every file matching ``<path>/union/*.json`` in one call.

    Returns:
        pyspark.sql.DataFrame: the parsed records.

    Raises:
        AssertionError: if, with ``head_len`` set, the number of rows read
            differs from the number of files selected.
    """
    if head_len is None:
        # lionel: out-of-memory
        return sqlcontext.read.json(path + "/union/*.json")
    # NOTE(review): this branch walks <path>/<directory>/tweets/<file>, whereas
    # the unlimited branch above reads <path>/union/*.json — the two layouts
    # differ; confirm which one is intended before relying on head_len.
    directories = os.listdir(path)
    json_files = []
    print("reading first " + str(head_len) + " json files in " + str(len(directories)) + " directories in " + path + "...")
    for directory in directories:
        path_prefix = path + "/" + directory + "/tweets"
        json_files.extend(path_prefix + "/" + file for file in os.listdir(path_prefix))
        # Stop walking directories once enough file names have been collected.
        if len(json_files) >= head_len:
            break
    # lionel: columns are mismatched among json files, so all selected files
    # are read in a single call rather than unioned directory by directory.
    selected_files = json_files[:head_len]
    df_all = sqlcontext.read.json(selected_files)
    # Sanity check: one row per file. Note that count() runs a Spark job.
    assert len(selected_files) == df_all.count()
    return df_all
# ----------------------------------------
# call sites
# ----------------------------------------
# Notebook shell escapes (IPython "!" syntax): print the working directory and
# list its contents, which is where the relative "./dataset" paths resolve.
!pwd
!ls -l
path_dataset = get_or_download_dataset()
# For each source (gossipcop, politifact) and label (fake, real): read the
# files with Spark, then convert the result to a pandas DataFrame. The
# rdd_* variables hold the objects returned by to_rdd_spark (the results of
# sqlcontext.read.json), on which .toPandas() is called.
rdd_fake_1 = to_rdd_spark(path_dataset + "/gossipcop"  + "/fake")
df_fake_1  = rdd_fake_1.toPandas()
print("df_fake_1 done")
rdd_real_1 = to_rdd_spark(path_dataset + "/gossipcop"  + "/real")
df_real_1  = rdd_real_1.toPandas()
print("df_real_1 done")
rdd_fake_2 = to_rdd_spark(path_dataset + "/politifact" + "/fake")
df_fake_2  = rdd_fake_2.toPandas()
print("df_fake_2 done")
rdd_real_2 = to_rdd_spark(path_dataset + "/politifact" + "/real")
df_real_2  = rdd_real_2.toPandas()
print("df_real_2 done")
# Print an overview of all four pandas frames.
print_df([df_fake_1, df_real_1, df_fake_2, df_real_2])

### favourite_count

In [None]:
def favourite_count(df_fake_1, df_real_1, df_fake_2, df_real_2):
    """Show one box plot of the ``favourites_count`` column per input frame.

    Args:
        df_fake_1: frame plotted under the tick label 'gossipcop fake'.
        df_real_1: frame plotted under the tick label 'gossipcop real'.
        df_fake_2: frame plotted under the tick label 'politifact fake'.
        df_real_2: frame plotted under the tick label 'politifact real'.
    """
    tick_labels = [
        'gossipcop fake',
        'gossipcop real',
        'politifact fake',
        'politifact real',
    ]
    # Pull the column out of every frame, in tick-label order.
    samples = [
        frame['favourites_count']
        for frame in (df_fake_1, df_real_1, df_fake_2, df_real_2)
    ]

    plt.figure()
    plt.tick_params(labelsize=10)
    plt.boxplot(samples)
    axes = plt.gca()
    axes.set_xticklabels(tick_labels)
    plt.tight_layout()
    plt.show()

favourite_count(df_fake_1, df_real_1, df_fake_2, df_real_2)

### retweet_count

In [None]:
def retweet_count(df_fake_1, df_real_1, df_fake_2, df_real_2):
    """Show one box plot of the ``total_retweet_count`` column per input frame.

    Args:
        df_fake_1: frame plotted under the tick label 'gossipcop fake'.
        df_real_1: frame plotted under the tick label 'gossipcop real'.
        df_fake_2: frame plotted under the tick label 'politifact fake'.
        df_real_2: frame plotted under the tick label 'politifact real'.
    """
    tick_labels = [
        'gossipcop fake',
        'gossipcop real',
        'politifact fake',
        'politifact real',
    ]
    # Pull the column out of every frame, in tick-label order.
    samples = [
        frame['total_retweet_count']
        for frame in (df_fake_1, df_real_1, df_fake_2, df_real_2)
    ]

    plt.figure()
    plt.tick_params(labelsize=10)
    plt.boxplot(samples)
    axes = plt.gca()
    axes.set_xticklabels(tick_labels)
    plt.tight_layout()
    plt.show()

retweet_count(df_fake_1, df_real_1, df_fake_2, df_real_2)

### favourites_count vs. total_retweet_count (all four datasets)

In [None]:
def favorite_vs_retweet(df, title):
    """Scatter-plot ``total_retweet_count`` (y) against ``favourites_count`` (x).

    Args:
        df: frame providing both columns.
        title: text shown as the plot title.
    """
    x_column = 'favourites_count'
    y_column = 'total_retweet_count'
    xs = df[x_column]
    ys = df[y_column]

    plt.figure()
    plt.tick_params(labelsize=14)
    plt.scatter(xs, ys)
    plt.title(title, fontsize=14)
    # The axis captions are simply the column names.
    plt.xlabel(x_column, fontsize=12)
    plt.ylabel(y_column, fontsize=12)
    plt.tight_layout()
    plt.show()

# One scatter plot per dataset/label pair.
favorite_vs_retweet(df_fake_1, "gossipcop fake")
favorite_vs_retweet(df_real_1, "gossipcop real")
favorite_vs_retweet(df_fake_2, "politifact fake")
favorite_vs_retweet(df_real_2, "politifact real")

### top words in text

In [None]:
def text_wordcounter(df, head_limit):
    """Count lower-cased, whitespace-separated words in the tweets of ``df``.

    Every entry of ``df['list_of_tweets']`` is iterated as a collection of
    tweet strings. The tweet text is taken from each string with a regular
    expression that looks for a ``"text":"..."`` field directly followed by
    a ``"truncated"`` field; strings that do not match are skipped.

    Args:
        df: mapping/frame whose ``'list_of_tweets'`` entry is a sequence of
            tweet-string collections (one collection per row).
        head_limit: maximum number of rows to process; ``None`` processes
            every row.

    Returns:
        tuple: ``(word_freq, tweet_tokenlist)`` where ``word_freq`` is a
        ``Counter`` of words and ``tweet_tokenlist`` is a list that is
        currently always empty (token collection is not implemented).
    """
    # Compiled once instead of being looked up again for every tweet.
    text_pattern = re.compile(r"^.*?,\"text\":\"(.*)\",\"truncated\".*$")
    col_value = df['list_of_tweets']
    row_count = len(col_value)
    word_freq = Counter()
    tweet_tokenlist = []
    for index, tweets in enumerate(col_value):
        # Stop before processing row number head_limit, so that exactly
        # head_limit rows are counted (not head_limit + 1).
        if head_limit is not None and index >= head_limit:
            break
        for curr_tweet in tweets:
            match = text_pattern.search(curr_tweet)
            if match is None:
                # No recognizable text field: skip instead of crashing.
                continue
            # In-place update avoids rebuilding the whole counter per tweet.
            word_freq.update(word.lower() for word in match.group(1).split())
        if index % 1000 == 0:
            print(str(round((index / row_count) * 100, 2)) + "% of dataset")
    return word_freq, tweet_tokenlist

# Build a word-frequency Counter per dataset/label pair, each call passing
# head_limit = 1000. The second value of every pair is the token list that
# text_wordcounter returns alongside the counter.
counter_fake_1, tokenlist_fake_1 = text_wordcounter(df_fake_1, 1000)
print("df_fake_1 done")
counter_real_1, tokenlist_real_1 = text_wordcounter(df_real_1, 1000)
print("df_real_1 done")
counter_fake_2, tokenlist_fake_2 = text_wordcounter(df_fake_2, 1000)
print("df_fake_2 done")
counter_real_2, tokenlist_real_2 = text_wordcounter(df_real_2, 1000)
print("df_real_2 done")

### Association Rules with top keywords

In [None]:
# print(tokenlist_fake_1)
# itemsets, rules = apriori(tokenlist_fake_1, min_support=0.9, min_confidence=0.9)
# print(rules)

In [None]:
# Extra tokens to ignore on top of the word cloud package's STOPWORDS set.
additional_stopwords = {
    'to', 'a', 'the', 'and', 'on', '-', 'is', 'with',
    'i', 'of', 'you', '&amp;', 'for', 'out', 'air',
    'win', 'in', 'her', 'tv', 'at', 'after', 'best',
    'your', 'first', 'new', 'about', 'found', 'as',
    'how', 'via', 'my', 'by', 'if', 'one', 'four',
    'that', 'have', 'was', 'this', 'are', 'be', 'it',
    'has', 'from', 'all', 'but', 'just', 'not', 'u.s.',
    'only', 'two', 'more', 'will', 'an', 'me', 'had',
    'like', 'we', 'so', 'been', 'our', 'or', 'three',
    "i'm", 'years', 'see', 'top', '#eonline', '2018',
    'says', 'us', 'https', 'news', 't', 'co', 's',
    'https://t.co/g79yrlmstd', 'https://t.co/pas8l48opb',
}

# Combined stop-word set: a copy of STOPWORDS merged with the extras above.
stopwords = set(STOPWORDS).union(additional_stopwords)

# Remove every stop word from each counter; deleting a key that a Counter
# does not contain is a no-op, so no membership test is needed.
for word_counter in (counter_fake_1, counter_real_1, counter_fake_2, counter_real_2):
    for stopword in stopwords:
        del word_counter[stopword]

print("---- number of unqiue words ----")
for word_counter in (counter_fake_1, counter_real_1, counter_fake_2, counter_real_2):
    print(len(word_counter))

# The k most frequent remaining words per dataset/label pair.
k_count = 10
topk_fake_1 = counter_fake_1.most_common(k_count)
topk_real_1 = counter_real_1.most_common(k_count)
topk_fake_2 = counter_fake_2.most_common(k_count)
topk_real_2 = counter_real_2.most_common(k_count)

for heading, top_words in (
        ("gossipcop fake", topk_fake_1),
        ("gossipcop real", topk_real_1),
        ("politifact fake", topk_fake_2),
        ("politifact real", topk_real_2)):
    print("---- " + heading + " ----")
    print(*top_words, sep = "\n")

### Word Cloud

In [None]:
# Turn each counter back into one space-separated string; Counter.elements()
# repeats every word as many times as its count.
string_fake_1 = " ".join(counter_fake_1.elements())
string_real_1 = " ".join(counter_real_1.elements())
string_fake_2 = " ".join(counter_fake_2.elements())
string_real_2 = " ".join(counter_real_2.elements())

# Render one 800x800 word cloud per string, in the order
# gossipcop fake, gossipcop real, politifact fake, politifact real.
wc_string_list = [string_fake_1, string_real_1, string_fake_2, string_real_2]
for wc_string in wc_string_list:
    # Note: the variable name says "fake_1" but it is reused for all four texts.
    wordcloud_fake_1 = WordCloud(
        width = 800, height = 800, background_color ='white',
        collocations = False, stopwords = stopwords,
        min_font_size = 10).generate(wc_string)
    # Show the rendered cloud as an image without axes or padding.
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud_fake_1)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

### top id_str

In [None]:
# def top_id_str(df, title):
#     df_top_id = df[['id_str', 'favorite_count', 'retweet_count']]
#     print("---- " + title + " ----")
#     df_top_id = df_top_id.groupby('id_str')['favorite_count', 'retweet_count'].agg('count')
#     print(df_top_id)

# top_id_str(df_fake_1, "gossipcop fake")
# top_id_str(df_real_1, "gossipcop real")
# top_id_str(df_fake_2, "politifact fake")
# top_id_str(df_real_2, "politifact real")

### 