## FakeNewsNet

### Imports

In [None]:
# python imports
import collections
import json
import os.path
import shutil
import urllib.request
import zipfile

# external library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.rdd import RDD

# project imports


### PySpark contexts

In [None]:
# Create (or reuse, via getOrCreate) the Spark session for this notebook,
# registered under the application name 'FakeNewsNet'.
spark = SparkSession.builder.appName('FakeNewsNet').getOrCreate()
# The SparkContext that backs the session above.
sparkcontext = spark.sparkContext
# SQLContext built on that SparkContext; to_dataframe_spark reads JSON through it.
# NOTE(review): recent PySpark releases appear to deprecate SQLContext in favour
# of SparkSession (spark.read.json) -- confirm before upgrading.
sqlcontext   = SQLContext(sparkcontext)

### Dataset

In [None]:
# ----------------------------------------
# download or reference the dataset
# ----------------------------------------
def get_or_download_dataset():
    """Return the local tweet dataset directory, downloading it on first use.

    When ``./dataset`` does not exist, the function creates
    ``./dataset/tweets``, downloads the archive to
    ``./dataset/tweets/results.zip`` and extracts it into
    ``./dataset/tweets``. When ``./dataset`` already exists nothing is
    downloaded, so a manually placed dataset is used as-is.

    Returns:
        str: the dataset directory, ``"./dataset/tweets"``.

    Raises:
        OSError: if the download or a file operation fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    root = "./dataset"
    path = root + "/tweets"
    if not os.path.exists(root):
        print("downloading tweets...")
        # The archive is stored inside the extraction directory, so that
        # directory has to exist before the download starts.
        os.makedirs(path)
        # lionel: no public access to download the dataset directly. manually download and place in the stated directory
        url = "https://nusu-my.sharepoint.com/:u:/g/personal/e0809358_u_nus_edu/ETPkp1-0GbBHgB__wyeCS_QBE7_SFluzSCtocU0mUr3Jng"
        archive = path + "/results.zip"
        try:
            with urllib.request.urlopen(url) as response, open(archive, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
            # Open the archive only after the output file is closed, so every
            # downloaded byte has been flushed to disk.
            with zipfile.ZipFile(archive, "r") as zip_ref:
                zip_ref.extractall(path)
        except (OSError, zipfile.BadZipFile):
            # Remove the directory created above; otherwise the next call
            # would see ./dataset, skip the download and return a path with
            # no data in it.
            shutil.rmtree(root, ignore_errors=True)
            raise
    return path
# ----------------------------------------
# convert to pandas dataframe
# ----------------------------------------
def to_dataframe_pandas(path, head_len = None):
    """Load tweet JSON files under ``path`` into one flattened pandas DataFrame.

    The expected layout is ``<path>/<directory>/tweets/<file>``, where every
    file holds a single JSON document. Directories and files are visited in
    sorted name order so that a ``head_len`` prefix is reproducible.

    Args:
        path: directory whose sub-directories each contain a ``tweets`` folder.
        head_len: maximum number of JSON files to read (expected to be
            non-negative); ``None`` reads every file.

    Returns:
        pandas.DataFrame: ``pd.json_normalize`` of the loaded documents, one
        row per file read.

    Raises:
        OSError: if a directory or file cannot be listed or opened.
        json.JSONDecodeError: if a file does not contain valid JSON.
    """
    directories = sorted(os.listdir(path))
    how_many = "all" if head_len is None else "first " + str(head_len)
    print("reading " + how_many + " json files in " + str(len(directories)) + " directories in " + path + "...")

    def limit_reached():
        # True once head_len documents have been collected.
        return head_len is not None and len(json_all) >= head_len

    json_all = []
    for directory in directories:
        if limit_reached():
            break
        path_prefix = path + "/" + directory + "/tweets"
        for file in sorted(os.listdir(path_prefix)):
            if limit_reached():
                break
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(path_prefix + "/" + file, 'r', encoding='utf-8') as json_file:
                json_all.append(json.load(json_file))
    # lionel: columns are mismatched among json files, so the documents are
    # normalised once, together, rather than concatenating per-directory frames.
    return pd.json_normalize(json_all)
# ----------------------------------------
# convert to spark dataframe
# ----------------------------------------
def to_dataframe_spark(path, head_len = None):
    """Load tweet JSON files under ``path`` into a Spark DataFrame.

    The expected layout is ``<path>/<directory>/tweets/<file>``. File paths
    are collected in sorted name order and handed to ``sqlcontext.read.json``
    in a single call.

    Args:
        path: directory whose sub-directories each contain a ``tweets`` folder.
        head_len: maximum number of JSON files to read; ``None`` reads every
            file found.

    Returns:
        pyspark.sql.DataFrame: the result of ``sqlcontext.read.json`` over the
        selected files.

    Raises:
        OSError: if a directory cannot be listed.
        AssertionError: if ``head_len`` is given and the row count differs
            from the number of files read.
    """
    # lionel: reading the whole tree in one glob
    # (path + "/*/tweets/*.json") ran out of memory, so the file list is
    # built explicitly and can be capped with head_len.
    directories = sorted(os.listdir(path))
    how_many = "all" if head_len is None else "first " + str(head_len)
    print("reading " + how_many + " json files in " + str(len(directories)) + " directories in " + path + "...")

    json_files = []
    for directory in directories:
        if head_len is not None and len(json_files) >= head_len:
            break
        path_prefix = path + "/" + directory + "/tweets"
        json_files.extend(path_prefix + "/" + file for file in sorted(os.listdir(path_prefix)))
    if head_len is not None:
        json_files = json_files[:head_len]

    # lionel: columns are mismatched among json files, so all selected files
    # are read in one call instead of unioning per-directory frames.
    df_all = sqlcontext.read.json(json_files)
    if head_len is not None:
        # Sanity check, kept only for the bounded read: count() runs a Spark
        # job over the data.
        assert len(json_files) == df_all.count()
    return df_all
# ----------------------------------------
# print helper function
# ----------------------------------------
def print_df(dfs):
    """Print a short summary of each DataFrame in ``dfs``.

    For a pandas DataFrame this prints the shape, ``info()``, the columns and
    ``describe()``. For a Spark DataFrame it prints "(rows,cols)", the columns
    and ``summary()``. Objects of any other type only get the separator line.

    Args:
        dfs: iterable of pandas and/or Spark DataFrames.
    """
    for df in dfs:
        print("==========")
        if isinstance(df, pd.DataFrame):
            print(df.shape)
            # info() writes its report itself and returns None, so it must
            # not be wrapped in print().
            df.info()
            print(df.columns)
            print(df.describe())
        elif isinstance(df, DataFrame):
            print("(" + str(df.count()) + "," + str(len(df.columns)) + ")")
            print(df.columns)
            # show() prints the table itself and returns None.
            df.summary().show()
    print("==========")
# ----------------------------------------
# call sites
# ----------------------------------------
!pwd
!ls -l
# Resolve the dataset directory (a download is attempted only when ./dataset is missing).
path_dataset = get_or_download_dataset()
# Load at most 10000 tweet JSON files per source (gossipcop / politifact) and
# label (fake / real) as Spark DataFrames.
df_fake_1 = to_dataframe_spark(path_dataset + "/gossipcop"  + "/fake", 10000)
df_real_1 = to_dataframe_spark(path_dataset + "/gossipcop"  + "/real", 10000)
df_fake_2 = to_dataframe_spark(path_dataset + "/politifact" + "/fake", 10000)
df_real_2 = to_dataframe_spark(path_dataset + "/politifact" + "/real", 10000)
# Convert to pandas: the analysis cells that follow index these frames by column name.
df_fake_1 = df_fake_1.toPandas()
df_real_1 = df_real_1.toPandas()
df_fake_2 = df_fake_2.toPandas()
df_real_2 = df_real_2.toPandas()
# Print shape, columns and summary statistics for the four frames.
print_df([df_fake_1, df_real_1, df_fake_2, df_real_2])
# lionel: weirdly, pandas' and spark's json APIs yield different number of columns!
# NOTE(review): presumably because pd.json_normalize flattens nested objects into
# dotted columns while Spark keeps each nested object as one struct column -- confirm.

### favourite_count

In [None]:
def favourite_count(df_fake_1, df_real_1, df_fake_2, df_real_2):
    """Show box plots of the ``favorite_count`` column for the four tweet sets.

    Args:
        df_fake_1: gossipcop fake tweets.
        df_real_1: gossipcop real tweets.
        df_fake_2: politifact fake tweets.
        df_real_2: politifact real tweets.
    """
    column = 'favorite_count'
    labelled_frames = [
        ('gossipcop fake', df_fake_1),
        ('gossipcop real', df_real_1),
        ('politifact fake', df_fake_2),
        ('politifact real', df_real_2),
    ]
    # One series per box, in the same left-to-right order as the tick labels.
    samples = [frame[column] for _, frame in labelled_frames]
    tick_names = [name for name, _ in labelled_frames]

    plt.figure()
    plt.tick_params(labelsize=10)
    plt.boxplot(samples)
    plt.gca().set_xticklabels(tick_names)
    plt.tight_layout()
    plt.show()
    
# Box plots of favorite_count for the four frames loaded above.
favourite_count(df_fake_1, df_real_1, df_fake_2, df_real_2)

### retweet_count

In [None]:
def retweet_count(df_fake_1, df_real_1, df_fake_2, df_real_2):
    """Show box plots of the ``retweet_count`` column for the four tweet sets.

    Args:
        df_fake_1: gossipcop fake tweets.
        df_real_1: gossipcop real tweets.
        df_fake_2: politifact fake tweets.
        df_real_2: politifact real tweets.
    """
    column = 'retweet_count'
    labelled_frames = [
        ('gossipcop fake', df_fake_1),
        ('gossipcop real', df_real_1),
        ('politifact fake', df_fake_2),
        ('politifact real', df_real_2),
    ]
    # One series per box, in the same left-to-right order as the tick labels.
    samples = [frame[column] for _, frame in labelled_frames]
    tick_names = [name for name, _ in labelled_frames]

    plt.figure()
    plt.tick_params(labelsize=10)
    plt.boxplot(samples)
    plt.gca().set_xticklabels(tick_names)
    plt.tight_layout()
    plt.show()
    
# Box plots of retweet_count for the four frames loaded above.
retweet_count(df_fake_1, df_real_1, df_fake_2, df_real_2)

### favorite_count vs. retweet_count (one scatter plot per dataset)

In [None]:
def favorite_vs_retweet(df, title):
    """Show a scatter plot of ``retweet_count`` against ``favorite_count``.

    Args:
        df: frame providing the ``favorite_count`` and ``retweet_count`` columns.
        title: text shown as the plot title.
    """
    x_name, y_name = 'favorite_count', 'retweet_count'
    favorites = df[x_name]
    retweets = df[y_name]

    plt.figure()
    plt.tick_params(labelsize=14)
    plt.scatter(favorites, retweets)
    plt.title(title, fontsize=14)
    # Axis labels are the column names themselves.
    plt.xlabel(x_name, fontsize=12)
    plt.ylabel(y_name, fontsize=12)
    plt.tight_layout()
    plt.show()
    
# One favorite_count / retweet_count scatter plot per dataset.
favorite_vs_retweet(df_fake_1, "gossipcop fake")
favorite_vs_retweet(df_real_1, "gossipcop real")
favorite_vs_retweet(df_fake_2, "politifact fake")
favorite_vs_retweet(df_real_2, "politifact real")

### top words in text

In [None]:
def top_interesting_text(df, title, top_k=10):
    """Print the most frequent whitespace-separated tokens in ``df['text']``.

    Tokens are counted case-sensitively. A token is left out of the listing
    when its lower-cased form is in the stop-word set below. Output is a
    header line followed by up to ``top_k`` ``(token, count)`` tuples in
    descending count order; ties keep first-seen order.

    Args:
        df: frame with a ``text`` column.
        title: label printed in the header line.
        top_k: maximum number of tokens to print (default 10).
    """
    word_freq = collections.Counter()
    for curr_text in df['text']:
        # The column can hold None/NaN rather than text (columns are
        # mismatched among the json files), and those have no split().
        if not isinstance(curr_text, str):
            continue
        word_freq.update(curr_text.split())
    disable_set = {'to', 'a', 'the', 'and', 'on', '-', 'is', 'with',
                   'i', 'of', 'you', '&amp;', 'for', 'out', 'air',
                   'win', 'in', 'her', 'tv', 'at', 'after', 'best',
                   'your', 'first', 'new', 'about', 'found', 'as',
                   'how', 'via', 'my', 'by', 'if', 'one', 'four',
                   'that', 'have', 'was', 'this', 'are', 'be', 'it',
                   'has', 'from', 'all', 'but', 'just', 'not',
                   'only', 'two', 'more', 'will', 'an', 'me', 'had',
                   'like', 'we', 'so', 'been', 'our', 'or', 'three',
                   "i'm", 'years'
                  }
    print("---- " + title + " ----")
    remaining = top_k
    # most_common() without an argument sorts every item by count, descending,
    # and keeps insertion order among equal counts.
    for elem in word_freq.most_common():
        if remaining <= 0:
            break
        if elem[0].lower() in disable_set:
            continue
        print(elem)
        remaining -= 1
    
# Most frequent tokens in the text column, per dataset.
top_interesting_text(df_fake_1, "gossipcop fake")
top_interesting_text(df_real_1, "gossipcop real")
top_interesting_text(df_fake_2, "politifact fake")
top_interesting_text(df_real_2, "politifact real")

### top id_str

In [None]:
def top_id_str(df, title):
    """Print, per ``id_str``, the number of non-null favorite and retweet values.

    Args:
        df: frame with ``id_str``, ``favorite_count`` and ``retweet_count``
            columns.
        title: label printed in the header line.
    """
    value_columns = ['favorite_count', 'retweet_count']
    df_top_id = df[['id_str'] + value_columns]
    print("---- " + title + " ----")
    # Select the grouped columns with a list: selecting several columns with
    # a bare tuple (['a', 'b'] written without the inner brackets) raises
    # ValueError on pandas 2.x.
    df_top_id = df_top_id.groupby('id_str')[value_columns].agg('count')
    print(df_top_id)

# Per-id_str row counts for each dataset.
top_id_str(df_fake_1, "gossipcop fake")
top_id_str(df_real_1, "gossipcop real")
top_id_str(df_fake_2, "politifact fake")
top_id_str(df_real_2, "politifact real")

### 

### Association Rules with top keywords