In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
# Load the dataset from its JSON file into the DataFrame used by the cells below.
df = pd.read_json(path_or_buf="fake_news_reddit_cikm20.json")

In [None]:
# Pick a reproducible random sample of rows: shuffle the row numbers
# 0..len(df)-1 with a fixed seed and keep the first 200 of them.
r = np.random.RandomState(seed=0)
random_perm = r.permutation(len(df))
random_indices = random_perm[:200]

In [None]:
# Fetch a single example article page and parse its HTML.
# A timeout is passed because requests otherwise waits indefinitely on an
# unresponsive server. The parser is named explicitly so the parse result does
# not depend on which optional parser libraries happen to be installed.
r = requests.get(
    "http://www.huffingtonpost.co.uk/2014/06/06/partially-taxidermied-african-dwarf-crocodile-scottish-golf-course-picture_n_5460051.html",
    timeout=20,
)
soup = BeautifulSoup(r.text, "html.parser")

In [None]:
# Request the pages of the sampled articles.
# Use parallel (asynchronous) requests to speed things up.

import asyncio
import aiohttp

async def get_url(index):
    """Download the page whose address is stored in ``df["url"][index]``.

    Args:
        index: Key used to look up the URL in ``df["url"]``.

    Returns:
        A ``(index, body)`` tuple, where ``body`` is whatever
        ``response.read()`` produced, or ``None`` if the request raised
        an exception.
    """
    # Look the URL up once; it is needed for the request and for both messages.
    url = df["url"][index]
    # aiohttp expects a ClientTimeout object; total=20 keeps the same
    # 20-second overall limit the bare number expressed.
    timeout = aiohttp.ClientTimeout(total=20)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url=url, timeout=timeout) as response:
                # NOTE(review): the HTTP status is not checked, so error pages
                # (4xx/5xx) are returned like any other body.
                resp = await response.read()
                print(f"Succesfully got {url}")
                return index, resp
    except Exception as e:
        # Best effort: report the failure and signal it with None instead of
        # raising, so one bad URL does not abort the whole batch.
        print(f"index:{index}, Unable to get url {url} due to {e.__class__}.")
        return None

async def main():
    """Fetch every sampled article concurrently.

    Creates one ``get_url`` call per entry of ``random_indices`` and waits for
    all of them to finish.

    Returns:
        list: The ``get_url`` results, in the same order as ``random_indices``.
    """
    downloads = (get_url(index) for index in random_indices)
    results = await asyncio.gather(*downloads)
    print("Finalized all. ret is a list of len {} outputs.".format(len(results)))
    return results

# Run all downloads and keep the results for the cells below.
# Top-level ``await`` works here because notebook cells run inside
# IPython/Jupyter; in a plain Python script this line is a syntax error.
ret = await main()

In [None]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The text to measure. Values without a ``split`` method (for
            example ``None`` or the float ``NaN`` pandas uses for missing
            cells) are treated as containing no words.

    Returns:
        int: Number of tokens produced by ``text.split()``; 0 for empty,
        whitespace-only or missing input.
    """
    split = getattr(text, "split", None)
    if split is None:
        # Missing values would otherwise raise AttributeError on .split().
        return 0
    return len(split())

# Keep DataFrame previews short: show at most 5 rows.
pd.options.display.max_rows = 5

# Count the words of every article and store the result as a new column.
word_counts = df["text"].apply(count_words)
df["word_count"] = word_counts

In [None]:
# Parse every downloaded page and print its <title> next to the dataset label.
success = 0
for result in ret:
    if not result:
        # Downloads that failed are represented by None; nothing to parse.
        continue
    index, resp = result
    try:
        # Name the parser explicitly so parsing does not depend on which
        # optional parser libraries are installed and so BeautifulSoup does
        # not warn about a guessed parser for every document.
        soup = BeautifulSoup(resp, "html.parser")
        print(f"index: {index}\n{df['url'][index]}:\n{soup.title.text}\nfake/legit: {df['label'][index]}\n")
        success += 1
    except AttributeError as e:
        # soup.title is None when the page has no <title> element, so the
        # .text access above raises AttributeError.
        print(f"Error with url: {df['url'][index]}\n{e}")
print(f"Got {success} / {len(ret)} title elements.")

In [None]:
# NOTE(review): `count_words` is also defined in an earlier cell; this later
# definition is the one in effect for the cells below.
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The text to measure. Values without a ``split`` method (for
            example ``None`` or the float ``NaN`` pandas uses for missing
            cells) are treated as containing no words.

    Returns:
        int: Number of tokens produced by ``text.split()``; 0 for empty,
        whitespace-only or missing input.
    """
    split = getattr(text, "split", None)
    if split is None:
        # Missing values would otherwise raise AttributeError on .split().
        return 0
    return len(split())

In [None]:
# Generate histogram for article lengths
import plotly.express as px

# Store the counts on the frame so the histogram plots exactly the values
# computed here rather than relying on a column left over from another cell.
word_counts = df["text"].apply(count_words)
df["word_count"] = word_counts

fig = px.histogram(
    df,
    x="word_count",
    nbins=100,
    title="Number of articles by word count",
    labels={"word_count": "Word count"},
)
# px.histogram labels the y axis "count" by default; give it a readable title.
fig.update_layout(yaxis_title="Number of articles")
fig.show()



In [None]:
# Export the histogram as an HTML fragment (full_html=False) to be uploaded to
# the website, keeping it interactive. plotly.js is not bundled
# (include_plotlyjs=False), so the hosting page is expected to load it.
fig.write_html(
    file="histogram.html",
    include_plotlyjs=False,
    full_html=False,
)

In [None]:
# Bar chart for research organisations.
import plotly.graph_objects as go

# NOTE(review): `num_of_docs_per_researched_by` is not defined in any cell
# visible here; it has to exist in the kernel before this cell is run.
fig = go.Figure(
    data=[
        go.Bar(
            x=num_of_docs_per_researched_by["researched_by"],
            y=num_of_docs_per_researched_by["num_of_docs"],
        )
    ]
)
fig.update_layout(
    title="Number of documents researched per truth warrior organization",
    xaxis_title="Organization",
    yaxis_title="Number of documents",
    legend_title="Legend Title",
)
fig.show()

In [35]:
# Export the current figure as an HTML fragment (full_html=False) into the
# website's include directory. plotly.js is not bundled
# (include_plotlyjs=False), so the hosting page is expected to load it.
fig.write_html(
    file="./website/_includes/html/documents_researched_per_organization.html",
    include_plotlyjs=False,
    full_html=False,
)