## Collect All Article Data

In [1]:
# Get the articles
import requests
import json

url = "https://api-hoaxy.p.rapidapi.com/articles"

headers = {
    'x-rapidapi-host': "api-hoaxy.p.rapidapi.com",
    'x-rapidapi-key': "6dda23c3b6msh40fb43640818777p173117jsn2e799b5b84a7"
    }

vbm_query = "(mailin or ballot or votebymail)"

def fetch_articles(querystring):
  query = {"sort_by":"relevant","use_lucene_syntax":"true","query":querystring}
  response = requests.request("GET", url, headers=headers, params=query)
  return json.loads(response.text)

# print(fetch_articles(vbm_query)['articles'])

In [2]:
from datetime import date
from dateutil.relativedelta import relativedelta

def collect_artcicles_over_time(base_query, start_yr=2016, end_yr=2020, step=14):
  articles = []
  start = date(start_yr, 1, 1)
  while start.year < end_yr + 1:
    end = start + relativedelta(days=step)
    # Query for mail in voting articles published within this time period.
    querystring = base_query + f" AND date_published:[{start.isoformat()} TO {end.isoformat()}]"
    res = fetch_articles(querystring)
    if 'articles' in res.keys():
      articles.extend(res['articles'])
    # Update the start date so there's no overlap.
    start = end + relativedelta(days=+1)
  return articles

articles_18 = collect_artcicles_over_time(vbm_query, start_yr=2018, end_yr=2018)
articles_20 = collect_artcicles_over_time(vbm_query, start_yr=2020, end_yr=2020)

In [3]:
raw_articles = articles_18 + articles_20
print("num articles", len(raw_articles))
ids = set(map(lambda x: x["id"], raw_articles))
print("num unique ids", len(set(ids)))

num articles 4403
num unique ids 4403


## Save Article Data

In [15]:
from datetime import datetime

class Article:
    def __init__(self, article_dict):
        self.id = article_dict['id']
        self.title = article_dict['title'].replace(",", "")
        self.date_published = article_dict['date_published']
        self.url = article_dict['canonical_url']
        self.domain = article_dict['domain']
        self.num_tweets = article_dict['number_of_tweets']
        self.score = article_dict['score']
        self.site_type = article_dict['site_type']
    
    @classmethod
    def write_csv_col_names(cls, csv):
        csv.write(f"id,title,date_published,url,domain,num_tweets,score,site_type\n")
    
    def write_to_csv(self, csv):
        csv.write(f"{self.id},{self.title},{self.date_published},{self.url},{self.domain},{self.num_tweets},{self.score},{self.site_type}\n")
    
    def __str__(self):
        return f"Article({self.id}, {self.title}, {self.url})"
    
articles = [Article(a) for a in raw_articles]

In [16]:
article_data_path = "../data/all_articles.csv"

with open(article_data_path, "w") as f:
    Article.write_csv_col_names(f)
    for a in articles:
        a.write_to_csv(f)

## Read articles into a dataframe

In [41]:
import numpy as np
import pandas as pd

article_df = pd.read_csv(article_data_path)
article_df['date_published']= pd.to_datetime(article_df['date_published'])
article_df.head()

Unnamed: 0,id,title,date_published,url,domain,num_tweets,score,site_type
0,817272,Trump’s Porn Star Payoff Is The Final Nail In ...,2018-01-12 00:00:00+00:00,http://www.politicususa.com/2018/01/12/trump-1...,politicususa.com,377,8.927416,claim
1,818574,Journalist Murdered In Mexican Border City Aft...,2018-01-14 01:44:18+00:00,http://www.breitbart.com/texas/2018/01/13/mexi...,breitbart.com,190,6.58273,claim
2,811402,White House to Oprah: Bring It On!,2018-01-08 22:56:22+00:00,https://www.infowars.com/white-house-to-oprah-...,infowars.com,155,7.798293,claim
3,805188,Trump Forced To Shut Down Failed Voter Fraud C...,2018-01-03 19:56:09+00:00,http://www.politicususa.com/2018/01/03/trump-f...,politicususa.com,141,7.021348,claim
4,1124048,Dem strategist: My party’s leaders ‘blew it’ o...,2018-01-01 00:00:00+00:00,https://www.wnd.com/2018/10/dem-strategist-my-...,wnd.com,116,6.428014,claim


In [37]:
claim_df = article_df[article_df.site_type == 'claim']
fact_check_df = article_df[article_df.site_type == 'fact_checking']

print(f"Claim articles: {claim_df.shape[0]}")
print(f"Fact checking articles: {fact_check_df.shape[0]}")

Claim articles: 4021
Fact checking articles: 224


In [38]:
display(claim_df.sort_values('num_tweets',ascending=False)[:5])

Unnamed: 0,id,title,date_published,url,domain,num_tweets,score,site_type
4145,2168163,Detroit Absentee Ballot Counting Chaos Blocked...,2020-11-04T22:30:58.000Z,https://www.breitbart.com/politics/2020/11/04/...,breitbart.com,85074,107.874397,claim
3045,1794041,DOJ: Democrats Paid Pennsylvania Election Offi...,2020-05-21T17:15:16.000Z,https://www.breitbart.com/politics/2020/05/21/...,breitbart.com,80055,89.28273,claim
4146,2176377,Investigators Dispatched After Fulton County D...,2020-11-08T16:45:42.000Z,https://www.breitbart.com/2020-election/2020/1...,breitbart.com,33576,101.00396,claim
3945,2081743,Project Veritas Exposes Ilhan Omar Allies in A...,2020-09-28T02:20:56.000Z,https://www.breitbart.com/politics/2020/09/27/...,breitbart.com,18959,102.269173,claim
3946,2083787,Joe Biden's Texas Political Director Dallas Jo...,2020-09-28T23:16:26.000Z,https://www.thegatewaypundit.com/2020/09/joe-b...,thegatewaypundit.com,16590,98.643723,claim


In [39]:
display(fact_check_df.sort_values('num_tweets',ascending=False)[:5])

Unnamed: 0,id,title,date_published,url,domain,num_tweets,score,site_type
384,898436,Ted Cruz: Beto O'Rourke wants open border and ...,2018-03-15T00:00:00.000Z,https://www.politifact.com/texas/statements/20...,politifact.com,2802,16.74263,fact_checking
2746,1698377,Trump's Latest Voter Fraud Misinformation,2020-04-10T19:17:59.000Z,https://www.factcheck.org/2020/04/trumps-lates...,factcheck.org,2769,9.369649,fact_checking
3248,1858904,Trump's Absentee vs. Mail-In Ballot Spin,2020-06-19T15:02:00.000Z,https://www.factcheck.org/2020/06/trumps-absen...,factcheck.org,2464,105.958138,fact_checking
3054,1804595,Do Mail-In Ballots Increase Risk of Voter Fraud?,2020-05-26T22:22:14.000Z,https://www.snopes.com/fact-check/mail-in-ball...,snopes.com,715,42.956894,fact_checking
3648,1988227,How to make sure your ballot is counted this fall,2020-08-17T04:00:00.000Z,https://www.politifact.com/article/2020/aug/17...,politifact.com,600,100.23687,fact_checking
