<a href="https://colab.research.google.com/github/jcorpac/politifact_predict/blob/master/PolitiFact%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Imports and metadata

In [0]:
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from os import path
import requests
import hashlib
import time

In [0]:
from google.colab import auth
# Authenticate the Colab user before any of the Google API calls made in
# later cells (Sheets via gspread, Cloud Storage via googleapiclient).
auth.authenticate_user()

import gspread
import gspread_dataframe as gs_df
from oauth2client.client import GoogleCredentials

# Google Cloud identifiers. bucket_name is the Cloud Storage bucket used by the
# download/upload cells; project_id is not referenced in the visible cells.
project_id = 'jcorpac'
bucket_name = 'politifact_prediction'

# Key of the Google Sheets spreadsheet, and the names of the two worksheets
# inside it that hold the raw scrape and the processed training data.
g_sheets_key = "1e0c0Cuv6yIg8Z30vfF8lvHVX9yv6y-ADu56In8ZZUk0"
g_sheets_raw_data_tag = "raw_data"
g_sheets_training_data_tag = "training_data"

# Object names inside the bucket for the raw and the training CSV files.
bucket_raw_data = "data_set/politifact_data.csv"
bucket_training_data = "data_set/training_data.csv"

In [0]:
# Upper bound that update_politifact_data_set() clamps end_page to. Update it
# to the last page number of the Politifact list before scraping everything.
TOTAL_PAGES = 566
# Column names (and order) of the DataFrame built by get_politifacts_page().
col_index = ['name', 'quote_desc', 'quote', 'link', 'date_line', 'rating', 'sha256']
# Local paths of the raw and training CSV files; both are read/written with
# '|' as the field separator.
local_raw_data = "./politifact_data.csv"
local_training_data = "./training_data.csv"

# **Data Scraping Functions**

## get_politifacts_page()

---
### Parameters
* page_num - The page number of Politifact's fact-check list for the function to retrieve. Politifact orders its list from newest to oldest, so the first page or two should be sufficient for updating an existing dataset.

---
### Returns
A Pandas DataFrame containing up to 30 rows (determined by the Politifact page). The columns in the dataframe indicate the following.

* name - The name of the person who made the quote.
* quote_desc - The place and date where the quote was made.
* quote - The actual quote. Sometimes preceded by connecting phrases such as "says" or "tweeted". These need to be cleared out in later data processing.
* link - The URL for the associated Politifact article.
* date_line - The name of the Politifact reporter and the date that the entry was made. Best place to parse out a date if sorting the data by date.
* rating - The rating that Politifact assigned for a given quote.
* sha256 - A unique hexadecimal identifier generated from the above columns. Can be used for duplicate entry detection and repeatable train/validation/test dataset splits.

In [0]:
def get_politifacts_page(page_num):
    """Scrape one page of Politifact's fact-check list into a DataFrame.

    Parameters
    ----------
    page_num
        Value substituted into the ``page`` query parameter of the
        fact-check list URL.

    Returns
    -------
    pandas.DataFrame
        One row per ``<article class="m-statement">`` element found on the
        page, with the columns listed in ``col_index``. When no such element
        is found the frame is empty but still has those columns.
    """
    page = requests.get(f"https://www.politifact.com/factchecks/list/?page={page_num}")
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect the rows in a plain list and build the frame once at the end.
    # The previous per-row DataFrame.append copied the whole frame on every
    # iteration and no longer exists in pandas 2.0+.
    rows = []
    for item in soup.find_all('article', class_="m-statement"):
        # Who (or what) is the quote attributed to
        name = item.find('a', class_="m-statement__name").get_text().strip()
        # Where was the quote made?
        quote_desc = item.find('div', class_="m-statement__desc").get_text().strip()
        # What is the quote, and which article does it link to
        quote = item.find('div', class_="m-statement__quote").find('a')
        link = f"https://politifact.com{quote['href'].strip()}"
        quote_text = quote.get_text().strip()
        # Date line with attribution, used for SHA-256 signature
        date_line = item.find('footer', class_="m-statement__footer").get_text().strip()
        # Rating - Label to be predicted. Might filter some of these out later
        rating = item.find('div', class_="m-statement__meter").find('picture').find('img')['alt']

        # SHA 256 hash used for identifying duplicate entries.
        # Will also be converted to an int for repeatable train/validation/test splits.
        sha256 = hashlib.sha256(f"{name}{quote_desc}{quote_text}{link}{date_line}{rating}".encode()).hexdigest()

        # Values are listed in the same order as col_index.
        values = (name, quote_desc, quote_text, link, date_line, rating, sha256)
        rows.append(dict(zip(col_index, values)))

    return pd.DataFrame(rows, columns=col_index)

## update_politifact_data_set()

---
### Suggested uses

*   `update_politifact_data_set()` - Creates a new data frame with the 30 most recent updates. Good for creating short sample test datasets.
*   `update_politifact_data_set(data_set=data_frame)` - Updates an existing data frame with 1 page of the most recent updates. Useful for daily updates.
*   `update_politifact_data_set(3, data_set=data_frame)` - Updates an existing data frame with 3 pages of the most recent updates. Increase the number to retrieve more pages if the data hasn't been updated for a while.
*    `update_politifact_data_set(TOTAL_PAGES)` - Scrapes the whole Politifact index. Make sure to update the TOTAL_PAGES variable first to indicate the last page on the site. 

---
### Parameters
* end_page - The index of the last page in the retrieval process. Default set to 1 to update from the latest 30 entries.
* start_page - The index of the first page in the retrieval process. Leave this field blank to start from the most recent entries.
* data_set - A Pandas Dataframe, the data set to be updated. If this field is blank, or None, a new DataFrame will be generated.

---
### Returns
A collection of pages in the format of the get_politifacts_page() function listed above, concatenated into one Pandas DataFrame. This data will have the existing data_set appended to it with the most recently posted data first.

In [0]:
def update_politifact_data_set(end_page=1, start_page=1, data_set=None):
    """Fetch a range of Politifact list pages and merge them into a data set.

    Parameters
    ----------
    end_page
        Last (oldest) page to fetch. Clamped to ``TOTAL_PAGES``.
    start_page
        First (newest) page to fetch.
    data_set
        Existing DataFrame to update, or ``None`` to start from scratch.

    Returns
    -------
    pandas.DataFrame
        Rows of pages ``start_page`` .. ``end_page`` in that order, followed
        by the rows of ``data_set``, with rows repeating an earlier
        ``sha256`` value removed.
    """
    end_page = min(end_page, TOTAL_PAGES)

    # Pages are requested from the oldest to the newest, as before.
    fetched_pages = []
    for page_number in tqdm(range(end_page, start_page - 1, -1)):
        fetched_pages.append(get_politifacts_page(page_num=page_number))
        # 3 second delay between page requests. Don't be rude and slam their server.
        time.sleep(3)

    # Newest page first, existing data last. A single concat replaces the
    # repeated DataFrame.append calls (removed in pandas 2.0).
    frames = fetched_pages[::-1]
    if data_set is not None:
        frames.append(data_set)
    if not frames:
        # Nothing fetched and nothing supplied: empty frame with the expected columns.
        return pd.DataFrame(columns=col_index)

    combined = pd.concat(frames, ignore_index=True)

    # Remove any duplicate entries
    return combined.drop_duplicates(subset="sha256")

# Load Politifact data from CSV or Google Sheets and Update

### Retrieve current csv data file from GCS Storage bucket

In [0]:
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build
from apiclient.http import MediaIoBaseDownload
# Client for the Cloud Storage JSON API (v1); reused by the upload cells below.
gcs_service = build('storage', 'v1')

# Download the raw-data CSV object from the bucket into the local file.
# NOTE(review): the local file is opened in 'wb' mode before the download
# starts, so a failed download leaves an empty or partial file behind that the
# next cell's path.exists() check will still accept.
with open(local_raw_data, 'wb') as f:
  # Download the file from a given Google Cloud Storage bucket.
  request = gcs_service.objects().get_media(bucket=bucket_name,
                                            object=bucket_raw_data)
  media = MediaIoBaseDownload(f, request)

  # Keep pulling chunks until next_chunk() reports completion.
  done = False
  while not done:
    # _ is a placeholder for a progress object that we ignore.
    # (Our file is small, so we skip reporting progress.)
    _, done = media.next_chunk()

### Read data from csv data file and update

In [7]:
# Start from the locally saved scrape when it exists; passing None makes
# update_politifact_data_set() build a fresh DataFrame instead.
data = pd.read_csv(local_raw_data, sep='|') if path.exists(local_raw_data) else None

# Merge in the most recent page of fact-checks (the function's defaults).
data = update_politifact_data_set(data_set=data)

# Write the refreshed data set back to the same '|'-separated file.
data.to_csv(local_raw_data, sep='|', header=True, index=False)

100%|██████████| 1/1 [00:03<00:00,  3.82s/it]


### Upload CSV data to Google Cloud Storage bucket

In [0]:
# Upload the refreshed raw-data CSV to the bucket as a resumable upload.
media = MediaFileUpload(local_raw_data, 
                        mimetype='text/plain',
                        resumable=True)

# Insert request that stores the file under the raw-data object name.
request = gcs_service.objects().insert(bucket=bucket_name, 
                                       name=bucket_raw_data,
                                       media_body=media)

# Send chunks until next_chunk() returns a response object instead of None.
response = None
while response is None:
  # _ is a placeholder for a progress object that we ignore.
  # (Our file is small, so we skip reporting progress.)
  _, response = request.next_chunk()

### Read data from Google Sheets and update

In [9]:
# Authorize gspread with the application-default credentials; `gc` is reused
# by the training-data cell further down.
gc = gspread.authorize(GoogleCredentials.get_application_default())
# Open the raw-data worksheet of the spreadsheet and load it as a DataFrame.
sheet = gc.open_by_key(g_sheets_key).worksheet(g_sheets_raw_data_tag)
data = gs_df.get_as_dataframe(sheet)
# Merge in the most recent page of fact-checks. This rebinds `data`, replacing
# the DataFrame loaded from the CSV file in the earlier cell.
data = update_politifact_data_set(data_set=data)
# Write the updated data set back to the same worksheet.
gs_df.set_with_dataframe(sheet, data)

100%|██████████| 1/1 [00:03<00:00,  3.27s/it]


# Pre-processing data

### Copy raw data set and extract model-relevant features

In [0]:
# Work on a copy so the raw data set stays untouched.
new_data_set = data.copy()
# Parse each hexadecimal SHA-256 digest into a Python integer.
new_data_set["sha256"] = new_data_set["sha256"].apply(lambda digest: int(digest, base=16))

In [0]:
import re

# Keep only the model-relevant columns. The explicit copy makes this an
# independent frame, so the column assignments and in-place drops below never
# operate on a view of new_data_set (no SettingWithCopyWarning).
politifact_data = new_data_set[["quote", "rating", "sha256"]].copy()

# Cast ratings to strings and remove case to merge False/false and True/true
# ratings. This is done before the flip filter so that case variants of the
# flip labels are filtered out as well.
politifact_data["rating"] = politifact_data["rating"].astype(str).str.lower()

# Rows with Flip-related labels are not relevant to the model.
flip_labels = ["full-flop", "half-flip", "no-flip"]
politifact_data = politifact_data[~politifact_data["rating"].isin(flip_labels)].copy()

# If the last digit in the converted SHA value is 0-7, label it for training data
# If the last digit is 8 or 9, label it for the test set
politifact_data["is_test"] = politifact_data["sha256"] % 10 >= 8

# Once we have the split, we don't need the SHA value anymore
politifact_data.drop(columns="sha256", inplace=True)

# Some duplicate quotes remain, remove them.
politifact_data.drop_duplicates(subset="quote", inplace=True)

# Remove connecting phrases from quotes unlikely to appear when model is in use.
# Only a phrase at the very start of the quote is stripped, and longer phrases
# are tried first. Previously "Says " was removed before "Says of " could ever
# match, and the phrases were also removed from the middle of quotes.
connecting_phrases = ["Says ", "Say ", "Tweeted ", "Quoted ", "Quotes ", "Says of "]
leading_phrase_pattern = "^(?:" + "|".join(
    re.escape(phrase) for phrase in sorted(connecting_phrases, key=len, reverse=True)
) + ")"
politifact_data["quote"] = politifact_data["quote"].str.replace(leading_phrase_pattern, "", regex=True)

### Update training data in Google Sheets spreadsheet and GCS Bucket

In [0]:
# Write the processed training frame to the training-data worksheet.
# `gc` is the gspread client authorized in the earlier Google Sheets cell.
training_sheet = gc.open_by_key(g_sheets_key).worksheet(g_sheets_training_data_tag)
gs_df.set_with_dataframe(training_sheet, politifact_data)

# Save the same frame as a '|'-separated CSV and upload it to the bucket as a
# resumable upload under the training-data object name.
politifact_data.to_csv(local_training_data, header=True, index=False, sep='|')
media = MediaFileUpload(local_training_data, 
                        mimetype='text/plain',
                        resumable=True)

request = gcs_service.objects().insert(bucket=bucket_name, 
                                       name=bucket_training_data,
                                       media_body=media)

# Send chunks until next_chunk() returns a response object instead of None.
response = None
while response is None:
  # _ is a placeholder for a progress object that we ignore.
  # (Our file is small, so we skip reporting progress.)
  _, response = request.next_chunk()

### Split training/test data and convert to numpy arrays for TensorFlow models

In [0]:
# Split on the is_test flag computed during pre-processing.
training_data, test_data = politifact_data[~politifact_data.is_test], politifact_data[politifact_data.is_test]

# One shared, sorted label vocabulary for both splits. Encoding each split
# with its own pd.get_dummies() call produced matrices whose columns could
# differ in number and order whenever a rating was missing from one split.
ratings = sorted(politifact_data.rating.unique())
rating_dtype = pd.CategoricalDtype(categories=ratings)

training_features = training_data.quote.to_numpy()
training_labels = training_data.rating
# Column i of the one-hot matrix corresponds to ratings[i].
training_labels_one_hot = pd.get_dummies(training_labels.astype(rating_dtype)).to_numpy()

test_features = test_data.quote.to_numpy()
test_labels = test_data.rating
test_labels_one_hot = pd.get_dummies(test_labels.astype(rating_dtype)).to_numpy()

In [0]:
# Converting ratings into numeric scores. May be used for regression model later.
rating_scores = {'pants-fire':0.0, 'false':0.2, 'barely-true':0.4, 'half-true':0.6, 'mostly-true':0.8, 'true':1.0}

training_labels_scores = training_labels.map(rating_scores).to_numpy()
test_labels_scores = test_labels.map(rating_scores).to_numpy()

# Experimental Code (WIP)