In [11]:
import requests
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd

# Fetch News Quotes and Labels
The function below scans the Politfact webpage and extracts different information related to political quotes.

In [12]:
def FetchNews(label, page_start, page_end, df=None):
    """Function used to scrape the news articles page by page
    
    Args:
        label (str): 'true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire'
        page_start (int): Number of first page
        page_end (int): Number of last page
        
    Returns:
        df_raw (DataFrame): dataframe containing the news quotes
    
    """
    if df is None:
        existing_quotes = []
    else:
        existing_quotes = list(df['quote'])
    
    # Initialize empty dataframe
    df_raw = pd.DataFrame()

    # Iterate through range of pages (or until last available)
    for page_num in range(page_start, page_end):
        
        # Fetch page = page_num, fetch all news articles
        html = requests.get(f'https://www.politifact.com/factchecks/list/?page={page_num}&ruling={label}')
        soup = BeautifulSoup(html.text, 'html.parser')
        articles = soup.findAll('div', {'class': 'm-statement__quote'})

        # If 'pfhead' class is found, it means the page couldn't be found; otherwise returns None
        error = soup.find('div', {'class': 'pfhead'})

        if error == None:
            
            print(f'Fetching {label} news page {page_num}...')

            # Iterate through articles
            for article in articles:
                
                # Fetch artcile page
                url = re.search(r'<a href="(.*)">', str(article)).group(1)
                html = requests.get(f'https://www.politifact.com{url}')
                soup = BeautifulSoup(html.text, 'html.parser')

                # Fetch raw content from divs
                quote_raw = soup.find('div', {'class': 'm-statement__quote'}).text
                author_raw = soup.find('a', {'class': 'm-statement__name'}).text
                context_raw = str(soup.find('div', {'class': 'm-statement__desc'}))
                categories_raw = soup.findAll('li', {'class': 'm-list__item'})
                staff_raw = str(soup.findAll('div', {'class': 'm-author__content'}))

                # Clean up data a little
                quote = quote_raw.strip()
                author = author_raw.strip()
                date_regex = re.search(r'on ([A-Za-z]+ \d{1,2}, \d{4}) in', context_raw)
                date = date_regex.group(1) if date_regex is not None else 'unspecified'
                context_regex = re.search(r'\d{4} in?(.*)', context_raw)
                context = context_regex.group(1).strip().strip(':') if context_regex is not None else 'unspecified'
                categories = ', '.join(re.findall(r'title=\"(.*)\">', str(categories_raw[:-1])))
                staff = ', '.join(re.findall(r'>(.*)</a>', staff_raw))
                
                if quote not in existing_quotes:
    
                    # Create row
                    row = pd.DataFrame({
                        'label': [label], 
                        'quote': [quote], 
                        'context': [context], 
                        'author': [author], 
                        'date': [date], 
                        'categories': [categories],
                        'staff': [staff]
                    })

                    # Append row to dataframe
                    df_raw = df_raw.append(row, ignore_index=True)

                    # Sleep for a few seconds, be nice to web servers :)
                    pause = random.randint(3, 5)
                    #print(f'Fetched news from page {page_num}, sleeping for {pause} seconds.')
                    time.sleep(pause)
                else:
                    print(f'Entry already exists! Stopping execution...')
                    print(f'Done! Updated dataset with {label} news from pages {page_start} to {page_num}.')
                    return(df_raw.append(df, ignore_index=True).reset_index())

        else:
            page_end = page_num
            break
    
    
    print(f'Done! Fetched all {label} news from pages {page_start} to {page_end}.')
    return(df_raw.reset_index())


Since it takes some time to retrieve the data, it's best to separate by label and do the extraction in segments. The `page_start` and `page_end` parameters allow us to set the range of pages to collect. Since the maximum number of pages changes as more news items are added with time, simply set `page_end` to a high value (ie. `1000`).

## True News

In [13]:
# true_df = FetchNews('true', 1, 3)
true_df = FetchNews('true', 1, 3, pd.read_csv('datasets/true.csv', dtype={'label':str}))

Fetching true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with true news from pages 1 to 1.


## Mostly True News

In [14]:
#mostly_true_df = FetchNews('mostly-true', 1, 3)
mostly_true_df = FetchNews('mostly-true', 1, 1000, pd.read_csv('datasets/mostly-true.csv', dtype={'label':str}))

Fetching mostly-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with mostly-true news from pages 1 to 1.


## Half True News

In [15]:
#half_true_df = FetchNews('half-true', 1, 3)
half_true_df = FetchNews('half-true', 1, 1000, pd.read_csv('datasets/half-true.csv', dtype={'label':str}))

Fetching half-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with half-true news from pages 1 to 1.


## Barely True News

In [16]:
#barely_true_df = FetchNews('barely-true', 1, 3)
barely_true_df = FetchNews('barely-true', 1, 1000, pd.read_csv('datasets/barely-true.csv', dtype={'label':str}))

Fetching barely-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with barely-true news from pages 1 to 1.


## False News

In [17]:
#false_df = FetchNews('false', 1, 3)
false_df = FetchNews('false', 1, 1000, pd.read_csv('datasets/false.csv', dtype={'label':str}))

Fetching false news page 1...
Fetching false news page 2...
Fetching false news page 3...
Entry already exists! Stopping execution...
Done! Updated dataset with false news from pages 1 to 3.


## Pants on Fire News

In [18]:
#pants_fire_df = FetchNews('pants-fire', 1, 3)
pants_fire_df = FetchNews('pants-fire', 1, 1000, pd.read_csv('datasets/pants-fire.csv', dtype={'label':str}))

Fetching pants-fire news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with pants-fire news from pages 1 to 1.


In [19]:
mostly_true_df.head(10)

Unnamed: 0,index,label,quote,context,author,date,categories,staff
0,0,mostly-true,“West Virginia is near last in the U.S.” ranki...,a tweet,Paula Jean Swearengin,"September 10, 2020","West Virginia, Environment, Children, Education","Morgan Akers, Rylan Toledo"
1,1,mostly-true,"Says Illinois has made a ""nation-leading"" inve...",a tweet,JB Pritzker,"October 13, 2020","Census, Illinois",Deborah Wilber
2,2,mostly-true,"""Mike Parson opposes protections for pre-exist...",a campaign ad,Nicole Galloway,"September 26, 2020","Health Care, Missouri",William Skipworth
3,3,mostly-true,"""I am for protecting pre-existing conditions. ...",a debate,Chris Jacobs,"October 21, 2020","Health Care, New York",Jill Terreri Ramos
4,4,mostly-true,Says U.S. Rep. Ann Wagner “voted five times ag...,a Facebook post,Jill Schupp,"September 16, 2020","Health Care, Missouri",Noah Crider
5,5,mostly-true,“Now with the COVID-19 … more people are depen...,an interview with the Washington Examiner,Joe Manchin,"August 19, 2020","West Virginia, Drugs, Public Health","Delaney Geiger, Julia Maltby"
6,6,mostly-true,Now with the COVID-19 … more people are depend...,an interview with the Washington Examiner,Joe Manchin,"August 19, 2020","West Virginia, Drugs, Public Health","Delaney Geiger, Julia Maltby"
7,7,mostly-true,An “anti-Black Lives Matter” flag replaced the...,Facebook post,Facebook posts,"October 24, 2020","National, Elections, Legal Issues, Wisconsin",Laura Schulte
8,8,mostly-true,Upton “voted a dozen times to kick thousands o...,a TV ad,Jon Hoadley,"October 13, 2020","Health Care, Michigan",Clara Hendrickson
9,9,mostly-true,"Says she is ranked ""one of the most bipartisan...",comments at a debate,Joni Ernst,"October 15, 2020","Bipartisanship, Candidate Biography, Iowa","Rachel Schilke, Rylee Wilson"


# Export Tables

In [21]:
def exportDataFrame(df, filename):
    df = df.reset_index(drop=True)
    df.to_csv(filename, index=False)

# Export dataframes
exportDataFrame(true_df, 'datasets/true.csv')
exportDataFrame(mostly_true_df, 'datasets/mostly-true.csv')
exportDataFrame(half_true_df, 'datasets/half-true.csv')
exportDataFrame(barely_true_df, 'datasets/barely-true.csv')
exportDataFrame(false_df, 'datasets/false.csv')
exportDataFrame(pants_fire_df, 'datasets/pants-fire.csv')