In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd

# Fetch News Quotes and Labels
The function below scrapes the PolitiFact website and extracts the quote, author, date, context, categories and fact-checking staff for each political statement.

In [2]:
def _parse_statement(soup, label):
    """Extract one fact-check record from a parsed article page.

    Args:
        soup (BeautifulSoup): parsed HTML of a single fact-check article
        label (str): ruling label stored alongside the record

    Returns:
        record (dict): keys 'label', 'quote', 'context', 'author', 'date',
            'categories', 'staff'; or None when the page has no quote div

    """

    quote_tag = soup.find('div', {'class': 'm-statement__quote'})
    if quote_tag is None:
        # Without a quote there is nothing worth recording for this page
        return None

    # Fetch raw content from divs
    author_tag = soup.find('a', {'class': 'm-statement__name'})
    context_raw = str(soup.find('div', {'class': 'm-statement__desc'}))
    categories_raw = soup.findAll('li', {'class': 'm-list__item'})
    staff_raw = str(soup.findAll('div', {'class': 'm-author__content'}))

    # Clean up data a little; fields that cannot be located become 'unspecified'
    date_regex = re.search(r'on ([A-Za-z]+ \d{1,2}, \d{4}) in', context_raw)
    context_regex = re.search(r'\d{4} in?(.*)', context_raw)

    return {
        'label': label,
        'quote': quote_tag.text.strip(),
        'context': context_regex.group(1).strip().strip(':') if context_regex is not None else 'unspecified',
        'author': author_tag.text.strip() if author_tag is not None else 'unspecified',
        'date': date_regex.group(1) if date_regex is not None else 'unspecified',
        # The last 'm-list__item' entry is dropped, as in the original extraction
        'categories': ', '.join(re.findall(r'title=\"(.*)\">', str(categories_raw[:-1]))),
        'staff': ', '.join(re.findall(r'>(.*)</a>', staff_raw)),
    }


def FetchNews(label, page_start, page_end, timeout=30):
    """Function used to scrape the news articles page by page

    Args:
        label (str): 'true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire'
        page_start (int): Number of first page
        page_end (int): Page number to stop at (exclusive): pages
            page_start .. page_end - 1 are fetched, or fewer if a page
            cannot be found
        timeout (float): Seconds to wait for each HTTP response

    Returns:
        df_raw (DataFrame): dataframe containing the news quotes, one row per
            article, indexed 0..n-1, with columns 'label', 'quote', 'context',
            'author', 'date', 'categories', 'staff'

    """

    columns = ['label', 'quote', 'context', 'author', 'date', 'categories', 'staff']

    # Collect rows in a list and build the dataframe once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop
    records = []

    # Iterate through range of pages (or until last available)
    for page_num in range(page_start, page_end):

        # Fetch page = page_num
        response = requests.get(
            f'https://www.politifact.com/factchecks/list/?page={page_num}&ruling={label}',
            timeout=timeout,
        )
        listing = BeautifulSoup(response.text, 'html.parser')

        # If 'pfhead' class is found, it means the page couldn't be found
        if listing.find('div', {'class': 'pfhead'}) is not None:
            page_end = page_num
            break

        print(f'Fetching {label} news page {page_num}...')

        # Iterate through articles
        for article in listing.findAll('div', {'class': 'm-statement__quote'}):

            # Read the href from the parsed tag instead of a greedy regex
            link = article.find('a', href=True)
            if link is None:
                continue

            # Fetch article page
            response = requests.get(f"https://www.politifact.com{link['href']}", timeout=timeout)
            record = _parse_statement(BeautifulSoup(response.text, 'html.parser'), label)
            if record is not None:
                records.append(record)

            # Sleep for a few seconds, be nice to web servers :)
            pause = random.randint(3, 5)
            time.sleep(pause)

    df_raw = pd.DataFrame(records, columns=columns)

    # Note: the upper bound reported here is exclusive, like page_end itself
    print(f'Done! Fetched all {label} news from pages {page_start} to {page_end}.')
    return df_raw


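Since it takes some time to retrieve the data, it's best to separate by label and do the extraction in segments. The `page_start` and `page_end` parameters allow us to set the range of pages to collect; note that `page_end` is exclusive, so `FetchNews(label, 1, 3)` collects pages 1 and 2. Since the maximum number of pages changes as more news items are added with time, simply set `page_end` to a high value (e.g. `1000`) to collect everything; the function stops at the first page that cannot be found.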

## True News

In [3]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'true'.
true_df = FetchNews('true', 1, 3)

Fetching true news page 1...
Fetching true news page 2...
Done! Fetched all true news from page 1 to 3.


## Mostly True News

In [4]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'mostly-true'.
mostly_true_df = FetchNews('mostly-true', 1, 3)

Fetching mostly-true news page 1...
Fetching mostly-true news page 2...
Done! Fetched all mostly-true news from page 1 to 3.


## Half True News

In [5]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'half-true'.
half_true_df = FetchNews('half-true', 1, 3)

Fetching half-true news page 1...
Fetching half-true news page 2...
Done! Fetched all half-true news from page 1 to 3.


## Barely True News

In [6]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'barely-true'.
barely_true_df = FetchNews('barely-true', 1, 3)

Fetching barely-true news page 1...
Fetching barely-true news page 2...
Done! Fetched all barely-true news from page 1 to 3.


## False News

In [7]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'false'.
false_df = FetchNews('false', 1, 3)

Fetching false news page 1...
Fetching false news page 2...
Done! Fetched all false news from page 1 to 3.


## Pants on Fire News

In [8]:
# page_end is exclusive (range(page_start, page_end)): this scrapes list pages 1 and 2 for 'pants-fire'.
pants_fire_df = FetchNews('pants-fire', 1, 3)

Fetching pants-fire news page 1...
Fetching pants-fire news page 2...
Done! Fetched all pants-fire news from page 1 to 3.


# Export Tables

In [9]:
def exportDataFrame(df, filename):
    """Write a dataframe to a CSV file without its index.

    Args:
        df (DataFrame): dataframe to export; it is not modified
        filename (str): target path; '.csv' is appended only when the name
            does not already end with it, so 'true' and 'true.csv' both
            produce 'true.csv'

    """
    # Only add the extension when missing, otherwise 'true.csv' would be
    # written as 'true.csv.csv'
    if not filename.lower().endswith('.csv'):
        filename = filename + '.csv'

    df = df.reset_index(drop=True)
    df.to_csv(filename, index=False)

# Export each label's dataframe to its own file
for frame, target in [
    (true_df, 'true.csv'),
    (mostly_true_df, 'mostly-true.csv'),
    (half_true_df, 'half-true.csv'),
    (barely_true_df, 'barely-true.csv'),
    (false_df, 'false.csv'),
    (pants_fire_df, 'pants-fire.csv'),
]:
    exportDataFrame(frame, target)