# Web Scraper
In order to fetch the most up-to-date news quotes along with the labels, we will scrape the information from Politifact.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd

## Fetch News Quotes and Labels
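The function below scans the Politifact fact-check listing pages and, for each political quote, extracts the quote together with its related information (author, context, date, categories and fact-checking staff).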

In [2]:
def FetchNews(label, page_start, page_end, df=None):
    """Function used to scrape the news articles page by page

    Every listing page of the requested ruling is fetched, then each statement
    on it is opened individually to extract the quote and its metadata.
    Scraping stops early when a listing page does not exist or, if `df` is
    given, as soon as a quote that is already in `df` is encountered.

    Args:
        label (str): 'true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire'
        page_start (int): Number of first page
        page_end (int): Page number to stop at. It is used as the upper bound
            of `range`, so the last page requested is `page_end - 1`
        df (DataFrame, optional): previously scraped quotes containing at least
            the exported columns. When given, its rows are always kept and
            placed after the newly scraped rows in the result.

    Returns:
        DataFrame: dataframe containing the news quotes, with the columns
            label, quote, context, author_id, author_name, date, categories
            and staff. It is empty (but has those columns) when nothing was
            scraped and `df` is None.

    """
    # A set keeps the "already scraped?" check cheap even for large datasets
    existing_quotes = set() if df is None else set(df['quote'])

    export_vars = ['label', 'quote', 'context', 'author_id', 'author_name', 'date', 'categories', 'staff']

    # Rows are collected as plain dicts and turned into a dataframe once at the
    # end: DataFrame.append was removed in pandas 2.0 and growing a dataframe
    # row by row is quadratic anyway
    records = []

    def build_result():
        """Combine the freshly scraped rows with the rows of `df` (new rows first)."""
        new_rows = pd.DataFrame(records, columns=export_vars)
        if df is None:
            return new_rows
        if new_rows.empty:
            return df.loc[:, export_vars].reset_index(drop=True)
        return pd.concat([new_rows, df], ignore_index=True).loc[:, export_vars]

    # Iterate through range of pages (or until last available)
    for page_num in range(page_start, page_end):

        # Fetch page = page_num, fetch all news articles
        listing_html = requests.get(f'https://www.politifact.com/factchecks/list/?page={page_num}&ruling={label}')
        listing_soup = BeautifulSoup(listing_html.text, 'html.parser')
        articles = listing_soup.find_all('div', {'class': 'm-statement__quote'})

        # If 'pfhead' class is found, it means the page couldn't be found
        if listing_soup.find('div', {'class': 'pfhead'}) is not None:
            page_end = page_num
            break

        print(f'Fetching {label} news page {page_num}...')

        # Iterate through articles
        for article in articles:

            # Fetch article page
            url = re.search(r'<a href="(.*)">', str(article)).group(1)
            article_html = requests.get(f'https://www.politifact.com{url}')
            article_soup = BeautifulSoup(article_html.text, 'html.parser')

            quote = article_soup.find('div', {'class': 'm-statement__quote'}).text.strip()

            # The first already-known quote ends the whole run; this relies on
            # the listing showing the newest fact-checks first -- TODO confirm
            if quote in existing_quotes:
                print(f'Entry already exists! Stopping execution...')
                print(f'Done! Updated dataset with {label} news from pages {page_start} to {page_num}.')
                return build_result()

            # Fetch raw content from divs
            author_id_raw = article_soup.find('div', {'class': 'm-statement__meta'}).find('a', href=True)['href']
            author_name_raw = article_soup.find('a', {'class': 'm-statement__name'}).text
            context_raw = str(article_soup.find('div', {'class': 'm-statement__desc'}))
            categories_raw = article_soup.find_all('li', {'class': 'm-list__item'})
            staff_raw = str(article_soup.find_all('div', {'class': 'm-author__content'}))

            # Clean up data a little
            author_id = re.search(r'/personalities/(.*)/', author_id_raw).group(1).strip()
            author_name = author_name_raw.strip()
            date_regex = re.search(r'on ([A-Za-z]+ \d{1,2}, \d{4})( in|:)', context_raw)
            date = date_regex.group(1) if date_regex is not None else 'unspecified'
            context_regex = re.search(r'\d{4} in?(.*)', context_raw)
            context = context_regex.group(1).strip().strip(':') if context_regex is not None else 'unspecified'
            # The last list item is left out of the categories
            categories = ', '.join(re.findall(r'title=\"(.*)\">', str(categories_raw[:-1])))
            staff = ', '.join(re.findall(r'>(.*)</a>', staff_raw))

            records.append({
                'label': label,
                'quote': quote,
                'context': context,
                'author_id': author_id,
                'author_name': author_name,
                'date': date,
                'categories': categories,
                'staff': staff,
            })

            # Sleep for a few seconds, be nice to web servers :)
            time.sleep(random.randint(2, 4))

    print(f'Done! Fetched all {label} news from pages {page_start} to {page_end}.')
    return build_result()


Since it takes some time to retrieve the data, it's best to separate by label and do the extraction in segments. The `page_start` and `page_end` parameters allow us to set the range of pages to collect. Since the maximum number of pages changes as more news items are added with time, simply set `page_end` to a high value (e.g. `1000`): scraping stops automatically at the first page that does not exist.

In [3]:
# Folder holding one CSV of quotes per ruling (e.g. data/true.csv)
data_path = 'data/'
# Folder holding the personalities table (metadata/personalities.csv)
metadata_path = 'metadata/'

### True News

In [4]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#true_df = FetchNews('true', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
true_df = FetchNews('true', 1, 1000, pd.read_csv(data_path + 'true.csv', dtype={'label':str}))

Fetching true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with true news from pages 1 to 1.


In [5]:
# Preview the first rows of the 'true' dataset returned above
true_df.head()

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff
0,True,When Donald Trump lost the Iowa caucus to Ted ...,tweets,tweets,Tweets,"November 18, 2020","Elections, Iowa",Eleanor Hildebrandt
1,True,"""We heard from the Department of Homeland Secu...",a TV interview,tammy-baldwin,Tammy Baldwin,"November 15, 2020","Criminal Justice, Elections, States, Wisconsin",Madeline Heim
2,True,"""I’ve released 22 years of my tax returns. You...",a rally,joe-biden,Joe Biden,"October 31, 2020","National, Candidate Biography, Ethics, Taxes",Bill McCarthy
3,True,"Farm bankruptcies are ""at an eight-year high.""",comments at a campaign rally,theresa-greenfield,Theresa Greenfield,"October 30, 2020","Agriculture, Iowa",Lyle Muller
4,True,Says Dan Forest has “missed almost half of the...,a debate,roy-cooper,Roy Cooper,"October 14, 2020","Education, North Carolina, Coronavirus",Paul Specht


### Mostly True News

In [6]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#mostly_true_df = FetchNews('mostly-true', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
mostly_true_df = FetchNews('mostly-true', 1, 1000, pd.read_csv(data_path + 'mostly-true.csv', dtype={'label':str}))

Fetching mostly-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with mostly-true news from pages 1 to 1.


### Half True News

In [7]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#half_true_df = FetchNews('half-true', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
half_true_df = FetchNews('half-true', 1, 1000, pd.read_csv(data_path + 'half-true.csv', dtype={'label':str}))

Fetching half-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with half-true news from pages 1 to 1.


### Barely True News

In [8]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#barely_true_df = FetchNews('barely-true', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
barely_true_df = FetchNews('barely-true', 1, 1000, pd.read_csv(data_path + 'barely-true.csv', dtype={'label':str}))

Fetching barely-true news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with barely-true news from pages 1 to 1.


### False News

In [9]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#false_df = FetchNews('false', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
false_df = FetchNews('false', 1, 1000, pd.read_csv(data_path + 'false.csv', dtype={'label':str}))

Fetching false news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with false news from pages 1 to 1.


### Pants on Fire News

In [10]:
# First run (no CSV on disk yet): use this call instead to fetch everything from scratch
#pants_fire_df = FetchNews('pants-fire', 1, 1000)
# Incremental run: pass the saved quotes so that scraping stops at the first quote
# already on file; the label column is read as text
pants_fire_df = FetchNews('pants-fire', 1, 1000, pd.read_csv(data_path + 'pants-fire.csv', dtype={'label':str}))

Fetching pants-fire news page 1...
Entry already exists! Stopping execution...
Done! Updated dataset with pants-fire news from pages 1 to 1.


## Fetch Personalities
The function below scans the Politifact Personalities webpage and extracts information about each personality: name, affiliation, description and link.

In [11]:
def FetchPersonalities(index_start=None, index_end=None, df=None):
    """Function used to scrape the personalities

    The personalities index page is read once; for every listed personality
    that is not already in `df`, its own page is fetched to collect the
    description and link.

    Args:
        index_start (int, optional): position (in the index page listing) of
            the first personality to process. Defaults to the first one.
        index_end (int, optional): position to stop at, used as an exclusive
            slice bound. Defaults to the end of the listing.
        df (DataFrame, optional): previously scraped personalities with an
            'author_id' column. Personalities found in it are not fetched
            again. The dataframe passed in is not modified.

    Returns:
        DataFrame: one row per personality (author_id, author_name,
            affiliation, description, link), ordered as on the index page.
            Rows of `df` whose author_id was not processed in this run are
            placed last.

    """
    existing_personalities = set() if df is None else set(df['author_id'])

    # Order in which the personalities appear on the index page (for sorting later)
    sorter = []
    # New rows are gathered here and converted to a dataframe once at the end
    # (DataFrame.append was removed in pandas 2.0)
    records = []

    # Fetch the index page listing all personalities
    html = requests.get('https://www.politifact.com/personalities/')
    soup = BeautifulSoup(html.text, 'html.parser')
    subjects = soup.find_all('div', {'class': 'c-chyron'})

    # Iterate through personalities; a None bound means "from the start" / "to the end"
    for subject in subjects[index_start:index_end]:

        # Fetch and clean personality and affiliation
        value_div = subject.find('div', {'class': 'c-chyron__value'})
        author_id_raw = value_div.find('a', href=True)['href']
        author_name_raw = value_div.text
        affiliation_raw = subject.find('div', {'class': 'c-chyron__subline'}).text

        author_id = re.search(r'/personalities/(.*)/', author_id_raw).group(1).strip()
        author_name = re.sub(' +', ' ', author_name_raw.strip())
        affiliation = re.sub(' +', ' ', affiliation_raw.strip())
        sorter.append(author_id)

        if author_id in existing_personalities:
            continue

        print(f'Adding {author_id}')

        # Fetch personality page
        url = re.search(r'<a href="(.*)">', str(subject)).group(1)
        page_html = requests.get(f'https://www.politifact.com{url}')
        page_soup = BeautifulSoup(page_html.text, 'html.parser')

        # Description and link stay empty when the page is missing ('pfhead'
        # div present) or when the page lacks the corresponding element
        description = ""
        link = ""
        if page_soup.find('div', {'class': 'pfhead'}) is None:
            description_raw = page_soup.find('div', {'class': 'm-pageheader__body'})
            if description_raw is not None:
                description = description_raw.text.strip()
            link_raw = page_soup.find('footer', {'class': 'm-pageheader__footer'})
            link_match = re.search(r' href="(.*?)"', str(link_raw))
            if link_match is not None:
                link = link_match.group(1)

        records.append({
            'author_id': author_id,
            'author_name': author_name,
            'affiliation': affiliation,
            'description': description,
            'link': link,
        })

        # Sleep for a few seconds, be nice to web servers :)
        time.sleep(random.randint(2, 4))

    new_rows = pd.DataFrame(
        records,
        columns=['author_id', 'author_name', 'affiliation', 'description', 'link'],
    )
    if df is None:
        df_raw = new_rows
    elif new_rows.empty:
        df_raw = df
    else:
        df_raw = pd.concat([df, new_rows], ignore_index=True)

    # Sort data the way it is presented originally. `assign` works on a copy,
    # so the dataframe supplied by the caller is never altered.
    sorterIndex = dict(zip(sorter, range(len(sorter))))
    df_sorted = (
        df_raw
        .assign(author_rank=df_raw['author_id'].map(sorterIndex))
        .sort_values(by='author_rank')
        .drop(columns='author_rank')
    )

    print('Done fetching personalities!')
    return df_sorted.reset_index(drop=True)

In [12]:
# If running for first time, simply remove argument df
#personalities_df = FetchPersonalities()
personalities_df = FetchPersonalities(df=pd.read_csv(metadata_path + 'personalities.csv'))

Done fetching personalities!


## Export Tables

In [13]:
def exportDataFrame(df, filename):
    """Write a dataframe to a CSV file without its row index

    Args:
        df (DataFrame): dataframe to export; it is not modified
        filename (str): path of the CSV file to write
    Returns:
        None.

    """
    # Renumber the rows on a fresh frame, then write it out leaving the index column off
    renumbered = df.reset_index(drop=True)
    renumbered.to_csv(filename, index=False)

In [14]:
# Export quotes dataframes, one CSV per ruling. Every dataset refreshed above is
# written back, including the 'true' one, so that newly fetched quotes are kept.
exportDataFrame(true_df, data_path + 'true.csv')
exportDataFrame(mostly_true_df, data_path + 'mostly-true.csv')
exportDataFrame(half_true_df, data_path + 'half-true.csv')
exportDataFrame(barely_true_df, data_path + 'barely-true.csv')
exportDataFrame(false_df, data_path + 'false.csv')
exportDataFrame(pants_fire_df, data_path + 'pants-fire.csv')

In [15]:
# Export the personalities dataframe to metadata/personalities.csv, the same file
# that is read back when FetchPersonalities is given a df argument
exportDataFrame(personalities_df, metadata_path + 'personalities.csv')