## Create an evaluation dataset from PolitiFact data

In [96]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def scrape_urls(url, timeout=30):
    """Return the .gov links found in the sources section of a web page.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of href strings, in page order, whose network location ends
        in ``.gov``. An empty list is returned (and a message printed) when
        the request fails, the response status is not 200, or the page has
        no ``<section id="sources">`` element.
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server, and an unhandled connection error would abort a whole batch of
    # scrapes instead of just skipping this page.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to fetch the page:", exc)
        return []

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to fetch the page:", response.status_code)
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the section with id='sources'
    sources_section = soup.find('section', id='sources')
    if not sources_section:
        print("No section with id='sources' found on the page.")
        return []

    # Keep only the anchors that point at a .gov host
    gov_urls = []
    for link in sources_section.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchor without a usable href attribute
            continue
        try:
            netloc = urlparse(href).netloc
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 brackets): skip this link
            # rather than failing the whole page
            continue
        if netloc.endswith('.gov'):
            gov_urls.append(href)

    return gov_urls

In [97]:
import pandas as pd

# Location of the PolitiFact claims CSV
politifact_dataset_path = './data/entered_claims 3.csv'

# Load the claims table and pull out its 'url' column
politifact_data = pd.read_csv(politifact_dataset_path)
politifact_urls = politifact_data['url']

In [None]:
# Scrape each PolitiFact URL in order; entry k holds the links found for URL k
gov_links = list(map(scrape_urls, politifact_urls))

In [99]:
# Optionally, save the data to a JSON file

# import json

# with open('./data/gov_links.json', 'w') as f:
#     json.dump(gov_links, f)


# Optionally, load the data from a JSON file

# import json

# with open('./data/gov_links.json', 'r') as f:
#     gov_links = json.load(f)


In [100]:
import re

# Map the bill-type slug that appears in a congress.gov bill URL path
# (e.g. 'house-bill' in /bill/116th-congress/house-bill/1234) to the
# abbreviation written into the evaluation set (e.g. 'HR').
# Slugs that are not listed here are skipped when the source URLs are parsed.
bill_name_map = {
    'house-bill': 'HR',
    'senate-bill': 'S',
    'house-resolution': 'HRES',
    'senate-resolution': 'SRES',
    'house-concurrent-resolution': 'HCONRES',
    'senate-concurrent-resolution': 'SCONRES',
    'senate-joint-resolution': 'SJRES',
    'house-joint-resolution': 'HJRES',
}

# Matches the first run of digits in a path segment (e.g. '116' in '116th')
_FIRST_NUMBER = re.compile(r'\d+')


def _bill_reference_from_url(source_url):
    """Return '<congress> <type> <number>' for a congress.gov bill URL.

    The expected path shape is ``/bill/116th-congress/house-bill/1234[/...]``,
    which yields ``'116 HR 1234'``. ``None`` is returned for anything that
    does not match: another host, a non-bill page, a path that is too short,
    a bill type missing from ``bill_name_map``, or a segment with no digits.
    """
    try:
        parsed_url = urlparse(source_url)
    except ValueError:
        # Malformed URL; nothing to extract
        return None

    # For now, only supporting congress.gov URLs
    if 'congress.gov' not in parsed_url.netloc:
        return None

    # Normalize the path so it always starts with '/'; startswith() is safe
    # on an empty path, unlike indexing its first character
    path = parsed_url.path
    if not path.startswith('/'):
        path = '/' + path
    path_parts = path.split('/')

    # Need ['', 'bill', '<congress>', '<bill type>', '<bill number>', ...]
    if len(path_parts) < 5 or path_parts[1] != 'bill':
        return None

    # Congress number: first number before the '-' in e.g. '116th-congress'
    congress_match = _FIRST_NUMBER.search(path_parts[2].split('-')[0])
    if congress_match is None:
        return None

    # Bill type slug (e.g. 'house-bill'); may need to support more in the future
    bill_abbreviation = bill_name_map.get(path_parts[3])
    if bill_abbreviation is None:
        return None

    # Bill number: first number in the segment, e.g. '1234'
    number_match = _FIRST_NUMBER.search(path_parts[4])
    if number_match is None:
        return None

    return f'{congress_match.group(0)} {bill_abbreviation} {number_match.group(0)}'


# Evaluation set: one (politifact_url, bill_reference) pair per cited bill
eval_set = []

# gov_links was built positionally from politifact_urls, so pair the two by
# position rather than looking the URL up by index label
for politifact_url, poli_sources in zip(politifact_urls, gov_links):

    # Iterate over the .gov links scraped for the current PolitiFact URL
    for source_url in poli_sources:
        bill_reference = _bill_reference_from_url(source_url)

        # Skip links that are not recognisable congress.gov bill pages
        if bill_reference is None:
            continue

        eval_set.append((politifact_url, bill_reference))
            

In [101]:
# Collapse the (url, bill) pairs into one row per PolitiFact URL, with every
# bill cited for that URL gathered into a list.
# The columns are named up front so this also works when eval_set is empty;
# an unnamed empty frame has no column 0 to group on and raises KeyError.
grouped_bills = (
    pd.DataFrame(eval_set, columns=['url', 'sourced_bills'])
    .groupby('url')['sourced_bills']
    .agg(list)
    .reset_index()
)

In [106]:
# Attach the list of sourced bills to each claim. merge() defaults to an inner
# join, so only claims whose URL appears in grouped_bills are kept.
# The dropped columns are not wanted in the output; errors='ignore' keeps this
# cell from failing with KeyError when the input CSV lacks one of them
# (e.g. the auto-named 'Unnamed: 6').
eval_dataset = (
    politifact_data
    .merge(grouped_bills, on='url')
    .drop(
        columns=['Unnamed: 6', 'bills_found', 'actual_bills', 'intersection'],
        errors='ignore',
    )
)

In [107]:
# Write the evaluation dataset to disk, keeping the row index as the first column
eval_output_path = './data/politifact-claims.csv'
eval_dataset.to_csv(eval_output_path, index=True)