# Web Scrape Notebook
### The purpose of this notebook is to test the feasibility of scraping the voting results website using Beautiful Soup

In [37]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [38]:
url = "https://www.presidency.ucsb.edu/statistics/elections/2024"

In [39]:
# Make a GET request to fetch the raw HTML content
response = requests.get(url)

In [40]:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

In [41]:
# Find the table in the field-body div
table = soup.find('div', class_='field-body').find('table')
print(table)

<table>
<tbody>
<tr>
<td align="center" class="x176" colspan="11">
<table cellpadding="2" cellspacing="2" width="700">
<tbody>
<tr>
<td class="x176" colspan="2" rowspan="2"><strong>Party</strong></td>
<td align="center" class="x176" colspan="2"><strong>Nominees</strong></td>
<td align="center" class="x176" colspan="2" rowspan="2"><strong>Electoral Vote</strong></td>
<td align="center" class="x176" colspan="3" rowspan="2"><strong>Popular Vote</strong></td>
</tr>
<tr>
<td><strong>Presidential</strong></td>
<td><strong>Vice Presidential</strong></td>
</tr>
<tr>
<td><em> Democratic</em></td>
<td> </td>
<td>Kamala Harris</td>
<td>Tim Walz</td>
<td align="center" class="x176">226</td>
<td align="right" class="x176">42%</td>
<td align="right" class="x176">75,019,230</td>
<td align="right" class="x176">48.34%</td>
</tr>
<tr>
<td><em> Republican</em></td>
<td><img alt="" src="https://www.presidency.ucsb.edu/sites/default/files/wysiwyg_template_images/ic_check_circle_black2x.png" style="width: 2

In [42]:
# Extract headers
headers = []
header_row = table.find('thead').find_all('tr')

AttributeError: 'NoneType' object has no attribute 'find_all'

### It appears the headers are embedded within the table body instead of the thead


In [43]:
# Find the table in the field-body div
table = soup.find('div', class_='field-body').find('table')

In [44]:
# Extract table rows
rows = []
tbody = table.find('tbody')

In [45]:
for tr in tbody.find_all('tr'):
    # Get all td elements (including empty ones)
    tds = tr.find_all('td')
    
    # Extract only non-empty cells
    row_data = []
    for td in tds:
        cell_text = td.get_text(strip=True)
        # Only append non-empty cells
        if cell_text:
            row_data.append(cell_text)
    
    # State data rows have either 9 or 11 non-empty columns
    # 9 columns: when some EV values are missing (empty)
    # 11 columns: when all EV values are present (header row, totals row)
    if len(row_data) == 9 or len(row_data) == 11:
        rows.append(row_data)

print(rows)

[['STATE', 'TOTAL VOTES', 'Votes', '%', 'EV', 'Votes', '%', 'EV', 'Votes', '%', 'EV'], ['Alabama', '2,265,090', '772,412', '34.10%', '1,462,616', '64.57%', '9', '30,062', '1.33%'], ['Alaska', '338,177', '140,026', '41.41%', '184,458', '54.54%', '3', '13,693', '4.05%'], ['Arizona', '3,390,161', '1,582,860', '46.69%', '1,770,242', '52.22%', '11', '37,059', '1.09%'], ['Arkansas', '1,182,676', '396,905', '33.56%', '759,241', '64.20%', '6', '26,530', '2.24%'], ['California', '15,865,475', '9,276,179', '58.47%', '54', '6,081,697', '38.33%', '507,599', '3.20%'], ['Colorado', '3,192,745', '1,728,159', '54.13%', '10', '1,377,441', '43.14%', '87,145', '2.73%'], ['Connecticut', '1,759,010', '992,053', '56.40%', '7', '736,918', '41.89%', '30,039', '1.71%'], ['Delaware', '511,697', '289,758', '56.63%', '3', '214,351', '41.89%', '7,588', '1.48%'], ['District of Columbia', '325,869', '294,185', '90.28%', '3', '21,076', '6.47%', '10,608', '3.26%'], ['Florida', '10,893,752', '4,683,038', '42.99%', '6,1

In [46]:
# Process rows to ensure all have 11 columns
processed_rows = []
for row in rows:
    if len(row) == 9:
        # Insert empty strings for missing EV columns
        # Format: STATE, TOTAL, H_Votes, H_%, [H_EV], T_Votes, T_%, [T_EV], O_Votes, O_%, [O_EV]
        # We need to insert empty values at positions 4, 7, and 10 (for EVs)
        new_row = [
            row[0],  # STATE
            row[1],  # TOTAL_VOTES
            row[2],  # Harris_Votes
            row[3],  # Harris_Percent
            '',      # Harris_EV (empty)
            row[4],  # Trump_Votes
            row[5],  # Trump_Percent
            row[6],  # Trump_EV
            row[7],  # Others_Votes
            row[8],  # Others_Percent
            ''       # Others_EV (empty)
        ]
        processed_rows.append(new_row)
    else:
        processed_rows.append(row)

print(processed_rows)

[['STATE', 'TOTAL VOTES', 'Votes', '%', 'EV', 'Votes', '%', 'EV', 'Votes', '%', 'EV'], ['Alabama', '2,265,090', '772,412', '34.10%', '', '1,462,616', '64.57%', '9', '30,062', '1.33%', ''], ['Alaska', '338,177', '140,026', '41.41%', '', '184,458', '54.54%', '3', '13,693', '4.05%', ''], ['Arizona', '3,390,161', '1,582,860', '46.69%', '', '1,770,242', '52.22%', '11', '37,059', '1.09%', ''], ['Arkansas', '1,182,676', '396,905', '33.56%', '', '759,241', '64.20%', '6', '26,530', '2.24%', ''], ['California', '15,865,475', '9,276,179', '58.47%', '', '54', '6,081,697', '38.33%', '507,599', '3.20%', ''], ['Colorado', '3,192,745', '1,728,159', '54.13%', '', '10', '1,377,441', '43.14%', '87,145', '2.73%', ''], ['Connecticut', '1,759,010', '992,053', '56.40%', '', '7', '736,918', '41.89%', '30,039', '1.71%', ''], ['Delaware', '511,697', '289,758', '56.63%', '', '3', '214,351', '41.89%', '7,588', '1.48%', ''], ['District of Columbia', '325,869', '294,185', '90.28%', '', '3', '21,076', '6.47%', '10,6

In [47]:
# Define the column names
headers = ['STATE', 'TOTAL_VOTES', 'Harris_Votes', 'Harris_Percent', 'Harris_EV', 
               'Trump_Votes', 'Trump_Percent', 'Trump_EV', 'Others_Votes', 'Others_Percent', 'Others_EV']

In [50]:
# Create DataFrame
df = pd.DataFrame(processed_rows, columns=headers)
df.head()

Unnamed: 0,STATE,TOTAL_VOTES,Harris_Votes,Harris_Percent,Harris_EV,Trump_Votes,Trump_Percent,Trump_EV,Others_Votes,Others_Percent,Others_EV
0,STATE,TOTAL VOTES,Votes,%,EV,Votes,%,EV,Votes,%,EV
1,Alabama,2265090,772412,34.10%,,1462616,64.57%,9,30062,1.33%,
2,Alaska,338177,140026,41.41%,,184458,54.54%,3,13693,4.05%,
3,Arizona,3390161,1582860,46.69%,,1770242,52.22%,11,37059,1.09%,
4,Arkansas,1182676,396905,33.56%,,759241,64.20%,6,26530,2.24%,


In [None]:
# Filter out the header row
df = df[df['STATE'] != 'STATE']
df.head()

Unnamed: 0,STATE,TOTAL_VOTES,Harris_Votes,Harris_Percent,Harris_EV,Trump_Votes,Trump_Percent,Trump_EV,Others_Votes,Others_Percent,Others_EV
1,Alabama,2265090,772412,34.10%,,1462616,64.57%,9,30062,1.33%,
2,Alaska,338177,140026,41.41%,,184458,54.54%,3,13693,4.05%,
3,Arizona,3390161,1582860,46.69%,,1770242,52.22%,11,37059,1.09%,
4,Arkansas,1182676,396905,33.56%,,759241,64.20%,6,26530,2.24%,
5,California,15865475,9276179,58.47%,,54,6081697,38.33%,507599,3.20%,
