# Demographic Data for All States in the U.S.

In [8]:
# Scrapes the site World Population Review for state-level demographic data and dumps it into data frames
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser

In [10]:
# Start a splinter-controlled Chrome session. webdriver_manager supplies the
# path to a chromedriver binary that matches the installed Chrome.
executable_path = dict(executable_path=ChromeDriverManager().install())
# headless=False keeps the Chrome window visible; headless=True hides the pop-up
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\grace\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






In [24]:
# Set up the necessary variables

# Page whose table lists each state's registered-voter count and population
url = 'https://worldpopulationreview.com/state-rankings/number-of-registered-voters-by-state'

# Load the page in the splinter-controlled browser
browser.visit(url)

# Take the HTML of the page the browser currently has loaded
html = browser.html

# Parse the HTML with the 'html.parser' backend so it can be searched by tag
soup = BeautifulSoup(html, 'html.parser')

In [70]:
# Use soup.find_all() to find data needed for all states.
# The first <table> on the page is the one that holds the per-state rows.
states = soup.find_all('table')[0]
# Bare expression so the notebook displays the table markup
states

<table class="jsx-1487038798 table table-striped tp-table-body"><thead class="jsx-2642336383" style="font-weight: bold;"><tr class="jsx-2642336383"><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="State" style="color: black; cursor: pointer; display: inline-block;">State  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="totalRegistered" style="color: black; cursor: pointer; display: inline-block;">Number of Registered Voters  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="Pop" style="color: black; cursor: pointer; display: inline-block;">2021 Pop.  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="registeredPerc" style="color: black; cursor: pointer; display: inline-block;">% Registered  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a

In [96]:
# Take only the 'tbody' from the 'table' content from the html, and store in the state_data variable.
# This leaves out the 'thead' header row, so only the data rows remain.
state_data = states.find('tbody')
# Bare expression so the notebook displays the body markup
state_data

<tbody class="jsx-2642336383"><tr><td><a href="/states/alabama-population">Alabama</a></td><td>3,708,804</td><td>4,934,190</td><td>75.17%</td><td>11/4/2020</td></tr><tr><td><a href="/states/alaska-population">Alaska</a></td><td>597,319</td><td>724,357</td><td>82.46%</td><td>11/3/2020</td></tr><tr><td><a href="/states/arizona-population">Arizona</a></td><td>4,281,152</td><td>7,520,100</td><td>56.93%</td><td>11/4/2020</td></tr><tr><td><a href="/states/arkansas-population">Arkansas</a></td><td>1,755,775</td><td>3,033,950</td><td>57.87%</td><td>6/3/2020</td></tr><tr><td><a href="/states/california-population">California</a></td><td>22,047,448</td><td>39,613,500</td><td>55.66%</td><td>10/19/2020</td></tr><tr><td><a href="/states/colorado-population">Colorado</a></td><td>4,238,513</td><td>5,893,630</td><td>71.92%</td><td>11/1/2020</td></tr><tr><td><a href="/states/connecticut-population">Connecticut</a></td><td>2,375,537</td><td>3,552,820</td><td>66.86%</td><td>10/29/2019</td></tr><tr><td><a

#### The cells below create and test the extraction variables using ONE state

In [99]:
# Despite its name, state_names holds every <tr> row of the table body (one row per state)
state_names = state_data.find_all('tr')
# Work with the first row only while testing the extraction logic
state = state_names[0]
state

<tr><td><a href="/states/alabama-population">Alabama</a></td><td>3,708,804</td><td>4,934,190</td><td>75.17%</td><td>11/4/2020</td></tr>

In [100]:
# Relative link to the state's own page, read from the first <a> tag in the row
href = state.find('a')['href']
href

'/states/alabama-population'

In [102]:
# The third cell (index 2) is the population column; .text keeps it as a comma-formatted string
state_population = state.find_all('td')[2].text
state_population

'4,934,190'

In [104]:
# The state's display name is the text of the first <a> tag in the row
state_name = state.find('a').text
state_name

'Alabama'

### Create a data frame to store all states' data. This will be our base data frame.

In [120]:
# Accumulate one dictionary per state; each one becomes a row of the data frame
state_rows = []

# Visit every table row and pull out the pieces of interest
for state in state_names:

    # Look up the row's link and cells once, then read the attributes from them
    link = state.find('a')
    cells = state.find_all('td')
    state_name = link.text
    href = link['href']
    state_population = cells[2].text

    # Queue this state's record, expanding the relative link into a full URL
    state_rows.append({
        'state_name': state_name,
        'href': f"https://worldpopulationreview.com{href}",
        'state_population': state_population,
    })

# Turn the collected records into the data frame that holds every state's data
states_df = pd.DataFrame(state_rows)
states_df.head()

Unnamed: 0,state_name,href,state_population
0,Alabama,https://worldpopulationreview.com/states/alaba...,4934190
1,Alaska,https://worldpopulationreview.com/states/alask...,724357
2,Arizona,https://worldpopulationreview.com/states/arizo...,7520100
3,Arkansas,https://worldpopulationreview.com/states/arkan...,3033950
4,California,https://worldpopulationreview.com/states/calif...,39613500


#### Create a data frame to store the Education Attainment data in

In [119]:
# Collect the Education Attainment table from every state's page into one data frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and calling
# it inside a loop re-copies the accumulated rows on every iteration. The per-state
# tables are therefore gathered in a list and combined with a single pd.concat call.
education_attainment_frames = []

# Loop through each state's name and href directly instead of indexing by position
for state_name, href in zip(states_df['state_name'], states_df['href']):
    # pd.read_html fetches the page and returns the tables it can parse as a list
    tables = pd.read_html(href)
    # NOTE(review): index 3 is assumed to be the Education Attainment table on every
    # state page -- confirm, since a layout change would silently pick another table
    education_attainment_df_temp = tables[3]
    # Tag each row with its state so the rows stay identifiable after combining
    education_attainment_df_temp['state_name'] = state_name
    education_attainment_frames.append(education_attainment_df_temp)

if education_attainment_frames:
    # No ignore_index: each state's rows keep their own 0..n index, as repeated appends did.
    # NOTE(review): without the empty seed frame, column dtypes come solely from the
    # scraped tables (e.g. Count may stay integer rather than float) -- confirm that
    # nothing downstream relies on float counts.
    education_attainment_df = pd.concat(education_attainment_frames)
else:
    # pd.concat raises ValueError on an empty list, so fall back to an empty frame
    # that still carries the expected columns
    education_attainment_df = pd.DataFrame({'Education Attained': [], 'Count': [], 'Percentage': [], 'state_name': []})

education_attainment_df

Unnamed: 0,Education Attained,Count,Percentage,state_name
0,Less Than 9th Grade,142999.0,4.31%,Alabama
1,9th to 12th Grade,315923.0,9.51%,Alabama
2,High School Graduate,1022840.0,30.80%,Alabama
3,Some College,711028.0,21.41%,Alabama
4,Associates Degree,282316.0,8.50%,Alabama
...,...,...,...,...
2,High School Graduate,113535.0,29.07%,Wyoming
3,Some College,99677.0,25.52%,Wyoming
4,Associates Degree,43761.0,11.21%,Wyoming
5,Bachelors Degree,68269.0,17.48%,Wyoming


In [None]:
# education_attainment_df.to_csv('')