# Scrape state-level education and income data from IndexMundi

United States - Educational attainment - persons 25 years and over - percent high school graduate or higher by State
https://www.indexmundi.com/facts/united-states/quick-facts/all-states/percent-of-people-25-years-and-over-with-high-school-degree-or-higher#table


United States - Per capita income in past 12 months (in 2018 dollars), 2014-2018 by State
https://www.indexmundi.com/facts/united-states/quick-facts/all-states/income-per-capita#table

In [44]:
# import dependencies 
import requests
import pandas as pd
from bs4 import BeautifulSoup


In [39]:
# request
def get_page_html(url, timeout=30):
    """Download a page and return its raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as bytes (``Response.content``).

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.RequestException: Any other request failure
            (connection error, timeout, ...).
    """
    page = requests.get(url, timeout=timeout)
    # Fail here rather than handing an error page to the HTML parser.
    page.raise_for_status()
    return page.content

In [40]:
# Source pages: one table per metric, both keyed by state name.
hs_degree_url = "https://www.indexmundi.com/facts/united-states/quick-facts/all-states/percent-of-people-25-years-and-over-with-high-school-degree-or-higher"
income_url = "https://www.indexmundi.com/facts/united-states/quick-facts/all-states/income-per-capita"

# Download the raw markup of each page.
hs_degree_html = get_page_html(hs_degree_url)
income_degree_html = get_page_html(income_url)

In [41]:
def convert_string_to_number(number):
    """Turn a formatted numeric string into an ``int`` or a ``float``.

    Thousands separators (commas) and surrounding whitespace are removed
    first. The result is an ``int`` when the cleaned text parses as one,
    otherwise a ``float``.

    Raises:
        ValueError: The cleaned text is neither an int nor a float.
    """
    cleaned = number.replace(",", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        # Not an integer literal, so it has to parse as a float.
        return float(cleaned)
            
def convert_string_to_float(number):
    """Parse a formatted numeric string as a ``float``.

    Commas and surrounding whitespace are dropped before conversion.

    Raises:
        ValueError: The cleaned text is not a valid float.
    """
    cleaned = number.replace(",", "").strip()
    as_float = float(cleaned)
    return as_float


def convert_data_table_to_dict(html):
    """Parse the data table of a scraped page into a dict.

    The table is the first ``<table>`` inside ``<div id="tableTab">``. For
    every row that has at least two ``<td>`` cells, the first text node of
    the first cell becomes the key and the first text node of the second
    cell is converted with ``convert_string_to_number`` to become the value.

    Args:
        html: Page markup to parse.

    Returns:
        dict mapping the stripped first-column text (plain ``str``) to the
        ``int`` or ``float`` parsed from the second column.

    Raises:
        ValueError: The markup has no ``tableTab`` div or no table inside
            it, or a value cell does not contain a parsable number.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data_div = soup.find("div", {"id": "tableTab"})
    table = data_div.find("table") if data_div is not None else None
    if table is None:
        # Typically an error page or a changed layout; say so explicitly
        # instead of failing with an AttributeError on None.
        raise ValueError('no <table> found inside <div id="tableTab">')

    return_dict = {}
    for row in table.find_all("tr"):
        cells = row.find_all("td")

        # Heading rows have no <td> cells; a row with fewer than two
        # cells cannot supply a key/value pair either.
        if len(cells) < 2:
            continue

        key = cells[0].find(string=True)
        value = cells[1].find(string=True)
        if key is None or value is None:
            # A cell with no text carries no data for this row.
            continue

        # str() detaches the key from the parse tree (find() returns a
        # NavigableString); strip() drops markup indentation whitespace.
        return_dict[str(key).strip()] = convert_string_to_number(value)

    return return_dict
    
    

# Parse each downloaded page into a {state: value} mapping.
hs_degree_data = convert_data_table_to_dict(hs_degree_html)
income_data = convert_data_table_to_dict(income_degree_html)

In [42]:
# Show both parsed mappings, one per line.
print(hs_degree_data, income_data, sep="\n")

{'Alabama': 85.8, 'Alaska': 92.7, 'Arizona': 86.8, 'Arkansas': 86.2, 'California': 82.9, 'Colorado': 91.4, 'Connecticut': 90.5, 'Delaware': 89.8, 'District of Columbia': 90.6, 'Florida': 88.0, 'Georgia': 86.7, 'Hawaii': 91.8, 'Idaho': 90.6, 'Illinois': 88.9, 'Indiana': 88.6, 'Iowa': 92.0, 'Kansas': 90.7, 'Kentucky': 85.7, 'Louisiana': 84.8, 'Maine': 92.3, 'Maryland': 90.0, 'Massachusetts': 90.4, 'Michigan': 90.5, 'Minnesota': 93.0, 'Mississippi': 83.9, 'Missouri': 89.6, 'Montana': 93.2, 'Nebraska': 91.1, 'Nevada': 86.3, 'New Hampshire': 92.9, 'New Jersey': 89.5, 'New Mexico': 85.3, 'New York': 86.5, 'North Carolina': 87.4, 'North Dakota': 92.5, 'Ohio': 90.1, 'Oklahoma': 87.8, 'Oregon': 90.4, 'Pennsylvania': 90.2, 'Rhode Island': 88.0, 'South Carolina': 87.1, 'South Dakota': 91.7, 'Tennessee': 87.0, 'Texas': 83.2, 'Utah': 92.0, 'Vermont': 92.6, 'Virginia': 89.3, 'Washington': 91.1, 'West Virginia': 86.5, 'Wisconsin': 91.9, 'Wyoming': 92.9}
{'Alabama': 26846, 'Alaska': 35874, 'Arizona': 

In [45]:
# Join the two mappings on state name and export them as one table.
# Row order follows hs_degree_data; every state in it must also be a key
# of income_data, otherwise the lookup raises KeyError.
states = list(hs_degree_data)
degree_rates = list(hs_degree_data.values())
incomes = [income_data[state] for state in states]

multi_array = zip(states, degree_rates, incomes)
column_names = ["state", "percent_hs_grad", "per_capita_income"]
df = pd.DataFrame(multi_array, columns=column_names)
df.to_csv("state_table.csv", index=False)
