# Population Data for All States in the U.S.

In [1]:
# Scrapes the site World Population Review for US States data and dumps it into a dataframe
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser

In [2]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=True) # headless=True hides the chrome window pop-up

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\grace\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






In [3]:
# Ensure browser chrome window resets
# browser.quit()

In [4]:
# Set up the necessary variables

url = 'https://worldpopulationreview.com/states'

browser.visit(url)

html = browser.html

soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Use soup.find_all() to find data needed for all states.
states = soup.find_all('table')[0]
states

<table class="jsx-1487038798 table table-striped tp-table-body"><thead class="jsx-2642336383" style="font-weight: bold;"><tr class="jsx-2642336383"><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="rank" style="color: black; cursor: pointer; display: inline-block;">Rank  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="State" style="color: black; cursor: pointer; display: inline-block;">State  </a></span></th><th class="jsx-2816426159"><span class="jsx-2816426159"><a class="jsx-2816426159" data-field="Pop" style="color: black; cursor: pointer; display: inline-block;">2021 Pop. <svg fill="currentColor" height="1em" stroke="currentColor" stroke-width="0" viewbox="0 0 320 512" width="1em" xmlns="http://www.w3.org/2000/svg"><path d="M31.3 192h257.3c17.8 0 26.7 21.5 14.1 34.1L174.1 354.8c-7.8 7.8-20.5 7.8-28.3 0L17.2 226.1C4.6 213.5 13.5 192 31.3 192z"></path></svg> </a></span></th><th c

In [6]:
# Take only the 'tbody' from the 'table' content from the html, and store in the state_data variable
state_data = states.find('tbody')
state_data

<tbody class="jsx-2642336383"><tr><td>1</td><td><a href="/states/california-population">California</a></td><td>39,613,500</td><td><span style="color: green;">0.38%</span></td><td>39,461,600</td><td>37,319,500</td><td><span style="color: green;">6.15%</span></td><td>11.84%</td><td>254</td></tr><tr><td>2</td><td><a href="/states/texas-population">Texas</a></td><td>29,730,300</td><td><span style="color: green;">3.85%</span></td><td>28,628,700</td><td>25,242,000</td><td><span style="color: green;">17.78%</span></td><td>8.89%</td><td>114</td></tr><tr><td>3</td><td><a href="/states/florida-population">Florida</a></td><td>21,944,600</td><td><span style="color: green;">3.30%</span></td><td>21,244,300</td><td>18,845,500</td><td><span style="color: green;">16.44%</span></td><td>6.56%</td><td>409</td></tr><tr><td>4</td><td><a href="/states/new-york-population">New York</a></td><td>19,300,000</td><td><span style="color: red;">-1.18%</span></td><td>19,530,400</td><td>19,399,900</td><td><span style=

### _Below is to create and test variables using ONE state_

In [7]:
# Find first state's data
state_names = state_data.find_all('tr')
state = state_names[0]
state

<tr><td>1</td><td><a href="/states/california-population">California</a></td><td>39,613,500</td><td><span style="color: green;">0.38%</span></td><td>39,461,600</td><td>37,319,500</td><td><span style="color: green;">6.15%</span></td><td>11.84%</td><td>254</td></tr>

In [8]:
# Find first state's data title
href = state.find('a')['href']
href

'/states/california-population'

In [9]:
# Find first state's name
state_name = state.find('a').text
state_name

'California'

In [10]:
# Find first state's population
state_population = state.find_all('td')[2].text
state_population

'39,613,500'

In [11]:
# Find first state's population growth since previous year
state_pop_growth = state.find_all('td')[3].text
state_pop_growth

'0.38%'

In [12]:
# Find first state's population percentage of the United States population
state_us_percent = state.find_all('td')[7].text
state_us_percent

'11.84%'

In [13]:
# Find first state's population density by p/ml2
state_density = state.find_all('td')[8].text
state_density

'254'

## Create a data frame to store all states data. This will be our fundamental data frame.

In [14]:
# Create an empty list to store states data by row
state_rows = []

# Use a for loop to loop throuh the states:
for state in state_names:
    
    # Use Beautiful Soup's find() method to navigate and retrieve attributes 
    state_name = state.find('a').text
    href = state.find('a')['href']
    state_population = state.find_all('td')[2].text
    state_pop_growth = state.find_all('td')[3].text
    state_us_percent = state.find_all('td')[7].text
    state_density = state.find_all('td')[8].text
    
    # Create a dictionary to store the each state's data in a new row
    new_row={
        'state_name': state_name,
        'href': f"https://worldpopulationreview.com{href}",
        'state_population': state_population,
        'state_pop_growth': state_pop_growth,
        'state_us_percent': state_us_percent,
        'state_density': state_density
    }
    
    # Append/add in each state's data and column name to the state_rows list
    state_rows.append(new_row)

# Make the appended list to a new data frame, where all states' data are stored in here
states_df = pd.DataFrame(state_rows)
states_df.head()

Unnamed: 0,state_name,href,state_population,state_pop_growth,state_us_percent,state_density
0,California,https://worldpopulationreview.com/states/calif...,39613500,0.38%,11.84%,254
1,Texas,https://worldpopulationreview.com/states/texas...,29730300,3.85%,8.89%,114
2,Florida,https://worldpopulationreview.com/states/flori...,21944600,3.30%,6.56%,409
3,New York,https://worldpopulationreview.com/states/new-y...,19300000,-1.18%,5.77%,410
4,Pennsylvania,https://worldpopulationreview.com/states/penns...,12804100,0.02%,3.83%,286


In [15]:
# Editing data types to integers
states_df['state_population'] = states_df['state_population'].str.replace(r'\D', '').astype(int)

  states_df['state_population'] = states_df['state_population'].str.replace(r'\D', '').astype(int)


In [16]:
# Editing data types to integers
states_df['state_pop_growth'] = states_df['state_pop_growth'].str.replace(r'\D', '').astype(int)
states_df['state_pop_growth'] = states_df['state_pop_growth'] / 10000

  states_df['state_pop_growth'] = states_df['state_pop_growth'].str.replace(r'\D', '').astype(int)


In [17]:
# Editing data types to integers
states_df['state_us_percent'] = states_df['state_us_percent'].str.replace(r'\D', '').astype(int)
states_df['state_us_percent'] = states_df['state_us_percent'] / 10000

  states_df['state_us_percent'] = states_df['state_us_percent'].str.replace(r'\D', '').astype(int)


In [18]:
# Editing data types to integers
states_df['state_density'] = states_df['state_density'].str.replace(r'\D', '').astype(int)

  states_df['state_density'] = states_df['state_density'].str.replace(r'\D', '').astype(int)


### _Save the State Population Density data to a CSV file_

In [19]:
# Drop href column for state population data csv
states_population_data_df = states_df.drop(columns=['href'])

In [20]:
# Add 'pop_density_id' column to the data frame
states_population_data_df['pop_density_id'] = list(range(1,53))

# Merge states_population_data_df with states_id_df to add the 'state_id' column
states_id_df = pd.read_csv('States.csv',index_col=False)
states_population_data_df = states_population_data_df.merge(states_id_df, on='state_name')
states_population_data_df = states_population_data_df[['pop_density_id','state_id','state_name','state_population','state_pop_growth','state_us_percent','state_density']]
states_population_data_df

FileNotFoundError: [Errno 2] No such file or directory: 'States.csv'

In [None]:
states_population_data_df.to_csv('population_density.csv', index=False)

In [None]:
# Check column data types
states_population_data_df.dtypes

## Create a data frame to store the Population by Race data in

In [None]:
# Create an empty data frame to store the total Population by Race data from each state's website
population_byrace_df = pd.DataFrame({'state_name': [],'Race': [],'Population': [],'Percentage': [] })

# Use a for loop to loop to loop through each state's name and href, then find the Population Race data
for idx in range(len(states_df[['state_name', 'href']])):
    state_name, href = states_df[['state_name', 'href']].iloc[idx]
    tables = pd.read_html(href)
    population_byrace_df_temp = tables[0]
    population_byrace_df_temp['state_name'] = state_name
    
    # Appending/adding in each state's Population by Race data into the empty data frame
    population_byrace_df = population_byrace_df.append(population_byrace_df_temp)

population_byrace_df

In [None]:
# Editing data types to integers
population_byrace_df['Percentage'] = population_byrace_df['Percentage'].str.replace(r'\D', '').astype(int)
population_byrace_df['Percentage'] = population_byrace_df['Percentage'] / 10000

### _Save the Population by Race data to a CSV file_

In [None]:
# Rename columns for merge
population_byrace_df = population_byrace_df.rename(columns = {"Race":"state_pop_race",
                                       "Population":"state_pop_race_count",
                                       "Percentage":"state_pop_race_percentage"})
population_byrace_df

In [None]:
# Add 'race_pop_id' column to the data frame
population_byrace_df['race_pop_id'] = list(range(1,365))

# Merge population_byrace_df with states_id_df to add the 'state_id' column, and merge with race_id_df to add the 'race_id' column
states_id_df = pd.read_csv('States.csv',index_col=False)
population_byrace_df = population_byrace_df.merge(states_id_df, on='state_name')

race_id_df = pd.read_csv('race_id.csv',index_col=False)
population_byrace_df = population_byrace_df.merge(race_id_df, on='state_pop_race')

population_byrace_df = population_byrace_df[['race_pop_id','state_id','state_name','race_id','state_pop_race','state_pop_race_count','state_pop_race_percentage']]
population_byrace_df

In [None]:
# Create the Population by Race csv
population_byrace_df.to_csv('population_byrace.csv', index=False)

In [None]:
# Check column data types
population_byrace_df.dtypes

## Create a data frame to store the Educational Attainment data in

In [None]:
# Create an empty data frame to store the Educational Attainment data from each state's website
education_attainment_df = pd.DataFrame({'state_name': [],'Education Attained': [],'Count': [], 'Percentage': []})

# Use a for loop to loop to loop through each state's name and href, then find the Educational Attainment data
for idx in range(len(states_df[['state_name', 'href']])):
    state_name, href = states_df[['state_name', 'href']].iloc[idx]
    tables = pd.read_html(href)
    education_attainment_df_temp = tables[3]
    education_attainment_df_temp['state_name'] = state_name
    # Appending/adding in each state's Educational Attainment data into the empty data frame
    education_attainment_df = education_attainment_df.append(education_attainment_df_temp)
    
education_attainment_df

In [None]:
# Editing data types to integers
education_attainment_df['Percentage'] = education_attainment_df['Percentage'].str.replace(r'\D', '').astype(int)
education_attainment_df['Percentage'] = education_attainment_df['Percentage'] / 10000

### _Save the Educational Attainment data to a CSV file_

In [None]:
# Rename columns for merge
education_attainment_df = education_attainment_df.rename(columns = {"Education Attained":"state_edu_attained",
                                                                    "Count":"state_edu_attained_count",
                                                                    "Percentage":"state_edu_attained_percentage"})
education_attainment_df

In [None]:
# Add 'pop_edu_attain_id' column to the data frame
education_attainment_df['pop_edu_attain_id'] = list(range(1,365))

# Merge education_attainment_df with states_id_df to add the 'state_id' column, and merge with education_id_df to add the 'education_id' column

education_attainment_df = education_attainment_df.merge(states_id_df, on='state_name')

education_id_df = pd.read_csv('education_id.csv',index_col=False)
education_id_df_copy = education_id_df.drop(columns=['state_earning_edu'])

education_attainment_df = education_attainment_df.merge(education_id_df_copy, on='state_edu_attained')

education_attainment_df = education_attainment_df[['pop_edu_attain_id','state_id','state_name','state_edu_attained','state_edu_attained_count','state_edu_attained_percentage','education_id']]
education_attainment_df

In [None]:
# Create the Educational Attainment csv
education_attainment_df.to_csv('population_edu_attain.csv', index=False)

In [None]:
# Check column data types
education_attainment_df.dtypes

 ## Create a data frame to store the Earnings by Educational Attainment & Sex data in

In [None]:
# Create an empty data frame to store the Earnings by Educational Attainment % Sex data from each state's website
earnings_df = pd.DataFrame({'state_name': []})

# Use a for loop to loop to loop through each state's name and href, then find the Earnings data
for idx in range(len(states_df[['state_name', 'href']])):
    state_name, href = states_df[['state_name', 'href']].iloc[idx]
    tables = pd.read_html(href)
    earnings_df_temp = tables[5]
    earnings_df_temp['state_name'] = state_name
    # Appending/adding in each state's Earnings data into the empty data frame
    earnings_df = earnings_df.append(earnings_df_temp)
    
earnings_df

In [None]:
# Editing data types to integers
earnings_df['Average'] = earnings_df['Average'].str.replace(r'\D', '').astype(int)

In [None]:
# Editing data types to integers
earnings_df['Male'] = earnings_df['Male'].str.replace(r'\D', '').astype(int)

In [None]:
# Editing data types to integers
earnings_df['Female'] = earnings_df['Female'].str.replace(r'\D', '').astype(int)

### _Save the Earnings data to a CSV file_

In [None]:
# Rename columns for merge
earnings_df = earnings_df.rename(columns = {"Name":"state_earning_edu",
                                            "Average":"state_earning_average",
                                            "Male":"state_earning_male",
                                            "Female":"state_earning_female"})
earnings_df

In [None]:
# Dropped Overall Rows for scalability
earnings_df = earnings_df[earnings_df.state_earning_edu != "Overall"]
earnings_df

In [None]:
# Add 'pop_earnings_id' column to the data frame
earnings_df['pop_earnings_id'] = list(range(1,261))

# Merge earnings_df with states_id_df to add the 'state_id' column, and merge with education_id_df to add the 'education_id' column

earnings_df = earnings_df.merge(states_id_df, on='state_name')

education_id_df = pd.read_csv('education_id.csv',index_col=False)
education_id_df_copy_2 = education_id_df.drop(columns=['state_edu_attained'])

earnings_df = earnings_df.merge(education_id_df_copy_2, on='state_earning_edu')

earnings_df = earnings_df[['pop_earnings_id','state_id','state_name','state_earning_edu','state_earning_average','state_earning_male','state_earning_female','education_id']]
earnings_df

In [None]:
# Create the Earnings csv
earnings_df.to_csv('population_earnings.csv', index=False)

In [None]:
# Check column data types
earnings_df.dtypes