In [126]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import plotly.express as px
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
import plotly.graph_objects as go
from plotly.offline import plot


# Locations of the three input CSV files, relative to the notebook's folder
urban_csv = Path("..", "datasets", "filtered_urban_rural_data.csv")
vaccination_csv = Path("..", "datasets", "cleaned_covid19_vaccinations.csv")
deaths_csv = Path("..", "datasets", "death_rates.csv")

In [57]:
# Load each source CSV into its own DataFrame (deaths, vaccinations, urban share)
death_rates_df, vaccination_df, urban_df = (
    pd.read_csv(csv_path)
    for csv_path in (deaths_csv, vaccination_csv, urban_csv)
)

# Show the vaccination table as this cell's output
vaccination_df

Unnamed: 0,State,Total doses administered by jurisdiction,Residents with a completed primary series,Percent of total pop with a completed primary series,Residents 18+ with a completed primary series,Percent of 18+ pop with a completed primary series,Residents 65+ with a completed primary series,Percent of 65+ pop with a completed primary series
0,United States,676728782,230637348,69.5,204327579,79.1,51708613,94.4
1,Alaska,1328221,477592,65.3,418608,75.9,81999,89.5
2,Alabama,7018011,2611593,53.3,2416344,63.3,725159,85.3
3,Arkansas,4874091,1720209,57.0,1547724,66.8,440981,84.2
4,American Samoa,115281,42495,89.7,29952,95.0,2996,91.4
5,Arizona,14647405,4821350,66.2,4239941,75.2,1188267,90.8
6,California,88487852,29588939,74.9,25681179,83.9,5441414,93.2
7,Colorado,13033446,4248431,73.8,3729317,82.9,818078,95.0
8,Connecticut,9040989,2967081,83.2,2602449,91.7,654355,95.0
9,District of Columbia,2137377,644085,91.3,569871,95.0,95949,95.0


In [58]:
# Keep only the state name and the 18+ completed-primary-series percentage
filtered_vaccination_df = vaccination_df.loc[
    :,
    ['State', 'Percent of 18+ pop with a completed primary series'],
]
filtered_vaccination_df

Unnamed: 0,State,Percent of 18+ pop with a completed primary series
0,United States,79.1
1,Alaska,75.9
2,Alabama,63.3
3,Arkansas,66.8
4,American Samoa,95.0
5,Arizona,75.2
6,California,83.9
7,Colorado,82.9
8,Connecticut,91.7
9,District of Columbia,95.0


In [59]:
# Keep only the state name and its death-rate column
filtered_death_rates_df = death_rates_df.loc[:, ['STATE', 'RATE']]
filtered_death_rates_df

Unnamed: 0,STATE,RATE
0,Alabama,152.8
1,Alaska,109.5
2,Arizona,139.5
3,Arkansas,127.7
4,California,99.9
5,Colorado,84.2
6,Connecticut,56.7
7,Delaware,83.2
8,Florida,111.7
9,Georgia,135.9


In [60]:
# Keep only the state name and its 2020 urban-population percentage
filtered_urban_df = urban_df.loc[:, ['STATE NAME', '2020 PCT URBAN POP']]
filtered_urban_df

Unnamed: 0,STATE NAME,2020 PCT URBAN POP
0,Alabama,57.7
1,Alaska,64.9
2,Arizona,89.3
3,Arkansas,55.5
4,California,94.2
5,Colorado,86.0
6,Connecticut,86.3
7,Delaware,82.6
8,District of Columbia,100.0
9,Florida,91.5


In [61]:
# Combine the three filtered tables on state name.
# Both joins are inner joins, so only states present in every table survive.
merged_df = pd.merge(
    filtered_urban_df,
    filtered_death_rates_df,
    how='inner',
    left_on='STATE NAME',
    right_on='STATE',
)
full_merged_df = pd.merge(
    merged_df,
    filtered_vaccination_df,
    how='inner',
    left_on='STATE NAME',
    right_on='State',
)
full_merged_df

Unnamed: 0,STATE NAME,2020 PCT URBAN POP,STATE,RATE,State,Percent of 18+ pop with a completed primary series
0,Alabama,57.7,Alabama,152.8,Alabama,63.3
1,Alaska,64.9,Alaska,109.5,Alaska,75.9
2,Arizona,89.3,Arizona,139.5,Arizona,75.2
3,Arkansas,55.5,Arkansas,127.7,Arkansas,66.8
4,California,94.2,California,99.9,California,83.9
5,Colorado,86.0,Colorado,84.2,Colorado,82.9
6,Connecticut,86.3,Connecticut,56.7,Connecticut,91.7
7,Delaware,82.6,Delaware,83.2,Delaware,83.2
8,Florida,91.5,Florida,111.7,Florida,79.6
9,Georgia,74.1,Georgia,135.9,Georgia,67.7


In [62]:
# Drop the duplicate state-name columns left over from the merges
# ('STATE NAME' is kept as the single state identifier).
# errors="ignore" keeps this cell safe to re-run: it reassigns full_merged_df
# from itself, so on a second run the columns are already gone and a plain
# drop would raise KeyError.
full_merged_df = full_merged_df.drop(columns=['STATE', 'State'], errors="ignore")
full_merged_df

Unnamed: 0,STATE NAME,2020 PCT URBAN POP,RATE,Percent of 18+ pop with a completed primary series
0,Alabama,57.7,152.8,63.3
1,Alaska,64.9,109.5,75.9
2,Arizona,89.3,139.5,75.2
3,Arkansas,55.5,127.7,66.8
4,California,94.2,99.9,83.9
5,Colorado,86.0,84.2,82.9
6,Connecticut,86.3,56.7,91.7
7,Delaware,82.6,83.2,83.2
8,Florida,91.5,111.7,79.6
9,Georgia,74.1,135.9,67.7


In [71]:
# Rename columns for explicitness; errors="raise" makes a missing source column fail loudly
full_merged_df2 = full_merged_df.rename(
    columns={
        "2020 PCT URBAN POP": "Percent of Population in Urban Areas",
        "RATE": "Covid Deaths per 100,000 Residents",
        "Percent of 18+ pop with a completed primary series": "Percent of Adult Population Completed Primary Vaccination Series",
    },
    errors="raise",
)
full_merged_df2

Unnamed: 0,STATE NAME,Percent of Population in Urban Areas,"Covid Deaths per 100,000 Residents",Percent of Adult Population Completed Primary Vaccination Series
0,Alabama,57.7,152.8,63.3
1,Alaska,64.9,109.5,75.9
2,Arizona,89.3,139.5,75.2
3,Arkansas,55.5,127.7,66.8
4,California,94.2,99.9,83.9
5,Colorado,86.0,84.2,82.9
6,Connecticut,86.3,56.7,91.7
7,Delaware,82.6,83.2,83.2
8,Florida,91.5,111.7,79.6
9,Georgia,74.1,135.9,67.7


In [94]:
# Scatter plot for urban vs deaths, with a red OLS trendline
deaths_vs_urban_fig = px.scatter(
    full_merged_df2,
    x="Percent of Population in Urban Areas",
    y="Covid Deaths per 100,000 Residents",
    trendline="ols",
    trendline_color_override="red",
)

# Save figure
deaths_vs_urban_fig.write_image("./urban%_corr_deaths.png")

# Display the figure as the cell output (must be the last expression in the cell)
deaths_vs_urban_fig

In [88]:
# Pull out the two series being compared: urban share (x) and Covid death rate (y)
x_val = full_merged_df2["Percent of Population in Urban Areas"]
y_val = full_merged_df2["Covid Deaths per 100,000 Residents"]

# Least-squares line through the points, and the fitted y value for each x
slope, intercept, rvalue, pvalue, stderr = linregress(x_val, y_val)
regress_values = x_val * slope + intercept

# Human-readable equation of the fitted line
line_eq = "y = {}x + {}".format(str(round(slope, 2)), str(round(intercept, 2)))

# Pearson correlation coefficient, rounded to three decimals
corr = round(st.pearsonr(x_val, y_val)[0], 3)
print(
    "The correlation between the percent of each state's urban population "
    f"to the rate of Covid-19 deaths is: {corr}"
)


The correlation between the percent of each state's urban population to the rate of Covid-19 deaths is: -0.159


In [96]:
# Scatter plot for urban vs vaccination, with a red OLS trendline
vax_vs_urban_fig = px.scatter(
    full_merged_df2,
    x="Percent of Population in Urban Areas",
    y="Percent of Adult Population Completed Primary Vaccination Series",
    trendline="ols",
    trendline_color_override="red",
)

# Write the figure to disk, then show it as the cell output
vax_vs_urban_fig.write_image("./vax_vs_urban_fig.png")
vax_vs_urban_fig

In [86]:
# Pull out the two series being compared: urban share (x) and adult vaccination rate (y)
x_val2 = full_merged_df2["Percent of Population in Urban Areas"]
y_val2 = full_merged_df2["Percent of Adult Population Completed Primary Vaccination Series"]

# Least-squares line through the points, and the fitted y value for each x
slope, intercept, rvalue, pvalue, stderr = linregress(x_val2, y_val2)
regress_values = x_val2 * slope + intercept

# Human-readable equation of the fitted line
line_eq = "y = {}x + {}".format(str(round(slope, 2)), str(round(intercept, 2)))

# Pearson correlation coefficient, rounded to three decimals
corr2 = round(st.pearsonr(x_val2, y_val2)[0], 3)
print(
    "The correlation between the percent of each state's urban population "
    f"relative to the state's rate of adults with a completed vaccination series is: {corr2}"
)

The correlation between the percent of each state's urban population relative to the state's rate of adults with a completed vaccination series is: 0.404


In [112]:
# Median and standard deviation of the per-state urban-population percentage
median_state_urban_pct = x_val2.median()
std_state_urban_pct = x_val2.std()

print(
    f"The median state's urban % is: {round(median_state_urban_pct, 2)}, "
    f"The standard deviation in the % of urban residents is:  {round(std_state_urban_pct, 2)}"
)


The median state's urban % is: 72.65, The standard deviation in the % of urban residents is:  14.82


In [135]:
# Side-by-side box plots of the urban and vaccination percentage distributions
sum_stats = go.Figure(
    data=[
        go.Box(y=x_val2, name='Urban Residents'),
        go.Box(y=y_val2, name='Vaccination Complete'),
    ]
)

# Title the figure and label the shared percentage axis
sum_stats.update_layout(
    title='Urban and Vaccinated % of Residents Across US States',
    yaxis={'title': 'Percentage'},
)

# Render the figure, then save it to disk
sum_stats.show()
sum_stats.write_image("./urban_and_vaccination_stats.png")

In [157]:
# Descriptive statistics for the urban-population percentage series
x_val2_min, x_val2_max = x_val2.min(), x_val2.max()
x_val2_median, x_val2_mean = x_val2.median(), x_val2.mean()
x_val2_std = x_val2.std()

# The same statistics for the adult vaccination percentage series
y_val2_min, y_val2_max = y_val2.min(), y_val2.max()
y_val2_median, y_val2_mean = y_val2.median(), y_val2.mean()
y_val2_std = y_val2.std()

In [160]:
# Table of stats
# Every statistic is rounded to two decimals so the mean is shown at the same
# precision as the standard deviation rather than as a full-precision float.
stats = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=[
                    'Statistic',
                    'Percent of Population in Urban Areas',
                    'Percent of Adult Population Completed Primary Vaccination Series',
                ]
            ),
            cells=dict(
                values=[
                    # Row labels; the two value columns below follow the same order
                    ['Max', 'Mean', 'Median', 'Min', 'Standard Deviation'],
                    [
                        round(value, 2)
                        for value in (x_val2_max, x_val2_mean, x_val2_median, x_val2_min, x_val2_std)
                    ],
                    [
                        round(value, 2)
                        for value in (y_val2_max, y_val2_mean, y_val2_median, y_val2_min, y_val2_std)
                    ],
                ]
            ),
        )
    ]
)

# Render the table, then save it to disk
stats.show()
stats.write_image("./urban_and_vaccination_stats_table.png")