In [4]:
#dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

In [5]:
# Load the provider-locations CSV and preview it.
# The path is relative, so it is resolved against the kernel's working directory.
file = Path("../datasets/provider_locations.csv")
provider_df = pd.read_csv(filepath_or_buffer=file)
provider_df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/provider_locations.csv'

In [None]:
# Load the 2020 census urban-vs-rural CSV and preview it.
# The path is relative, so it is resolved against the kernel's working directory.
file2 = Path("../datasets/2020census_urban_vs_rural.csv")
census_df = pd.read_csv(filepath_or_buffer=file2)
census_df.head(n=5)

In [None]:
# Load the cleaned death-rates CSV and preview it.
# The path is relative, so it is resolved against the kernel's working directory.
file3 = Path("../datasets/cleaned_death_rates.csv")
death_rates_df = pd.read_csv(filepath_or_buffer=file3)
death_rates_df.head(n=5)

In [None]:
# Load the COVID-19 vaccinations CSV and preview it.
# The path is relative, so it is resolved against the kernel's working directory.
file4 = Path("../datasets/covid19_vaccinations.csv")
vaccine_df = pd.read_csv(filepath_or_buffer=file4)
vaccine_df.head(n=5)

In [None]:
# Shorten the jurisdiction column to "State" so the vaccine data can be merged on it.
vaccine_column_names = {"Jurisdiction (State/Territory) or Federal Entity": "State"}
renamed_vaccine = vaccine_df.rename(columns=vaccine_column_names)
renamed_vaccine.head()

In [None]:
# Rename the census columns: "STATE NAME" becomes the "State" merge key and
# "2020 TOTAL POP" becomes "Total Population".
census_column_names = {
    "STATE NAME": "State",
    "2020 TOTAL POP": "Total Population",
}
renamed_census = census_df.rename(columns=census_column_names)
renamed_census.head()

In [None]:
# Join providers with the renamed census data on "State"
# (inner join, which is the pandas default).
merged_df = provider_df.merge(renamed_census, how="inner", on="State")
merged_df.head()

In [None]:
# Keep only the columns used in the population-vs-provider analysis.
# .copy() makes pop_vs_provider an independent frame: later cells add and overwrite
# columns on it, and doing that on a bare selection of merged_df can raise pandas'
# SettingWithCopyWarning.
pop_vs_provider = merged_df[["State", "Total Population", "Total Providers"]].copy()
pop_vs_provider.head()

In [None]:
# Inspect the dtype of each column; the following cells strip the commas from
# "Total Population" and convert it with pd.to_numeric.
pop_vs_provider.dtypes

In [None]:
# Strip the comma separators from "Total Population" so the column can be converted
# to a numeric dtype afterwards.
# assign() builds a new frame instead of writing into pop_vs_provider in place:
# pop_vs_provider was selected out of merged_df, and in-place column writes on such
# a selection can raise pandas' SettingWithCopyWarning.
# regex=False makes "," an explicit literal match regardless of the pandas default.
pop_vs_provider = pop_vs_provider.assign(
    **{"Total Population": pop_vs_provider["Total Population"].str.replace(",", "", regex=False)}
)

pop_vs_provider.head()

In [None]:
# Convert "Total Population" to a numeric dtype so it can be used in arithmetic.
# assign() returns a new frame rather than writing into the existing one, which
# avoids pandas' SettingWithCopyWarning on a frame that was selected from merged_df.
pop_vs_provider = pop_vs_provider.assign(
    **{"Total Population": pd.to_numeric(pop_vs_provider["Total Population"])}
)
pop_vs_provider.dtypes

In [None]:
# Preview the frame after the "Total Population" conversion above.
pop_vs_provider.head()

In [None]:
# Providers as a percentage of each state's population, stored in a new column.
# assign() returns a new frame rather than writing into the existing one, which
# avoids pandas' SettingWithCopyWarning on a frame that was selected from merged_df.
percent = pop_vs_provider["Total Providers"] / pop_vs_provider["Total Population"] * 100
pop_vs_provider = pop_vs_provider.assign(**{"% of Providers Relative to Pop": percent})
pop_vs_provider.head()

In [None]:
# Express total population in units of 2,000,000 people to get more manageable numbers.
# assign() returns a new frame rather than writing into the existing one, which
# avoids pandas' SettingWithCopyWarning on a frame that was selected from merged_df.
per_2M = pop_vs_provider["Total Population"] / 2_000_000
pop_vs_provider = pop_vs_provider.assign(**{"Total Population per 2M": per_2M})
pop_vs_provider.head()

In [None]:
# State, provider percentage and scaled population in their own frame.
filtered_providers = pop_vs_provider.loc[
    :, ["State", "% of Providers Relative to Pop", "Total Population per 2M"]
]
filtered_providers.head()

In [None]:
# Only the state and its provider percentage.
provider_percent = pop_vs_provider.loc[:, ["State", "% of Providers Relative to Pop"]]
provider_percent.head()

In [None]:
# Index the rows by state; the result is plotted as a bar chart in the next cell.
provider_percent2 = provider_percent.set_index(keys="State")
provider_percent2.head()

In [None]:
# Bar chart of the provider percentage for every state.
percent_bar = provider_percent2.plot.bar(color="blue", figsize=(30, 14))

# Label the axes and title through the Axes object returned by the plot call.
percent_bar.set_xlabel("State")
percent_bar.set_ylabel("% of Providers Relative to Pop")
percent_bar.set_title("% of Providers Per State Population")

# Write the figure to disk, then render it.
plt.savefig("providers_per_state_pop.png")
plt.show()

In [None]:
# Attach the death-rate data to the provider percentages, matching rows on "State"
# (inner join, which is the pandas default).
deaths_provider_merged = provider_percent.merge(death_rates_df, how="inner", on="State")
deaths_provider_merged.head()

In [None]:
# Keep the state, its provider percentage and its "Rate" column.
filtered_merged = deaths_provider_merged.loc[
    :, ["State", "% of Providers Relative to Pop", "Rate"]
]
filtered_merged.head()

In [None]:
# Index the rows by state.
filtered_merged2 = filtered_merged.set_index(keys="State")
filtered_merged2.head()

In [None]:
# Scatter of "Rate" (x) against the provider percentage (y), drawn on an explicit Axes.
fig1, ax1 = plt.subplots(figsize=(6, 5))
ax1.scatter(
    filtered_merged2["Rate"],
    filtered_merged2["% of Providers Relative to Pop"],
    color="blue",
)
ax1.set_xlabel("Rate")
ax1.set_ylabel("% of Providers Relative to Pop")

plt.show()

In [None]:
# Linear regression of the provider percentage (y) on "Rate" (x).
# The x and y series are held in variables so they can be reused for the fit and the plot.
x_values = filtered_merged2["Rate"]
y_values = filtered_merged2["% of Providers Relative to Pop"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Pearson correlation coefficient, rounded to two decimals for the printed summary.
correlation = round(st.pearsonr(x_values, y_values)[0],2)
print(f"The correlation between death rate and the percent of providers relative to state population is: {correlation}")

# Same scatter as before, with the fitted line drawn in red.
plt.scatter(x_values, y_values, color="blue")
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner). The previous
# anchor was the fixed data point (5.8, 0.8); with data coordinates the label is only
# drawn when that point lies inside the axes, which depends on the data range.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", color="red")
plt.xlabel("Rate")
plt.ylabel("% of Providers Relative to Pop")
plt.show()

In [None]:
# Attach the death-rate data to the full population/provider frame, matching on "State"
# (inner join, which is the pandas default).
deaths_provider_merged2 = pop_vs_provider.merge(death_rates_df, how="inner", on="State")
deaths_provider_merged2.head()

In [None]:
# Keep the state, its total provider count and its "Rate" column.
cleaned_merged2 = deaths_provider_merged2.loc[:, ["State", "Total Providers", "Rate"]]
cleaned_merged2.head()

In [None]:
# Index the rows by state.
cleaned_merged3 = cleaned_merged2.set_index(keys="State")
cleaned_merged3.head()

In [None]:
# Scatter of "Rate" (x) against the total provider count (y), drawn on an explicit Axes.
fig1, ax1 = plt.subplots(figsize=(6, 5))
ax1.scatter(
    cleaned_merged3["Rate"],
    cleaned_merged3["Total Providers"],
    color="blue",
)
ax1.set_xlabel("Rate")
ax1.set_ylabel("Total Providers")

plt.show()

In [None]:
# Linear regression of the total provider count (y) on "Rate" (x).
# The x and y series are held in variables so they can be reused for the fit and the plot.
x_values = cleaned_merged3["Rate"]
y_values = cleaned_merged3["Total Providers"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Pearson correlation coefficient, rounded to two decimals for the printed summary.
# The message now names the quantity actually correlated here ("Total Providers");
# it previously repeated the provider-percentage wording from the earlier cell.
correlation = round(st.pearsonr(x_values, y_values)[0],2)
print(f"The correlation between death rate and the total number of providers per state is: {correlation}")

# Same scatter as before, with the fitted line drawn in red.
fig1, ax1 = plt.subplots(figsize=(7, 5))
plt.scatter(x_values, y_values, color="blue")
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner). The previous
# anchor was the fixed data point (5.8, 0.8); with data coordinates the label is only
# drawn when that point lies inside the axes, which depends on the data range.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", color="red")
plt.xlabel("Death Rate")
plt.ylabel("Total Providers")
plt.title("# of Total Providers Per State vs Death Rate")
# Save the graph, then show it.
plt.savefig("total_providers_vs_death.png")
plt.show()

In analyzing the correlation between each state's COVID-19 death rate and its total number of providers, the r-value was calculated to be 0.18. This is considered a very weak correlation, so the data do not support our hypothesis that states with more available vaccine providers will have a lower death rate.

In [None]:
# Add the vaccination columns to the provider/death-rate frame, matching on "State"
# (inner join, which is the pandas default).
providers_and_vaccines = deaths_provider_merged2.merge(renamed_vaccine, how="inner", on="State")
providers_and_vaccines.head()

In [None]:
# Keep the state, its provider percentage and its completed-primary-series percentage.
filtered = providers_and_vaccines.loc[
    :,
    [
        "State",
        "% of Providers Relative to Pop",
        "Percent of total pop with a completed primary series",
    ],
]
filtered.head()

In [None]:
# Index the rows by state.
# Guarded so the cell can be re-run: after the first run "State" is the index rather
# than a column, and an unguarded set_index("State") would raise KeyError.
if "State" in filtered.columns:
    filtered = filtered.set_index("State")
filtered.head()

In [None]:
# Scatter of the provider percentage (x) against the completed-primary-series
# percentage (y), drawn on an explicit Axes.
fig1, ax1 = plt.subplots(figsize=(6, 5))
ax1.scatter(
    filtered["% of Providers Relative to Pop"],
    filtered["Percent of total pop with a completed primary series"],
    color="blue",
)
ax1.set_xlabel("% of Providers Relative to Pop")
ax1.set_ylabel("Percent of total pop with a completed primary series")

plt.show()

In [None]:
# Linear regression of the completed-primary-series percentage (y) on the
# provider percentage (x).
x_values = filtered["% of Providers Relative to Pop"]
y_values = filtered["Percent of total pop with a completed primary series"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Pearson correlation coefficient, rounded to two decimals for the printed summary.
correlation = round(st.pearsonr(x_values, y_values)[0],2)
print(f"The correlation between available providers and the percent of the total population with a completed primary series is: {correlation}")

# Same scatter as before, with the fitted line drawn in red.
fig1, ax1 = plt.subplots(figsize=(6, 5))
plt.scatter(x_values, y_values, color="blue")
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner). The previous
# anchor was the fixed data point (5.8, 0.8); with data coordinates the label is only
# drawn when that point lies inside the axes, which depends on the data range.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", color="red")
plt.xlabel("% of Providers Relative to Population")
plt.ylabel("% of Total Population With a Completed Primary Series")
plt.title("% Fully Vaccinated vs % of Providers Relative to Population")
# Save the graph with an explicit .png extension, matching the other savefig calls
# in this notebook, then show it.
plt.savefig("percent_vaccinated_vs_percent_providers.png")
plt.show()

In [None]:
# Keep the state, its total provider count and its completed-primary-series percentage.
filtered2 = providers_and_vaccines.loc[
    :,
    [
        "State",
        "Total Providers",
        "Percent of total pop with a completed primary series",
    ],
]
filtered2.head()

In [None]:
# Index the rows by state.
# Guarded so the cell can be re-run: after the first run "State" is the index rather
# than a column, and an unguarded set_index("State") would raise KeyError.
if "State" in filtered2.columns:
    filtered2 = filtered2.set_index("State")
filtered2.head()

In [None]:
# Scatter of the total provider count (x) against the completed-primary-series
# percentage (y), drawn on an explicit Axes.
fig1, ax1 = plt.subplots(figsize=(6, 5))
ax1.scatter(
    filtered2["Total Providers"],
    filtered2["Percent of total pop with a completed primary series"],
    color="blue",
)
ax1.set_xlabel("Total Providers")
ax1.set_ylabel("% of Total Population with a Completed Primary Series")

plt.show()

In [None]:
# Linear regression of the completed-primary-series percentage (y) on the
# total provider count (x).
x_values = filtered2["Total Providers"]
y_values = filtered2["Percent of total pop with a completed primary series"]

# Fit the line and evaluate it at every x so it can be drawn over a scatter.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Human-readable equation of the fitted line, coefficients rounded to two decimals.
# NOTE(review): regress_values and line_eq are not used in the visible part of this
# cell — confirm whether a plotting section follows.
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

#print statement for pearsonr
correlation = round(st.pearsonr(filtered2["Total Providers"], 
                                filtered2["Percent of total pop with a completed primary series"])[0],2)
print(f"The correlation between total providers and the percent of the total population with a completed primary series is: {correlation}")
