In [None]:
# In this analysis, we will be examining deaths by drug overdose and various contributing factors. 
# Data examined includes information from the years 2021-2023 in the United States, for individuals 18 and older.
# The various contributing factors analyzed are High School Graduation Rate, Median Household Income, and Unemployment Rate. 
# These values are averages from 2021-2023. 

In [1]:
# Import modules
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
import hvplot.pandas
import pandas as pd
from pathlib import Path


In [25]:
# Load the pre-cleaned datasets (HS graduation rate, median household income,
# unemployment rate, drug overdose deaths) and the state latitude/longitude lookup.
# These CSVs were already cleaned in separate notebooks in this repository.
# TODO: name the folders holding the raw data and cleaning notebooks (**INSERT FOLDERS HERE**).

# Relative paths to each source file
hs_grad_data = Path("Resources/Average_HS_Grad_Rates.csv")
income_data = Path("Median Household Income.csv")
unemployment_data = Path("unemployment_rate_by_state_2021_2023.csv")
overdose_data = Path("average_deaths_df.csv")
state_geo_data = Path("Resources/state_lat_lon.csv")

# Read each file into its own DataFrame
hs_grad_df = pd.read_csv(hs_grad_data)
income_df = pd.read_csv(income_data)
unemployment_df = pd.read_csv(unemployment_data)
overdose_df = pd.read_csv(overdose_data)
state_geo_df = pd.read_csv(state_geo_data)

# Preview the geographic lookup table
state_geo_df.head()


Unnamed: 0.1,Unnamed: 0,State name,lat,lon,correct lon
0,0,Alabama,32.7794,-86.8287,86.8287
1,1,Alaska,64.0685,-152.2782,152.2782
2,2,Arizona,34.2744,-111.6602,111.6602
3,3,Arkansas,34.8938,-92.4426,92.4426
4,4,California,37.1841,-119.4696,119.4696


In [29]:
### Trim each DataFrame down to the columns needed for the analysis
# HS graduation rates: state, average graduation rate, and average population
hs_grad_final_df = hs_grad_df[["State name", "Average High School Graduation Rate (2021-2023)", "Average Population (2021-2023)"]]
# Unemployment rates: state and the 2021-2023 average rate
unemployment_final_df = unemployment_df[["State name", "21-23 Avg Unemp Rate"]]
# Overdose deaths: the key column is spelled "State Name" in this file, so rename it
# to "State name" to match the key used by the other frames when merging
overdose_final_df = overdose_df[["State Name", "Average Deaths by OD"]]
overdoses_final_df = overdose_final_df.rename(columns={"State Name":"State name"})
# State geographic data: keep the state key plus coordinates, renamed for readability
# NOTE(review): in the state_geo_df preview, "correct lon" holds positive values
# (e.g. 86.8287 for Alabama) while "lon" is negative. Western-hemisphere longitudes
# are conventionally negative -- confirm which sign the map plot expects.
state_geo_final_df = state_geo_df[["State name", "lat", "correct lon"]]
states_geo_final_df = state_geo_final_df.rename(columns={"lat":"Latitude", "correct lon":"Longitude"})
# Show the renamed geographic frame so this cell's output is regenerated on a fresh run
states_geo_final_df.head()


Unnamed: 0,State name,Latitude,Longitude
0,Alabama,32.7794,86.8287
1,Alaska,64.0685,152.2782
2,Arizona,34.2744,111.6602
3,Arkansas,34.8938,92.4426
4,California,37.1841,119.4696


In [32]:
# Attach each state's latitude/longitude to its overdose figures.
# Inner join on "State name": only states present in both frames are kept.
overdose_states_geo_df = pd.merge(
    overdoses_final_df,
    states_geo_final_df,
    on="State name",
    how="inner",
)
overdose_states_geo_df

Unnamed: 0,State name,Average Deaths by OD,Latitude,Longitude
0,Alabama,1461.0,32.7794,86.8287
1,Alaska,286.333333,64.0685,152.2782
2,Arizona,2745.666667,34.2744,111.6602
3,Arkansas,578.333333,34.8938,92.4426
4,California,11842.0,37.1841,119.4696
5,Colorado,1899.666667,38.9972,105.5478
6,Connecticut,1456.0,41.6219,72.7273
7,Delaware,524.333333,38.9896,75.505
8,District of Columbia,583.666667,38.9101,77.0147
9,Florida,7795.333333,28.6305,82.4497


In [33]:
# Add the HS graduation rate and population columns to the overdose + geographic frame.
# Inner join on "State name": states missing from either side are dropped.
overdose_geo_HS_df = overdose_states_geo_df.merge(
    right=hs_grad_final_df,
    on="State name",
    how="inner",
)
overdose_geo_HS_df

Unnamed: 0,State name,Average Deaths by OD,Latitude,Longitude,Average High School Graduation Rate (2021-2023),Average Population (2021-2023)
0,Alabama,1461.0,32.7794,86.8287,88.08,3954008
1,Alaska,286.333333,64.0685,152.2782,92.6,556716
2,Arizona,2745.666667,34.2744,111.6602,88.6,5761630
3,Arkansas,578.333333,34.8938,92.4426,88.74,2346215
4,California,11842.0,37.1841,119.4696,85.24,30508752
5,Colorado,1899.666667,38.9972,105.5478,92.11,4621961
6,Connecticut,1456.0,41.6219,72.7273,91.21,2889563
7,Delaware,524.333333,38.9896,75.505,91.12,808333
8,District of Columbia,583.666667,38.9101,77.0147,93.06,548108
9,Florida,7795.333333,28.6305,82.4497,89.56,17891648


In [36]:
# Add the average unemployment rate to the overdose + geographic + HS graduation frame.
# Inner join on "State name": states missing from either side are dropped.
overdose_geo_HS_unemp_df = pd.merge(
    left=overdose_geo_HS_df,
    right=unemployment_final_df,
    how="inner",
    on="State name",
)
overdose_geo_HS_unemp_df

Unnamed: 0,State name,Average Deaths by OD,Latitude,Longitude,Average High School Graduation Rate (2021-2023),Average Population (2021-2023),21-23 Avg Unemp Rate
0,Alabama,1461.0,32.7794,86.8287,88.08,3954008,2.638538
1,Alaska,286.333333,64.0685,152.2782,92.6,556716,2.299195
2,Arizona,2745.666667,34.2744,111.6602,88.6,5761630,3.912461
3,Arkansas,578.333333,34.8938,92.4426,88.74,2346215,2.87626
4,California,11842.0,37.1841,119.4696,85.24,30508752,2.740597
5,Colorado,1899.666667,38.9972,105.5478,92.11,4621961,4.037969
6,Connecticut,1456.0,41.6219,72.7273,91.21,2889563,2.840842
7,Delaware,524.333333,38.9896,75.505,91.12,808333,3.500364
8,District of Columbia,583.666667,38.9101,77.0147,93.06,548108,2.704639
9,Florida,7795.333333,28.6305,82.4497,89.56,17891648,4.382576


In [None]:
# Merge overdose, state geographic data, HS graduation, unemployment data with median income on "State name"


In [None]:
# If needed, calculate any Rates by dividing by Total Population for given year


# Create new column in DataFrame for the calculated rate


# Average the rates for 2021-2023 and store in the new column

In [None]:
# Drop any rows with N/A values - Puerto Rico, District of Columbia may not be needed? 

In [None]:
# Start Calculations
# Q1: How do education rates (high school graduation) affect overdose rates by county across different regional areas in the US?
# graph: each state as a data point on a scatterplot comparing HS graduation rate to drug overdose rate, with a regression line


# Plot HS Grad Rates vs Drug Overdose Rates on Scatter Plot


# Format

In [None]:
# Add Linear Regression Equation and Line to Plot, as well as r-squared value 


# Export graph as jpeg for use in presentation

In [None]:
# In Markdown below, discuss relationship between HS Grad Rates and Drug Overdose Rates

Add Discussion here. 

In [None]:
# Q2: Is there a link between socioeconomic status (household median income) and rate of overdoses? 
# graph: each state as a data point on a scatter plot comparing median household income to drug overdose rate, with regression line


# Plot Median Household Income vs Drug Overdose Rates on Scatter Plot


# Format


In [None]:
# Add Linear Regression Equation and Line to Plot, as well as r-squared value 


# Export graph as jpeg for use in presentation

In [None]:
# In markdown below, discuss relationship between Median Household Income and Drug Overdose Rates

Add Discussion here. 

In [None]:
# Q3: Does unemployment rate affect rates of drug overdose by state? 
# graph: each state as a data point on a scatter plot comparing unemployment rate to drug overdose rate, with regression line


# Plot Unemployment Rate vs Drug Overdose Rates on Scatter Plot


# Format


In [None]:
# Add Linear Regression Equation and Line to Plot, as well as r-squared value 


# Export graph as jpeg for use in presentation

In [None]:
# In markdown below, discuss relationship between Unemployment Rate and Drug Overdose Rates

Add Discussion here. 

In [None]:
# Statistical Calculations: 


# Create a Boxplot of all States Average Overdose Rate. 


# Calculate Quartiles

In [None]:
# Use the quartiles calculated above, along with the contributing factor (income, unemployment, grad rate) found to have the strongest
# correlation to Overdose Rates, to make 4 boxplots (side by side)


# Perform ANOVA test on boxplots to determine if there is a significant difference between the 4 quartiles


# Export graph as jpeg for use in presentation

In [None]:
# In markdown below, discuss ANOVA test and the quartiles created

Add Discussion here.

In [None]:
# For Presentation, create map of all 50 states, using Drug Overdose Rate for the "size"


# Export graph as jpeg for use in presentation

In [None]:
# In markdown below, add summary of project and discuss any further observations from analysis.