In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [2]:
#Load the dataset
url = "https://data.ct.gov/api/views/rybz-nyjw/rows.csv?accessType=DOWNLOAD"
df = pd.read_csv(url)

In [3]:
#Display the first few rows of the dataset
print(df.head())


         Date      Date Type   Age     Sex   Race Ethnicity Residence City   
0  05/29/2012  Date of death  37.0    Male  Black       NaN       STAMFORD  \
1  06/27/2012  Date of death  37.0    Male  White       NaN        NORWICH   
2  03/24/2014  Date of death  28.0    Male  White       NaN         HEBRON   
3  12/31/2014  Date of death  26.0  Female  White       NaN         BALTIC   
4  01/16/2016  Date of death  41.0    Male  White       NaN        SHELTON   

  Residence County Residence State Injury City  ... Xylazine Gabapentin   
0        FAIRFIELD             NaN    STAMFORD  ...      NaN        NaN  \
1       NEW LONDON             NaN     NORWICH  ...      NaN        NaN   
2              NaN             NaN      HEBRON  ...      NaN        NaN   
3              NaN             NaN         NaN  ...      NaN        NaN   
4        FAIRFIELD              CT     SHELTON  ...      NaN        NaN   

  Opiate NOS Heroin/Morph/Codeine Other Opioid Any Opioid Other   
0        NaN 

In [4]:
# Rename columns
df.rename(columns={
    "Death Date": "Date",
    "Age": "Age",
    "Sex": "Gender",
    "Race": "Race",
    "Residence City": "Residence City",
    "Residence County": "Residence County",
    "Residence State": "Residence State",
    "Location of death": "Location",
    "Location of Death City": "Death City",
    "Location of Death County": "Death County",
    "Location of Death State": "Death State",
    "DescriptionofInjury": "Description of Injury",
    "Heroin(Cause of death)": "Heroin",
    "Cocaine(Cause of death)": "Cocaine",
    "Fentanyl(Cause of death)": "Fentanyl",
    "Fentanyl Analogue(Cause of death)": "Fentanyl Analogue",
    "Oxycodone(Cause of death)": "Oxycodone",
    "Oxymorphone(Cause of death)": "Oxymorphone",
    "Ethanol(Cause of death)": "Ethanol",
    "Hydrocodone(Cause of death)": "Hydrocodone",
    "Benzodiazepine(Cause of death)": "Benzodiazepine",
    "Methadone(Cause of death)": "Methadone",
    "Amphetamines(Cause of death)": "Meth/Amphetamine",
    "Methamphetamine(Cause of death)": "Meth/Amphetamine",
    "Tramadol(Cause of death)": "Tramadol",
    "Hydromorphone(Cause of death)": "Hydromorphone",
    "Other": "Other",
    "Any Opioid": "Any Opioid",
    "Manner of Death": "Manner of Death",
    "Death City": "DeathCityGeo",
    "Residence City": "ResidenceCityGeo",
    "Injury City": "InjuryCityGeo",
    "Injury County": "Injury County",
    "Injury State": "Injury State",
    "Cause of Death": "Cause of Death",
    "Other Opioid": "Other Opioid",
    "Other Significant Conditions and contributing causes": "Other Significant Conditions "
}, inplace=True)

In [5]:
# Drop irrelevant columns
df.drop(["ID", "DateType", "Time", "LocationifOther", "ResidenceCityGeo", "InjuryCityGeo", "DeathCityGeo"], axis=1, inplace=True)


KeyError: "['ID' 'DateType' 'Time' 'LocationifOther' 'ResidenceCityGeo'\n 'InjuryCityGeo' 'DeathCityGeo'] not found in axis"

In [None]:
# Convert Date column to datetime format
df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%Y")

# Convert Age column to numeric format
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

# Convert Y/N columns to 1/0
for col in ["Heroin", "Cocaine", "Fentanyl", "Fentanyl Analogue", "Hydromorphone", "Xylazine", "Gabapentin", "Opiate NOS", "Heroin/Morph/Codeine", "Other Opioid", "Any Opioid", "Other"]:
    df[col] = df[col].apply(lambda x: 1 if x == "Y" else 0)


In [None]:
#Replace missing values with "Unknown"
df.fillna("Unknown", inplace=True)


In [None]:
#Find and remove outliers
numeric_cols = ["Age"]
numeric_df = df[numeric_cols]
z_scores = np.abs(StandardScaler().fit_transform(numeric_df))
outliers = np.where(z_scores > 3)
df = df.drop(df.index[outliers])


In [None]:
# Explore the relationship between variables using a heatmap
sns.heatmap(df.corr(), annot=True)
plt.show()

# Explore the income distribution by gender
sns.histplot(data=df, x="Gender", hue="Income", multiple="stack")
plt.show()
