In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [2]:
#Load the dataset
url = "https://data.ct.gov/api/views/rybz-nyjw/rows.csv?accessType=DOWNLOAD"
df = pd.read_csv(url)

In [3]:
#Display the first few rows of the dataset
print(df.head())


         Date      Date Type   Age     Sex   Race Ethnicity Residence City   
0  05/29/2012  Date of death  37.0    Male  Black       NaN       STAMFORD  \
1  06/27/2012  Date of death  37.0    Male  White       NaN        NORWICH   
2  03/24/2014  Date of death  28.0    Male  White       NaN         HEBRON   
3  12/31/2014  Date of death  26.0  Female  White       NaN         BALTIC   
4  01/16/2016  Date of death  41.0    Male  White       NaN        SHELTON   

  Residence County Residence State Injury City  ... Xylazine Gabapentin   
0        FAIRFIELD             NaN    STAMFORD  ...      NaN        NaN  \
1       NEW LONDON             NaN     NORWICH  ...      NaN        NaN   
2              NaN             NaN      HEBRON  ...      NaN        NaN   
3              NaN             NaN         NaN  ...      NaN        NaN   
4        FAIRFIELD              CT     SHELTON  ...      NaN        NaN   

  Opiate NOS Heroin/Morph/Codeine Other Opioid Any Opioid Other   
0        NaN 

In [4]:
#Rename the columns to be more readable
df.rename(columns={
    "Date": "Date",
    "DateType": "Date Type",
    "Age": "Age",
    "Sex": "Gender",
    "Race": "Race",
    "ResidenceCity": "Residence City",
    "ResidenceCounty": "Residence County",
    "ResidenceState": "Residence State",
    "DeathCity": "Death City",
    "DeathCounty": "Death County",
    "Location": "Location",
    "LocationifOther": "Location if Other",
    "DescriptionofInjury": "Description of Injury",
    "InjuryPlace": "Injury Place",
    "InjuryCity": "Injury City",
    "InjuryCounty": "Injury County",
    "InjuryState": "Injury State",
    "CauseofDeath": "Cause of Death",
    "MannerofDeath": "Manner of Death",
    "AmendedMannerofDeath": "Amended Manner of Death",
    "DeathLoc": "Death Location",
    "Certifier": "Certifier",
    "Pronouncer": "Pronouncer",
    "PronouncerTitle": "Pronouncer Title",
    "OtherSignifican": "Other Significant Conditions ",
    "Heroin": "Heroin",
    "Cocaine": "Cocaine",
    "Fentanyl": "Fentanyl",
    "FentanylAnalogue": "Fentanyl Analogue",
    "Oxycodone": "Oxycodone",
    "Oxymorphone": "Oxymorphone",
    "Ethanol": "Ethanol",
    "Hydrocodone": "Hydrocodone",
    "Benzodiazepine": "Benzodiazepine",
    "Methadone": "Methadone",
    "Amphet": "Meth/Amphetamine",
    "Tramad": "Tramadol",
    "Morphine_NotHeroin": "Morphine (Not Heroin)",
    "Hydromorphone": "Hydromorphone",
    "Other": "Other",
    "AnyOpioid": "Any Opioid",
    "MannerofDeath": "Manner of Death",
    "DeathCityGeo": "DeathCityGeo",
    "ResidenceCityGeo": "ResidenceCityGeo",
    "InjuryCityGeo": "InjuryCityGeo"
}, inplace=True)


In [5]:
# Drop irrelevant columns
df.drop(["ID", "DateType", "Time", "LocationifOther", "ResidenceCityGeo", "InjuryCityGeo", "DeathCityGeo"], axis=1, inplace=True)


KeyError: "['ID' 'DateType' 'Time' 'LocationifOther'] not found in axis"

In [6]:
print(df.columns)

Index(['Date', 'Date Type', 'Age', 'Gender', 'Race', 'Ethnicity',
       'Residence City', 'Residence County', 'Residence State', 'Injury City',
       'Injury County', 'Injury State', 'Injury Place',
       'Description of Injury', 'Death City', 'Death County', 'Death State',
       'Location', 'Location if Other', 'Cause of Death', 'Manner of Death',
       'Other Significant Conditions ', 'Heroin',
       'Heroin death certificate (DC)', 'Cocaine', 'Fentanyl',
       'Fentanyl Analogue', 'Oxycodone', 'Oxymorphone', 'Ethanol',
       'Hydrocodone', 'Benzodiazepine', 'Methadone', 'Meth/Amphetamine',
       'Meth/Amphetamine', 'Tramadol', 'Hydromorphone',
       'Morphine (Not Heroin)', 'Xylazine', 'Gabapentin', 'Opiate NOS',
       'Heroin/Morph/Codeine', 'Other Opioid', 'Any Opioid', 'Other',
       'ResidenceCityGeo', 'InjuryCityGeo', 'DeathCityGeo'],
      dtype='object')


In [None]:
#Replace missing values with "Unknown"
df.fillna("Unknown", inplace=True)


In [None]:
#Find and remove outliers
numeric_cols = ["Age"]
numeric_df = df[numeric_cols]
z_scores = np.abs(StandardScaler().fit_transform(numeric_df))
outliers = np.where(z_scores > 3)
df = df.drop(df.index[outliers])


In [None]:
# Explore the relationship between variables using a heatmap
sns.heatmap(df.corr(), annot=True)
plt.show()

# Explore the income distribution by gender
sns.histplot(data=df, x="Gender", hue="Income", multiple="stack")
plt.show()
