In [3]:
# Install required packages
%pip install pandas numpy seaborn scikit-learn

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the CSV data from the local file
file_path = r"C:\Users\lewis\OneDrive\Documents\2023-02-DSI-WE\Sal1776\Accidental_Drug_Related_Deaths_2012-2021.csv"
df = pd.read_csv(file_path)

# Rename columns with friendly names
df.rename(columns={
    "Age": "Age",
    "Sex": "Gender",
    "Race": "Race",
    "ResidenceCity": "Residence City",
    "ResidenceState": "Residence State",
    "DeathCity": "Death City",
    "DeathState": "Death State",
    "Location": "Location",
    "DescriptionofInjury": "Description of Injury",
    "InjuryPlace": "Injury Place",
    "ImmediateCauseA": "Immediate Cause of Death",
    "Heroin": "Heroin",
    "Cocaine": "Cocaine",
    "Fentanyl": "Fentanyl",
    "Oxycodone": "Oxycodone",
    "Oxymorphone": "Oxymorphone",
    "EtOH": "Alcohol",
    "Hydrocodone": "Hydrocodone",
    "Benzodiazepine": "Benzodiazepine",
    "Methadone": "Methadone",
    "Amphet": "Amphetamines",
    "Tramad": "Tramadol",
    "Morphine (not heroin)": "Morphine",
    "Other": "Other",
    "Any Opioid": "Any Opioid",
    "MannerofDeath": "Manner of Death",
    "DeathLoc": "Death Location"
}, inplace=True)


# Explore the dataset
print("Data Shape:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nData Description:\n", df.describe(include="all"))

# Check for missing data
print("\nMissing Data:\n", df.isnull().sum())

# Check for outliers
numeric_cols = ["Age", "Hours per Week"]
numeric_df = df[numeric_cols]
z_scores = np.abs(StandardScaler().fit_transform(numeric_df))
outliers = np.where(z_scores > 3)
print("\nOutliers:\n", numeric_df.iloc[outliers])

# Explore the relationship between variables
sns.heatmap(df.corr(), annot=True)

# Explore the income distribution by gender
sns.histplot(data=df, x="Gender", hue="Income", multiple="stack")

# Explore the hours per week distribution by education level
sns.boxplot(data=df, x="Education", y="Hours per Week")

# Prepare the data for modeling
df["Gender"] = df["Gender"].apply(lambda x: 1 if x == "Male" else 0)
df["Income"] = df["Income"].apply(lambda x: 1 if x == ">50K" else 0)
X = df.drop("Income", axis=1)
y = df["Income"]
X_encoded = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Train a logistic regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluate the model performance
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Note: you may need to restart the kernel to use updated packages.
Data Shape: (9202, 48)

Data Types:
 Date                              object
Date Type                         object
Age                              float64
Gender                            object
Race                              object
Ethnicity                         object
Residence City                    object
Residence County                  object
Residence State                   object
Injury City                       object
Injury County                     object
Injury State                      object
Injury Place                      object
Description of Injury             object
Death City                        object
Death County                      object
Death State                       object
Location                          object
Location if Other                 object
Cause of Death                    object
Manner of Death                   object
Other Significant Conditions      ob

KeyError: "['Hours per Week'] not in index"