## **Overall Mortality**

### Remove Irrelevant Columns

In [22]:
import pandas as pd
# Columns removed from every yearly file before the files are merged.
# NOTE(review): "detail_age_type" is dropped here while "detail_age" is kept
# and later binned as if it were expressed in years — confirm that detail_age
# can be interpreted without detail_age_type.
columns_to_drop = [
    "resident_status",
    "education_1989_revision",
    "education_2003_revision",
    "education_reporting_flag",
    "age_substitution_flag",
    "age_recode_52",
    "age_recode_27",
    "age_recode_12",
    "infant_age_recode_22",
    "marital_status",
    "day_of_week_of_death",
    "injury_at_work",
    "manner_of_death",
    "autopsy",
    "activity_code",
    "place_of_injury_for_causes_w00_y34_except_y06_and_y07_",
    "358_cause_recode",
    "113_cause_recode",
    "130_infant_cause_recode",
    "39_cause_recode",
    "number_of_entity_axis_conditions",
    "bridged_race_flag",
    "race_imputation_flag",
    "race_recode_3",
    "hispanic_origin",
    "hispanic_originrace_recode",
    "race",
    "number_of_record_axis_conditions",
    "method_of_disposition",
    "detail_age_type",
]
# entity_condition_1 ... entity_condition_20 are dropped as well.
entity_columns = [f"entity_condition_{x}" for x in range(1, 21)]

for year in range(2005, 2016):
    file_name = f"{year}_data.csv"
    df = pd.read_csv(file_name, low_memory=False)
    # Each yearly file is overwritten in place, so on a second run the columns
    # are already gone. errors="ignore" keeps the cell re-runnable instead of
    # failing with KeyError.
    df = df.drop(columns=columns_to_drop + entity_columns, errors="ignore")
    df.to_csv(file_name, index=False)


### Merge CSVs

In [27]:
import pandas as pd
# One trimmed file per year, 2005 through 2015.
csv_files = [f"{year}_data.csv" for year in range(2005, 2016)]

# Load every yearly file first, then stack them in a single concat call.
new_df = [pd.read_csv(path, low_memory=False) for path in csv_files]
combined_df = pd.concat(new_df, ignore_index=True)

# Persist the combined table for the filtering cells that follow.
combined_df.to_csv("mortality.csv", index=False)


## **Substance Use Mortality**

### Filter for SU as underlying or contributing cause

In [28]:
import pandas as pd

df = pd.read_csv("mortality.csv", low_memory=False)

# Code prefixes used to flag a record as substance-use related.
codes = [
    'E244', 'F10', 'G312', 'G621', 'G721', 'I426', 'K70', 'K852', 'K860', 'R780',
    'T51', 'X65', 'Y15', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F18', 'F19',
    'R781', 'R782', 'R783', 'R784', 'T40', 'T423', 'T424', 'T426', 'T427', 'T436',
    'X61', 'X62', 'X64', 'X66', 'Y11', 'Y12', 'Y14', 'Y16', 'T52', 'T53', 'T509',
    'T400', 'T401', 'T402', 'T403', 'T404', 'T406', 'T407', 'T405'
]
code_prefixes = tuple(codes)

# Candidate columns: the underlying-cause code plus record_condition_1..20,
# restricted to the ones actually present in the file.
candidate_cols = ['icd_code_10th_revision']
candidate_cols.extend(f'record_condition_{i}' for i in range(1, 21))
cols = [name for name in candidate_cols if name in df.columns]


def starts_with_su_code(column):
    """Return a boolean Series: True where the value begins with any listed prefix."""
    return column.str.startswith(code_prefixes)


# Values are compared as strings, so missing cells become the text "nan"
# and never match a prefix.
mask = df[cols].astype(str).apply(starts_with_su_code)

# A record is kept when at least one of its columns matched.
row_matches = mask.any(axis=1)
filtered_df = df[row_matches]

filtered_df.to_csv('SU_mortality_final.csv', index=False)

### Adding custom age recode

In [29]:
import pandas as pd
df = pd.read_csv("SU_mortality_final.csv", low_memory=False)

# Right-inclusive bin edges; for whole-number ages the groups are
# 0-24, 25-39, 40-54, 55-69, 70-84 and 85+.
bins = [0, 24, 39, 54, 69, 84, float('inf')]
labels = list(range(6))  # group ids 0..5, one per bin

age_group = pd.cut(
    df['detail_age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# NOTE(review): values pd.cut cannot place (missing or below 0) become NaN,
# which the integer cast is expected to reject — confirm the input has none.
df = df.assign(age_recode_custom=age_group.astype(int))

# The file is updated in place with the new column.
df.to_csv('SU_mortality_final.csv', index=False)


### Remove mortalities < age 25 

In [33]:
import pandas as pd 
df = pd.read_csv("SU_mortality_final.csv", low_memory=False)

# Group 0 is the first bin of the custom recode; keep every other group.
not_youngest_group = df["age_recode_custom"].ne(0)
df = df.loc[not_youngest_group]

df.to_csv('SU_mortality_final.csv', index=False)

## **Heart Disease Mortality**

### Filter HD as underlying or contributing cause (I00-I09, I11, I13, I20-I51)

In [31]:
import pandas as pd

# NOTE(review): this cell reads "mortality_final.csv", whereas the merge cell
# writes "mortality.csv" and the substance-use filter reads that file. Nothing
# visible in this notebook creates "mortality_final.csv" — confirm which input
# is intended.
df = pd.read_csv("mortality_final.csv", low_memory=False)

# Three-character ICD-10 prefixes for heart disease: I00-I09, I11, I13, I20-I51.
# Generated from ranges so that no prefix is listed twice.
hd_numbers = [*range(0, 10), 11, 13, *range(20, 52)]
codes = [f"I{number:02d}" for number in hd_numbers]
code_prefixes = tuple(codes)  # built once instead of once per column

# All columns to check: the underlying-cause code plus record_condition_1..20
cols = ['icd_code_10th_revision'] + [f'record_condition_{i}' for i in range(1, 21)]

# Filter only columns that exist in the DataFrame
cols = [col for col in cols if col in df.columns]

# Boolean DataFrame: True where a cell (compared as a string) starts with any
# heart-disease prefix. Missing cells become the text "nan" and never match.
mask = df[cols].astype(str).apply(lambda col: col.str.startswith(code_prefixes))

# Keep rows where any column matches
filtered_df = df[mask.any(axis=1)]

filtered_df.to_csv('HD_mortality_final.csv', index=False)

### Adding custom age recode

In [32]:
import pandas as pd
df = pd.read_csv("HD_mortality_final.csv", low_memory=False)

# Right-inclusive bin edges; for whole-number ages the groups are
# 0-24, 25-39, 40-54, 55-69, 70-84 and 85+.
bins = [0, 24, 39, 54, 69, 84, float('inf')]
labels = list(range(6))  # group ids 0..5, one per bin

age_group = pd.cut(
    df['detail_age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# NOTE(review): values pd.cut cannot place (missing or below 0) become NaN,
# which the integer cast is expected to reject — confirm the input has none.
df = df.assign(age_recode_custom=age_group.astype(int))

# The file is updated in place with the new column.
df.to_csv('HD_mortality_final.csv', index=False)

### Remove mortalities < age 25 

In [34]:
import pandas as pd 
df = pd.read_csv("HD_mortality_final.csv", low_memory=False)

# Group 0 is the first bin of the custom recode; keep every other group.
not_youngest_group = df["age_recode_custom"].ne(0)
df = df.loc[not_youngest_group]

df.to_csv('HD_mortality_final.csv', index=False)

## **Substance Use and Heart Disease**

### Filtering for SU and HD as underlying or contributing cause

In [3]:
import pandas as pd

# NOTE(review): this cell reads "mortality_final.csv", whereas the merge cell
# writes "mortality.csv". Nothing visible in this notebook creates
# "mortality_final.csv" — confirm which input is intended.
df = pd.read_csv("mortality_final.csv", low_memory=False)

# Define HD and SU codes separately
hd_codes = [
    'I00', 'I01', 'I02', 'I03', 'I04', 'I05', 'I06', 'I07', 'I08', 'I09', 'I11', 'I13', 
    'I20', 'I21', 'I22', 'I23', 'I24', 'I25', 'I26', 'I27', 'I28', 'I29', 'I30',
    'I31', 'I32', 'I33', 'I34', 'I35', 'I36', 'I37', 'I38', 'I39', 'I40',
    'I41', 'I42', 'I43', 'I44', 'I45', 'I46', 'I47', 'I48', 'I49', 'I50', 'I51'
]

# Note: 'I426' below begins with the HD prefix 'I42', so a single I426 entry
# satisfies both the HD and the SU test on its own.
su_codes = [
    'E244', 'F10', 'G312', 'G621', 'G721', 'I426', 'K70', 'K852', 'K860', 'R780',
    'T51', 'X65', 'Y15', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F18', 'F19',
    'R781', 'R782', 'R783', 'R784', 'T40', 'T423', 'T424', 'T426', 'T427', 'T436',
    'X61', 'X62', 'X64', 'X66', 'Y11', 'Y12', 'Y14', 'Y16', 'T52', 'T53', 'T509',
    'T400', 'T401', 'T402', 'T403', 'T404', 'T406', 'T407', 'T405'
]

# All columns to check: the underlying-cause code plus record_condition_1..20
cols = ['icd_code_10th_revision'] + [f'record_condition_{i}' for i in range(1, 21)]

# Keep only the columns present in the file, as the single-cause filter cells
# do, so a missing record_condition_* column does not raise KeyError.
cols = [col for col in cols if col in df.columns]

# Convert the code columns to strings once and reuse them for both checks.
# Missing cells become the text "nan" and never match a prefix.
code_text = df[cols].astype(str)
hd_prefixes = tuple(hd_codes)
su_prefixes = tuple(su_codes)

# Check for HD codes in any column
has_hd = code_text.apply(lambda x: x.str.startswith(hd_prefixes)).any(axis=1)

# Check for SU codes in any column
has_su = code_text.apply(lambda x: x.str.startswith(su_prefixes)).any(axis=1)

# Select rows where BOTH conditions are met
filtered_df = df[has_hd & has_su]

# Save the filtered DataFrame
filtered_df.to_csv('SU_HD_mortality_final.csv', index=False)


ERROR! Session/line number was not unique in database. History logging moved to new session 46


### Adding custom age recode

In [4]:
import pandas as pd
df = pd.read_csv("SU_HD_mortality_final.csv", low_memory=False)

# Right-inclusive bin edges; for whole-number ages the groups are
# 0-24, 25-39, 40-54, 55-69, 70-84 and 85+.
bins = [0, 24, 39, 54, 69, 84, float('inf')]
labels = list(range(6))  # group ids 0..5, one per bin

age_group = pd.cut(
    df['detail_age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# NOTE(review): values pd.cut cannot place (missing or below 0) become NaN,
# which the integer cast is expected to reject — confirm the input has none.
df = df.assign(age_recode_custom=age_group.astype(int))

# The file is updated in place with the new column.
df.to_csv('SU_HD_mortality_final.csv', index=False)

### Remove mortalities < age 25 

In [6]:
import pandas as pd 
df = pd.read_csv("SU_HD_mortality_final.csv", low_memory=False)

# Group 0 is the first bin of the custom recode; keep every other group.
not_youngest_group = df["age_recode_custom"].ne(0)
df = df.loc[not_youngest_group]

df.to_csv('SU_HD_mortality_final.csv', index=False)