## Model 1

Columns (Model 1, Base model)\
*X_columns*
- resident_status
- education_2003_revision
- sex
- age_recode_27
- maritial_status
- race
- hispanic_origin

*y_column (Label)*
- month_of_death

In [1]:
%config Completer.use_jedi = False

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
import os

In [3]:
data_2005 = pd.read_csv('../data/2005_data.csv')

# remove columns with entity and record condition except entity_condition_1, entity_condition_2 and entity_condition_3
data_2005_revised = data_2005.drop(columns=['entity_condition_2', 'entity_condition_3', 'entity_condition_4',
       'entity_condition_5', 'entity_condition_6', 'entity_condition_7',
       'entity_condition_8', 'entity_condition_9', 'entity_condition_10',
       'entity_condition_11', 'entity_condition_12', 'entity_condition_13',
       'entity_condition_14', 'entity_condition_15', 'entity_condition_16',
       'entity_condition_17', 'entity_condition_18', 'entity_condition_19',
       'entity_condition_20', 'record_condition_2', 'record_condition_3',
       'record_condition_4', 'record_condition_5', 'record_condition_6',
       'record_condition_7', 'record_condition_8', 'record_condition_9',
       'record_condition_10', 'record_condition_11', 'record_condition_12',
       'record_condition_13', 'record_condition_14', 'record_condition_15',
       'record_condition_16', 'record_condition_17', 'record_condition_18',
       'record_condition_19', 'record_condition_20'])

# removing education 1989 education revised bc 2003 education revision has the most up to date information
try:
    data_2005_revised = data_2005_revised.drop(columns=['education_1989_revision'])
except:
    pass

# removing column for age_sub flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['age_substitution_flag'])
except:
    pass

# remove infant age_recode
try:
    data_2005_revised = data_2005_revised.drop(columns=['infant_age_recode_22'])
except:
    pass

# remove place_of_injury_for_causes_w00_y34_except_y06_and_y07_
try:
    data_2005_revised = data_2005_revised.drop(columns=['place_of_injury_for_causes_w00_y34_except_y06_and_y07_'])
except:
    pass

# remove 130_infant_cause_recode
try:
    data_2005_revised = data_2005_revised.drop(columns=['130_infant_cause_recode'])
except:
    pass

# remove bridged race flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['bridged_race_flag'])
except:
    pass

# Race imputation flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['race_imputation_flag'])
except:
    pass

# replace NaN values with '9' for education column

data_2005_revised['education_2003_revision'] = data_2005_revised['education_2003_revision'].fillna(9)

# replace manner of death NaN values with 0 for "not specified"

data_2005_revised['manner_of_death'] = data_2005_revised['manner_of_death'].fillna(0)

# replace activity code with 10 for NaN values

data_2005_revised['activity_code'] = data_2005_revised['activity_code'].fillna(10)

# getting desired columns
data_2005_revised = data_2005_revised[['resident_status', 'education_2003_revision', 'sex', 'age_recode_27', 'marital_status', 'race', 'hispanic_origin', 'month_of_death']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Changing the label to categorical to work with logistic regression

data_2005_revised = data_2005_revised.astype({'month_of_death': 'category', 'education_2003_revision': 'int64'})

In [5]:
data_2005_revised.dtypes

resident_status               int64
education_2003_revision       int64
sex                          object
age_recode_27                 int64
marital_status               object
race                          int64
hispanic_origin               int64
month_of_death             category
dtype: object

In [6]:
# one hot encode marital stautus
marital_dummies = pd.get_dummies(data_2005_revised.marital_status, prefix='marital_')

In [7]:
data_2005_revised = pd.concat([data_2005_revised, marital_dummies])
data_2005_revised = data_2005_revised.drop(columns=['marital_status'])

In [8]:
data_2005_revised.dtypes

resident_status             float64
education_2003_revision     float64
sex                          object
age_recode_27               float64
race                        float64
hispanic_origin             float64
month_of_death             category
marital__D                  float64
marital__M                  float64
marital__S                  float64
marital__U                  float64
marital__W                  float64
dtype: object

In [10]:
# binary encode sex column
data_2005_revised.sex.value_counts()

F    1241896
M    1210610
Name: sex, dtype: int64

In [11]:
data_2005_revised['sex'] = data_2005_revised['sex'].map(lambda x : 1 if x == 'M' else 0)

In [12]:
data_2005_revised.dtypes

resident_status             float64
education_2003_revision     float64
sex                           int64
age_recode_27               float64
race                        float64
hispanic_origin             float64
month_of_death             category
marital__D                  float64
marital__M                  float64
marital__S                  float64
marital__U                  float64
marital__W                  float64
dtype: object

### Train logistic regression model

In [13]:
#TODO