## Model 1

Columns (Model 1, Base model)\
*X_columns*
- resident_status
- education_2003_revision
- sex
- age_recode_27
- maritial_status
- race
- hispanic_origin

*y_column (Label)*
- month_of_death

In [1]:
%config Completer.use_jedi = False

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
import os

In [66]:
data_2005 = pd.read_csv('../data/2005_data.csv')

# remove columns with entity and record condition except entity_condition_1, entity_condition_2 and entity_condition_3
data_2005_revised = data_2005.drop(columns=['entity_condition_2', 'entity_condition_3', 'entity_condition_4',
       'entity_condition_5', 'entity_condition_6', 'entity_condition_7',
       'entity_condition_8', 'entity_condition_9', 'entity_condition_10',
       'entity_condition_11', 'entity_condition_12', 'entity_condition_13',
       'entity_condition_14', 'entity_condition_15', 'entity_condition_16',
       'entity_condition_17', 'entity_condition_18', 'entity_condition_19',
       'entity_condition_20', 'record_condition_2', 'record_condition_3',
       'record_condition_4', 'record_condition_5', 'record_condition_6',
       'record_condition_7', 'record_condition_8', 'record_condition_9',
       'record_condition_10', 'record_condition_11', 'record_condition_12',
       'record_condition_13', 'record_condition_14', 'record_condition_15',
       'record_condition_16', 'record_condition_17', 'record_condition_18',
       'record_condition_19', 'record_condition_20'])

# removing education 1989 education revised bc 2003 education revision has the most up to date information
try:
    data_2005_revised = data_2005_revised.drop(columns=['education_1989_revision'])
except:
    pass

# removing column for age_sub flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['age_substitution_flag'])
except:
    pass

# remove infant age_recode
try:
    data_2005_revised = data_2005_revised.drop(columns=['infant_age_recode_22'])
except:
    pass

# remove place_of_injury_for_causes_w00_y34_except_y06_and_y07_
try:
    data_2005_revised = data_2005_revised.drop(columns=['place_of_injury_for_causes_w00_y34_except_y06_and_y07_'])
except:
    pass

# remove 130_infant_cause_recode
try:
    data_2005_revised = data_2005_revised.drop(columns=['130_infant_cause_recode'])
except:
    pass

# remove bridged race flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['bridged_race_flag'])
except:
    pass

# Race imputation flag
try:
    data_2005_revised = data_2005_revised.drop(columns=['race_imputation_flag'])
except:
    pass

# replace NaN values with '9' for education column

data_2005_revised['education_2003_revision'] = data_2005_revised['education_2003_revision'].fillna(9)

# replace manner of death NaN values with 0 for "not specified"

data_2005_revised['manner_of_death'] = data_2005_revised['manner_of_death'].fillna(0)

# replace activity code with 10 for NaN values

data_2005_revised['activity_code'] = data_2005_revised['activity_code'].fillna(10)

# getting desired columns
data_2005_revised = data_2005_revised[['resident_status', 'education_2003_revision', 'sex', 'age_recode_27', 'marital_status', 'race', 'hispanic_origin', 'month_of_death']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [67]:
# Changing the label to categorical to work with logistic regression

data_2005_revised = data_2005_revised.astype({'education_2003_revision': 'int64'})

In [68]:
data_2005_revised.dtypes

resident_status             int64
education_2003_revision     int64
sex                        object
age_recode_27               int64
marital_status             object
race                        int64
hispanic_origin             int64
month_of_death              int64
dtype: object

In [69]:
# one hot encode marital stautus
data_2005_revised_one_hot = pd.get_dummies(data_2005_revised, prefix='marital_', columns=['marital_status'])

In [70]:
data_2005_revised_one_hot.isna().sum()

resident_status            0
education_2003_revision    0
sex                        0
age_recode_27              0
race                       0
hispanic_origin            0
month_of_death             0
marital__D                 0
marital__M                 0
marital__S                 0
marital__U                 0
marital__W                 0
dtype: int64

In [71]:
# binary encode sex column
data_2005_revised_one_hot.sex.value_counts()

F    1241896
M    1210610
Name: sex, dtype: int64

In [72]:
data_2005_revised_one_hot['sex'] = data_2005_revised_one_hot['sex'].map(lambda x : 1 if x == 'M' else 0)

In [73]:
data_2005_revised_one_hot.dtypes

resident_status            int64
education_2003_revision    int64
sex                        int64
age_recode_27              int64
race                       int64
hispanic_origin            int64
month_of_death             int64
marital__D                 uint8
marital__M                 uint8
marital__S                 uint8
marital__U                 uint8
marital__W                 uint8
dtype: object

### Train logistic regression model

In [74]:
X = data_2005_revised_one_hot[['resident_status', 'education_2003_revision', 'sex', 'age_recode_27', 'race', 'hispanic_origin', 'marital__D',
                      'marital__M', 'marital__S', 'marital__U', 'marital__W']]
y = data_2005_revised_one_hot['month_of_death']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [75]:
X_train.head(3)

Unnamed: 0,resident_status,education_2003_revision,sex,age_recode_27,race,hispanic_origin,marital__D,marital__M,marital__S,marital__U,marital__W
1507607,1,9,1,18,1,100,0,1,0,0,0
1755102,1,6,0,24,1,100,0,1,0,0,0
691367,2,9,0,22,1,100,0,0,0,0,1


In [60]:
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression(solver='sag', verbose=1, max_iter=1000)

logisticRegr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


9234
Epoch 43, change: 0.01132628
Epoch 44, change: 0.01107225
Epoch 45, change: 0.01082734
Epoch 46, change: 0.01058838
Epoch 47, change: 0.01035863
Epoch 48, change: 0.01013665
Epoch 49, change: 0.00991989
Epoch 50, change: 0.00971218
Epoch 51, change: 0.00950947
Epoch 52, change: 0.00931050
Epoch 53, change: 0.00912033
Epoch 54, change: 0.00893527
Epoch 55, change: 0.00875344
Epoch 56, change: 0.00857821
Epoch 57, change: 0.00840819
Epoch 58, change: 0.00824243
Epoch 59, change: 0.00808107
Epoch 60, change: 0.00792322
Epoch 61, change: 0.00777179
Epoch 62, change: 0.00762286
Epoch 63, change: 0.00747799
Epoch 64, change: 0.00733561
Epoch 65, change: 0.00719764
Epoch 66, change: 0.00706266
Epoch 67, change: 0.00693078
Epoch 68, change: 0.00680391
Epoch 69, change: 0.00667950
Epoch 70, change: 0.00655575
Epoch 71, change: 0.00643699
Epoch 72, change: 0.00632038
Epoch 73, change: 0.00620711
Epoch 74, change: 0.00609590
Epoch 75, change: 0.00598596
Epoch 76, change: 0.00592704
Epoch 77,

Epoch 357, change: 0.00189501
Epoch 358, change: 0.00188623
Epoch 359, change: 0.00187744
Epoch 360, change: 0.00186869
Epoch 361, change: 0.00186001
Epoch 362, change: 0.00185143
Epoch 363, change: 0.00184286
Epoch 364, change: 0.00183436
Epoch 365, change: 0.00182598
Epoch 366, change: 0.00181761
Epoch 367, change: 0.00180922
Epoch 368, change: 0.00180088
Epoch 369, change: 0.00179264
Epoch 370, change: 0.00178447
Epoch 371, change: 0.00177641
Epoch 372, change: 0.00176827
Epoch 373, change: 0.00176017
Epoch 374, change: 0.00175219
Epoch 375, change: 0.00174428
Epoch 376, change: 0.00173632
Epoch 377, change: 0.00172847
Epoch 378, change: 0.00172061
Epoch 379, change: 0.00171285
Epoch 380, change: 0.00170513
Epoch 381, change: 0.00169750
Epoch 382, change: 0.00168983
Epoch 383, change: 0.00168225
Epoch 384, change: 0.00167472
Epoch 385, change: 0.00166724
Epoch 386, change: 0.00165984
Epoch 387, change: 0.00165242
Epoch 388, change: 0.00164505
Epoch 389, change: 0.00163775
Epoch 390,

Epoch 631, change: 0.00063373
Epoch 632, change: 0.00063147
Epoch 633, change: 0.00062923
Epoch 634, change: 0.00062703
Epoch 635, change: 0.00062482
Epoch 636, change: 0.00062263
Epoch 637, change: 0.00062044
Epoch 638, change: 0.00061826
Epoch 639, change: 0.00061610
Epoch 640, change: 0.00061392
Epoch 641, change: 0.00061175
Epoch 642, change: 0.00060961
Epoch 643, change: 0.00060746
Epoch 644, change: 0.00060532
Epoch 645, change: 0.00060321
Epoch 646, change: 0.00060108
Epoch 647, change: 0.00059899
Epoch 648, change: 0.00059688
Epoch 649, change: 0.00059480
Epoch 650, change: 0.00059269
Epoch 651, change: 0.00059063
Epoch 652, change: 0.00058857
Epoch 653, change: 0.00058651
Epoch 654, change: 0.00058447
Epoch 655, change: 0.00058245
Epoch 656, change: 0.00058042
Epoch 657, change: 0.00057837
Epoch 658, change: 0.00057635
Epoch 659, change: 0.00057433
Epoch 660, change: 0.00057234
Epoch 661, change: 0.00057037
Epoch 662, change: 0.00056838
Epoch 663, change: 0.00056642
Epoch 664,

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 21.1min finished


LogisticRegression(max_iter=1000, solver='sag', verbose=1)

In [61]:
score = logisticRegr.score(X_test, y_test)
print(score)

0.09541878622608661


In [40]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

m = metrics.confusion_matrix(y_test, predictions)
print(cm)

### Train cat boost model
Due to having many categorical values in the dataset, we can try boosting instead of logistic regression

In [77]:
data_2005_revised.head(2)

Unnamed: 0,resident_status,education_2003_revision,sex,age_recode_27,marital_status,race,hispanic_origin,month_of_death
0,1,9,F,15,M,1,100,1
1,1,9,M,18,D,1,100,1


In [78]:
data_2005_revised = data_2005_revised.astype({'resident_status': 'category', 'education_2003_revision': 'category',
                                             'sex': 'category', 'age_recode_27': 'category', 'marital_status': 'category',
                                             'race': 'category', 'hispanic_origin': 'category', 'month_of_death': 'category'})

In [79]:
data_2005_revised.dtypes

resident_status            category
education_2003_revision    category
sex                        category
age_recode_27              category
marital_status             category
race                       category
hispanic_origin            category
month_of_death             category
dtype: object

In [82]:
X = data_2005_revised[['resident_status', 'education_2003_revision', 'sex', 'age_recode_27', 'race', 'hispanic_origin', 'marital_status']]
y = data_2005_revised['month_of_death']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
import numpy as np
from catboost import CatBoostClassifier

cat_features = list(X_train.columns)

model = CatBoostClassifier(iterations=20, depth=2, learning_rate=1, loss_function='MultiClass', verbose=True, cat_features=cat_features)

model.fit(X_train, y_train)

In [88]:
model.score(X_train, y_train)

0.09604918271229124