In [1]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pandas.read_csv('output.csv', low_memory=False)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,race,education,month_of_death,day_of_week_of_death,manner,relationship_status,icd_code,cause,group,New_group
0,2195403,93,F,White,4.0,3,1,Natural Causes,Widowed,I25,['Chronic ischemic heart disease'],Heart Disease,Heart Disease
1,1585793,90,F,White,4.0,1,5,Natural Causes,Single,J18,"['Pneumonia, unspecified organism']",Respiratory Condition,Issues Breathing
2,115267,71,F,White,3.0,7,6,Natural Causes,Married,N39,['Other disorders of urinary system'],Bladder Condition,Organ Failure
3,782491,77,F,White,3.0,6,3,Natural Causes,Widowed,C43,['Malignant melanoma of skin'],Cancer,Cancer
4,1809783,79,F,White,3.0,2,3,Natural Causes,Widowed,I21,['ST elevation (STEMI) and non-ST elevation (N...,Heart Disease,Heart Disease


In [5]:
features_cat = ['race', 'sex', 'relationship_status', 'education']
features_num = ['age']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.OneHotEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

In [6]:
X_num_proc = X_num
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [7]:
y = df['New_group']

In [8]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [9]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = random_forest.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)



Confusion matrix:
 [[1398   86 1476  106  249   50]
 [ 305  210 1309   43  100   24]
 [1138  269 2938  160  322   70]
 [ 407   71  920   39   88   24]
 [ 639   85 1007   63  364   66]
 [ 229   52  480   24  134   55]]

Report:
                       precision    recall  f1-score   support

              Cancer       0.34      0.42      0.37      3365
Congenital Anomalies       0.27      0.11      0.15      1991
       Heart Disease       0.36      0.60      0.45      4897
    Issues Breathing       0.09      0.03      0.04      1549
       Organ Failure       0.29      0.16      0.21      2224
               Other       0.19      0.06      0.09       974

            accuracy                           0.33     15000
           macro avg       0.26      0.23      0.22     15000
        weighted avg       0.29      0.33      0.29     15000



In [12]:
import pickle

In [13]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [14]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [16]:
columns = list(X.columns)
columns = columns[1:]
with open('columns', 'wb') as file:
    pickle.dump(columns, file)

In [17]:
predict_data_cat = [['Black', 'M', 'Single', 6], ['White', 'M', 'Widowed', 3], ['Hawaiian', 'F', 'Married', 5]]
predict_data_num = [[20], [72], [36]]

enc1 = preprocessing.OneHotEncoder()
enc1.fit(predict_data_cat)
one_hot1 = enc1.transform(predict_data_cat)
X_cat_processed = pandas.DataFrame(one_hot1.toarray(), columns=enc1.get_feature_names())
query = X_cat_processed.reindex(columns=columns, fill_value=0)
query.head()

Unnamed: 0,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,x0_Samoan,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,1.0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,0.0,0,0,0,0.0,0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,1.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [18]:
s = pandas.DataFrame(predict_data_num, columns=features_num)
query = pandas.concat([s, query], axis=1, sort=False)
query.head()

Unnamed: 0,age,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,20,1.0,0,0,0,0.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,72,0.0,0,0,0,0.0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,36,0.0,0,0,0,1.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [19]:
model2.predict(query)

array(['Other', 'Heart Disease', 'Organ Failure'], dtype=object)