In [55]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [57]:
df = pandas.read_csv('output.csv', low_memory=False)

In [72]:
df.columns

Index(['Unnamed: 0', 'age', 'sex', 'race', 'education', 'month_of_death',
       'day_of_week_of_death', 'manner', 'relationship_status', 'icd_code',
       'cause', 'group', 'New_group'],
      dtype='object')

In [59]:
features_cat = ['race', 'sex', 'relationship_status', 'education']
features_num = ['age']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.OneHotEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

In [60]:
X_num_proc = X_num
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [61]:
y = df['New_group']

In [62]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [63]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = logistic_regression.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)

  'precision', 'predicted', average, warn_for)


Confusion matrix:
 [[1201    2 2143    0   73    0]
 [ 211    8 1723    0   98    0]
 [ 869   11 3872    0  105    0]
 [ 249    2 1235    0   18    0]
 [ 655    4 1361    0  226    0]
 [ 219    1  573    0  141    0]]

Report:
                       precision    recall  f1-score   support

              Cancer       0.35      0.35      0.35      3419
Congenital Anomalies       0.29      0.00      0.01      2040
       Heart Disease       0.36      0.80      0.49      4857
    Issues Breathing       0.00      0.00      0.00      1504
       Organ Failure       0.34      0.10      0.16      2246
               Other       0.00      0.00      0.00       934

            accuracy                           0.35     15000
           macro avg       0.22      0.21      0.17     15000
        weighted avg       0.29      0.35      0.26     15000



In [64]:
import pickle

In [65]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [66]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [67]:
columns = list(X.columns)
columns = columns[1:]
with open('columns', 'wb') as file:
    pickle.dump(columns, file)

['x0_Black',
 'x0_Chinese',
 'x0_Filipino',
 'x0_Guamanian',
 'x0_Hawaiian',
 'x0_Indian',
 'x0_Japanese',
 'x0_Korean',
 'x0_Native American',
 'x0_Samoan',
 'x0_Vietnamese',
 'x0_White',
 'x0_other Asian or Pacific Islander',
 'x1_F',
 'x1_M',
 'x2_Divorced',
 'x2_Married',
 'x2_Single',
 'x2_Unkown',
 'x2_Widowed',
 'x3_1.0',
 'x3_2.0',
 'x3_3.0',
 'x3_4.0',
 'x3_5.0',
 'x3_6.0',
 'x3_7.0',
 'x3_8.0',
 'x3_9.0']

In [68]:
predict_data_cat = [['Black', 'M', 'Single', 6], ['White', 'M', 'Widowed', 3], ['Hawaiian', 'F', 'Married', 5]]
predict_data_num = [[20], [72], [36]]

enc1 = preprocessing.OneHotEncoder()
enc1.fit(predict_data_cat)
one_hot1 = enc1.transform(predict_data_cat)
X_cat_processed = pandas.DataFrame(one_hot1.toarray(), columns=enc1.get_feature_names())
query = X_cat_processed.reindex(columns=columns, fill_value=0)
query.head()

Unnamed: 0,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,x0_Samoan,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,1.0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,0.0,0,0,0,0.0,0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,1.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [69]:
s = pandas.DataFrame(predict_data_num, columns=features_num)
query = pandas.concat([s, query], axis=1, sort=False)
query.head()

Unnamed: 0,age,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,20,1.0,0,0,0,0.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,72,0.0,0,0,0,0.0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,36,0.0,0,0,0,1.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [70]:
model2.predict(query)

array(['Organ Failure', 'Heart Disease', 'Cancer'], dtype=object)