In [147]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [148]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [149]:
df = pandas.read_csv('output.csv', low_memory=False)

In [150]:
df.shape

(100000, 13)

In [151]:
features_cat = ['race', 'sex', 'relationship_status']
features_num = ['age', 'education']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.OneHotEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

In [152]:
scaled = preprocessing.scale(X_num, with_mean=True, with_std=True)
X_num_proc = pandas.DataFrame(scaled, columns=features_num)

In [153]:
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [154]:
y = df['New_group']

In [155]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [156]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = logistic_regression.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)

Confusion matrix:
 [[   1   93    0    0    0   70  106    0    0    0    0]
 [   0 1334    0    0    0   20 2219    0    0    1    0]
 [   0   51    0    0    0   72  812    0    0    0    0]
 [   0  221    0    0    0   11  624    0    0    0    0]
 [   0  164    0    0    0    9  839    0    0    0    0]
 [   0  208    0    0    0   93  189    0    0    0    0]
 [   1  862    0    0    0   22 3905    0    0    0    0]
 [   0   67    0    0    0    1  208    0    0    0    0]
 [   0  266    0    0    0   14 1224    0    0    0    0]
 [   0  220    0    0    0    8  658    0    0    0    0]
 [   1   90    0    0    0    9  307    0    0    0    0]]


  'precision', 'predicted', average, warn_for)



Report:
                                 precision    recall  f1-score   support

                      Accident       0.33      0.00      0.01       270
                        Cancer       0.37      0.37      0.37      3574
          Congenital Anomalies       0.00      0.00      0.00       935
                      Diabetes       0.00      0.00      0.00       856
Diseases of the nervous system       0.00      0.00      0.00      1012
                      Drug Use       0.28      0.19      0.23       490
                 Heart Disease       0.35      0.82      0.49      4790
                     Infection       0.00      0.00      0.00       276
              Issues Breathing       0.00      0.00      0.00      1504
                 Organ Failure       0.00      0.00      0.00       886
                         Other       0.00      0.00      0.00       407

                      accuracy                           0.36     15000
                     macro avg       0.12      0.13 

In [96]:
predict_data_cat = [['Hawaiian', 'F', 'Married']]
#['Black', 'M', 'Single'], ['White', 'M', 'Widowed'], ['Hawaiian', 'F', 'Married']
#[20, 6], [72, 3], [36, 5]
predict_data_num = [[36, 5]]

Y = enc.transform(predict_data_cat)
Y = pandas.DataFrame(Y.toarray(), columns=enc.get_feature_names())

s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(s, columns=features_num)
Y = pandas.concat([s, Y], axis=1, sort=False)

In [97]:
predictions = model.predict(Y)
print(predictions)

['Heart Disease']


In [98]:
import pickle

In [99]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [100]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [101]:
model2.predict(Y)

array(['Heart Disease'], dtype='<U30')

In [102]:
columns = list(X.columns)
columns = columns[2:]
with open('columns', 'wb') as file:
    pickle.dump(columns, file)

In [103]:
# query = pandas.get_dummies(pandas.DataFrame(predict_data_cat))
enc1 = preprocessing.OneHotEncoder()
enc1.fit(predict_data_cat)
one_hot1 = enc1.transform(predict_data_cat)
X_cat_processed = pandas.DataFrame(one_hot1.toarray(), columns=enc1.get_feature_names())
# X_cat_processed.head()
query = X_cat_processed.reindex(columns=columns, fill_value=0)
query.head()

Unnamed: 0,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,x0_Samoan,x0_Vietnamese,x0_White,x0_other Asian or Pacific Islander,x1_F,x1_M,x2_Divorced,x2_Married,x2_Single,x2_Unkown,x2_Widowed
0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,1.0,0,0,0


In [104]:
s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(s, columns=features_num)
query = pandas.concat([s, query], axis=1, sort=False)
query.head()

Unnamed: 0,age,education,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,...,x0_Vietnamese,x0_White,x0_other Asian or Pacific Islander,x1_F,x1_M,x2_Divorced,x2_Married,x2_Single,x2_Unkown,x2_Widowed
0,0.0,0.0,0,0,0,0,1.0,0,0,0,...,0,0,0,1.0,0,0,1.0,0,0,0


In [105]:
model2.predict(query)

array(['Heart Disease'], dtype='<U30')