In [404]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [405]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [406]:
df = pandas.read_csv('output.csv', low_memory=False)

In [407]:
df['race'].unique()
# df = df.sample(frac=1)

array(['White', 'Black', 'Korean', 'Vietnamese', 'Indian',
       'Native American', 'Hawaiian', 'Chinese', 'Japanese',
       'other Asian or Pacific Islander', 'Filipino', 'Samoan',
       'Guamanian'], dtype=object)

In [408]:
features_cat = ['race', 'sex', 'relationship_status', 'education']
features_num = ['age']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.OneHotEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

In [416]:
# scaled = preprocessing.scale(X_num, with_mean=True, with_std=True)
X_num_proc = X_num

In [417]:
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [418]:
y = df['New_group']

In [419]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [441]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = random_forest.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)



Confusion matrix:
 [[1361  107 1535  113  262   48]
 [ 279  179 1333   40   85   26]
 [1170  242 2996  116  322   64]
 [ 413   70  970   44   72   18]
 [ 617   80  993   63  341   88]
 [ 237   55  441   30  128   62]]

Report:
                       precision    recall  f1-score   support

              Cancer       0.33      0.40      0.36      3426
Congenital Anomalies       0.24      0.09      0.13      1942
       Heart Disease       0.36      0.61      0.45      4910
    Issues Breathing       0.11      0.03      0.04      1587
       Organ Failure       0.28      0.16      0.20      2182
               Other       0.20      0.07      0.10       953

            accuracy                           0.33     15000
           macro avg       0.26      0.22      0.22     15000
        weighted avg       0.29      0.33      0.29     15000



In [443]:
predict_data_cat = [['Black', 'M', 'Single', 6], ['White', 'M', 'Widowed', 3], ['Hawaiian', 'F', 'Married', 5]]
#['Black', 'M', 'Single'], ['White', 'M', 'Widowed'], ['Hawaiian', 'F', 'Married']
#[20, 6], [72, 3], [36, 5]
predict_data_num = [[20], [72], [36]]

Y = enc.transform(predict_data_cat)
Y = pandas.DataFrame(Y.toarray(), columns=enc.get_feature_names())


# s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(predict_data_num, columns=features_num)
# s = X_num
Y = s.join(Y)
Y.head()

Unnamed: 0,age,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,20,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [444]:
predictions = model.predict(Y)
print(predictions)

['Organ Failure' 'Heart Disease' 'Heart Disease']


In [445]:
import pickle

In [446]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [447]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [448]:
model2.predict(Y)

array(['Organ Failure', 'Heart Disease', 'Heart Disease'], dtype=object)

In [449]:
columns = list(X.columns)
columns = columns[1:]
with open('columns', 'wb') as file:
    pickle.dump(columns, file)

In [450]:
enc1 = preprocessing.OneHotEncoder()
enc1.fit(predict_data_cat)
one_hot1 = enc1.transform(predict_data_cat)
X_cat_processed = pandas.DataFrame(one_hot1.toarray(), columns=enc1.get_feature_names())
query = X_cat_processed.reindex(columns=columns, fill_value=0)
query.head()

Unnamed: 0,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,x0_Samoan,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,1.0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,0.0,0,0,0,0.0,0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,1.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [451]:
s = pandas.DataFrame(predict_data_num, columns=features_num)
query = pandas.concat([s, query], axis=1, sort=False)
query.head()

Unnamed: 0,age,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,...,x2_Widowed,x3_1.0,x3_2.0,x3_3.0,x3_4.0,x3_5.0,x3_6.0,x3_7.0,x3_8.0,x3_9.0
0,20,1.0,0,0,0,0.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0
1,72,0.0,0,0,0,0.0,0,0,0,0,...,1.0,0,0,0,0,0,0,0,0,0
2,36,0.0,0,0,0,1.0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0,0


In [452]:
model2.predict(query)

array(['Other', 'Issues Breathing', 'Heart Disease'], dtype=object)