In [68]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [70]:
df = pandas.read_csv('output.csv', low_memory=False)

In [71]:
df.shape

(100000, 13)

In [72]:
features_cat = ['race', 'sex', 'relationship_status']
features_num = ['age', 'education']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.OneHotEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

In [73]:
scaled = preprocessing.scale(X_num, with_mean=True, with_std=True)
X_num_proc = pandas.DataFrame(scaled, columns=features_num)

In [74]:
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [75]:
y = df['New_group']

In [76]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [77]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = lda.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)

  'precision', 'predicted', average, warn_for)


Confusion matrix:
 [[  18   90    0    0    0   92    0  114    0    0    0    1    0    0
     0]
 [   7 1244    0    0    0   57    0 2082    0    0    0    1    0    0
     0]
 [  14   41    0    0    0   65    0  800    0    0    0    0    0    0
     0]
 [   2  212    0    0    0   38    0  582    0    0    0    1    0    0
     0]
 [   0  152    0    0    0   22    0  852    0    0    0    0    0    0
     0]
 [  12  203    0    0    0  179    0  147    0    0    0    0    0    0
     0]
 [   1   26    0    0    0    5    0  170    0    0    0    0    0    0
     0]
 [  12  816    0    0    0   77    0 3866    0    2    0    3    0    0
     0]
 [   0    0    0    0    0    0    0    1    0    0    0    0    0    0
     0]
 [   1   60    0    0    0    8    0  227    0    0    0    0    0    0
     0]
 [   5  279    0    0    0   19    0 1237    0    0    0    1    0    0
     0]
 [   0   10    0    0    0    4    0   15    0    0    0    0    0    0
     0]
 [   0    0    0    0

In [37]:
predict_data_cat = [['Hawaiian', 'F', 'Married']]
#['Black', 'M', 'Single'], ['White', 'M', 'Widowed'], ['Hawaiian', 'F', 'Married']
#[20, 6], [72, 3], [36, 5]
predict_data_num = [[36, 5]]

Y = enc.transform(predict_data_cat)
Y = pandas.DataFrame(Y.toarray(), columns=enc.get_feature_names())

s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(s, columns=features_num)
Y = pandas.concat([s, Y], axis=1, sort=False)

In [38]:
predictions = model.predict(Y)
print(predictions)

['Medical Care Error']


In [35]:
import pickle

In [36]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [26]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [27]:
model2.predict(Y)

array(['Heart Disease', 'Heart Disease', 'Medical Care Error'],
      dtype='<U33')

In [28]:
columns = list(X.columns)
columns = columns[2:]
with open('columns', 'wb') as file:
    pickle.dump(columns, file)

In [29]:
# query = pandas.get_dummies(pandas.DataFrame(predict_data_cat))
enc1 = preprocessing.OneHotEncoder()
enc1.fit(predict_data_cat)
one_hot1 = enc1.transform(predict_data_cat)
X_cat_processed = pandas.DataFrame(one_hot1.toarray(), columns=enc1.get_feature_names())
# X_cat_processed.head()
query = X_cat_processed.reindex(columns=columns, fill_value=0)
query.head()

Unnamed: 0,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,x0_Native American,x0_Samoan,x0_Vietnamese,x0_White,x0_other Asian or Pacific Islander,x1_F,x1_M,x2_Divorced,x2_Married,x2_Single,x2_Unkown,x2_Widowed
0,1.0,0,0,0,0.0,0,0,0,0,0,0,0.0,0,0.0,1.0,0,0.0,1.0,0,0.0
1,0.0,0,0,0,0.0,0,0,0,0,0,0,1.0,0,0.0,1.0,0,0.0,0.0,0,1.0
2,0.0,0,0,0,1.0,0,0,0,0,0,0,0.0,0,1.0,0.0,0,1.0,0.0,0,0.0


In [30]:
s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(s, columns=features_num)
query = pandas.concat([s, query], axis=1, sort=False)
query.head()

Unnamed: 0,age,education,x0_Black,x0_Chinese,x0_Filipino,x0_Guamanian,x0_Hawaiian,x0_Indian,x0_Japanese,x0_Korean,...,x0_Vietnamese,x0_White,x0_other Asian or Pacific Islander,x1_F,x1_M,x2_Divorced,x2_Married,x2_Single,x2_Unkown,x2_Widowed
0,-1.042337,1.069045,1.0,0,0,0,0.0,0,0,0,...,0,0.0,0,0.0,1.0,0,0.0,1.0,0,0.0
1,1.348907,-1.336306,0.0,0,0,0,0.0,0,0,0,...,0,1.0,0,0.0,1.0,0,0.0,0.0,0,1.0
2,-0.30657,0.267261,0.0,0,0,0,1.0,0,0,0,...,0,0.0,0,1.0,0.0,0,1.0,0.0,0,0.0


In [31]:
model2.predict(query)

array(['Heart Disease', 'Heart Disease', 'Medical Care Error'],
      dtype='<U33')