In [2]:
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pandas.read_csv('death_causes.csv', low_memory=False)

In [5]:
df.shape

(100000, 12)

In [6]:
features_cat = ['race', 'sex', 'relationship_status']
features_num = ['age', 'education']

X_cat = df[features_cat]
X_num = df[features_num]
enc = preprocessing.LabelEncoder()
enc.fit(X_cat)
one_hot = enc.transform(X_cat)
X_cat_proc = pandas.DataFrame(one_hot.toarray(), columns=enc.get_feature_names())

ValueError: bad input shape (100000, 3)

In [7]:
scaled = preprocessing.scale(X_num)
X_num_proc = pandas.DataFrame(scaled, columns=features_num)

In [40]:
X = pandas.concat([X_num_proc, X_cat_proc], axis=1, sort=False)
X = X.fillna(0)

In [9]:
y = df['group']

In [10]:
X_train, X_TEMP, y_train, y_TEMP = train_test_split(X, y, test_size=0.30) # split out into training 70% of our data
X_validation, X_test, y_validation, y_test = train_test_split(X_TEMP, y_TEMP, test_size=0.50) # split out into validation 15% of our data and test 15% of our data

In [28]:
# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    # target_names = ['denied', 'approved']
    print('\nReport:\n', classification_report(y_true, y_pred))

    
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=250)
decision_tree_classifier = DecisionTreeClassifier()
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
random_forest = RandomForestClassifier(bootstrap=True);

model = decision_tree_classifier.fit(X_train, y_train) # first fit (train) the model
y_pred = model.predict(X_validation) # next get the model's predictions for a sample in the validation set
metrics(y_validation, y_pred)

Confusion matrix:
 [[ 5  6  6 ...  0  0  0]
 [ 3 28  0 ...  0  3  0]
 [ 0  0 42 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 2 17  1 ...  0  7  0]
 [ 0  1  3 ...  0  0  0]]

Report:
                                    precision    recall  f1-score   support

                         Accident       0.08      0.05      0.06        99
                           Asthma       0.07      0.03      0.04       921
                  Birth Condition       0.55      0.91      0.68        46
                Bladder Condition       0.00      0.00      0.00        70
                           Cancer       0.31      0.54      0.40      3371
             Congenital Anomalies       0.09      0.06      0.07        54
                         Diabetes       0.09      0.03      0.04       809
   Diseases of the nervous system       0.12      0.04      0.06      1045
Disturbance of Behavior or Senses       0.00      0.00      0.00         1
                         Drug Use       0.32      0.28      0.30

  'precision', 'predicted', average, warn_for)


In [29]:
predict_data_cat = [['Black', 'M', 'Single'], ['White', 'M', 'Widowed'], ['Hawaiian', 'F', 'Married']]
predict_data_num = [[20, 6], [72, 3], [36, 5]]

Y = enc.transform(predict_data_cat)
Y = pandas.DataFrame(Y.toarray(), columns=enc.get_feature_names())

s = preprocessing.scale(predict_data_num)
s = pandas.DataFrame(s, columns=features_num)
Y = pandas.concat([s, Y], axis=1, sort=False)

In [30]:
predictions = model.predict(Y)
print(predictions)

['Fall' 'Heart Disease' 'Cancer']


In [31]:
import pickle

In [34]:
with open('deaths_model', 'wb') as f:
    pickle.dump(model, f)

In [37]:
with open('deaths_model', 'rb') as f:
    model2 = pickle.load(f)

In [38]:
model2.predict(Y)

array(['Fall', 'Heart Disease', 'Cancer'], dtype=object)