In [1]:
import pandas as pd

In [2]:
# Load the Haberman survival data. `names` supplies the four column labels,
# and index_col=False stops pandas from using a data column as the index.
data = pd.read_csv(
    'haberman.csv',
    names=[
        'age',
        'operation_year',
        'positive_axillary_nodes_detected',
        'survival_status',
    ],
    index_col=False,
)

In [3]:
# Show the loaded frame to check the four named columns.
data

Unnamed: 0,age,operation_year,positive_axillary_nodes_detected,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [4]:
# Optional relabelling of survival_status (1 -> 'survived', 2 -> 'died').
# NOTE(review): presumably left disabled because the scorers defined further
# down are built with integer pos_label=1 / pos_label=2 -- confirm before
# re-enabling, since string labels would no longer match those values.
#survival_status = {1:'survived', 2:'died'}
#data['survival_status'] = data.survival_status.map(survival_status)

In [5]:
# Display the frame again; survival_status still holds the numeric 1/2 codes
# because the mapping in the previous cell is commented out.
data

Unnamed: 0,age,operation_year,positive_axillary_nodes_detected,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [6]:
# Feature matrix: the three predictor columns.
X = data[['age', 'operation_year', 'positive_axillary_nodes_detected']]
# Target vector: the survival_status label column.
y = data['survival_status']

In [7]:
def getPrecision(tp, fp):
    """Return precision, tp / (tp + fp).

    Parameters:
        tp: number of true positives.
        fp: number of false positives.

    Returns:
        The fraction of positive predictions that were correct, or 0.0 when
        the division raises ZeroDivisionError (no positive predictions at
        all, i.e. tp + fp == 0 for plain Python numbers).
    """
    try:
        return tp / (tp + fp)
    except ZeroDivisionError:
        # Undefined precision: report 0.0 instead of crashing the caller.
        return 0.0

In [8]:
def getRecall(tp, fn):
    """Return recall (sensitivity), tp / (tp + fn).

    Parameters:
        tp: number of true positives.
        fn: number of false negatives.

    Returns:
        The fraction of actual positives that were found, or 0.0 when the
        division raises ZeroDivisionError (no actual positives, i.e.
        tp + fn == 0 for plain Python numbers).
    """
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        # Undefined recall: report 0.0 instead of crashing the caller.
        return 0.0

In [9]:
def getF1Score(p, r):
    """Return the F1 score, the harmonic mean of precision and recall.

    Parameters:
        p: precision.
        r: recall.

    Returns:
        2 * p * r / (p + r), or 0.0 when the division raises
        ZeroDivisionError (p + r == 0 for plain Python numbers).
    """
    try:
        return (2 * (p * r)) / (p + r)
    except ZeroDivisionError:
        # Precision and recall are both zero: F1 is reported as 0.0.
        return 0.0

In [10]:
def getSpecificity(tn, fp):
    """Return specificity (true-negative rate), tn / (tn + fp).

    Parameters:
        tn: number of true negatives.
        fp: number of false positives.

    Returns:
        The fraction of actual negatives that were correctly rejected, or
        0.0 when the division raises ZeroDivisionError (no actual negatives,
        i.e. tn + fp == 0 for plain Python numbers).
    """
    try:
        return tn / (tn + fp)
    except ZeroDivisionError:
        # Undefined specificity: report 0.0 instead of crashing the caller.
        return 0.0


In [11]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score

In [12]:
# Recall with each class taken as the positive label in turn:
# label 1 is reported as "sensitivity", label 2 as "specificity".
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=2)

# Metric name -> scorer; cross_validate reports each one as 'test_<name>'.
scoring = dict(
    accuracy='accuracy',
    precision='precision',
    recall='recall',
    sensitivity=sensitivity,
    specificity=specificity,
    f1='f1',
)

Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Seed the tree so the cross-validation scores are reproducible from run to
# run, consistent with the random_state=0 used for the logistic-regression
# and random-forest models in this notebook.
dt_classifier = DecisionTreeClassifier(random_state=0)

In [15]:
# 10-fold cross-validation of the decision tree on every metric in `scoring`.
dt_scores = cross_validate(
    dt_classifier, X, y,
    scoring=scoring,
    cv=10,
)

In [16]:
# Report the mean of each cross-validated metric for the decision tree.
for template, key in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % dt_scores[key].mean())


Precision: 0.734510210282706
Recall: 0.7221343873517787
Sensitivity: 0.7221343873517787
Specificity: 0.30277777777777776
F1: 0.7187873648565695
Accuracy: 0.6120430107526882


Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

In [18]:
# Gaussian naive Bayes classifier, constructed with its default settings.
gnb_classifier = GaussianNB()

In [19]:
# 10-fold cross-validation of naive Bayes on every metric in `scoring`.
gnb_scores = cross_validate(
    gnb_classifier, X, y,
    scoring=scoring,
    cv=10,
)

In [20]:
# Report the mean of each cross-validated metric for naive Bayes.
for template, key in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % gnb_scores[key].mean())

Precision: 0.7703470142513524
Recall: 0.9464426877470355
Sensitivity: 0.9464426877470355
Specificity: 0.20972222222222223
F1: 0.848352996153908
Accuracy: 0.7519354838709678


Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic regression with a fixed seed so repeated runs give the same fit.
lr_classifier = LogisticRegression(random_state=0)

In [23]:
# 10-fold cross-validation of logistic regression on every metric in `scoring`.
lr_scores = cross_validate(
    lr_classifier, X, y,
    scoring=scoring,
    cv=10,
)

In [24]:
# Report the mean of each cross-validated metric for logistic regression.
for template, key in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % lr_scores[key].mean())

Precision: 0.7603994330968747
Recall: 0.9549407114624506
Sensitivity: 0.9549407114624506
Specificity: 0.15972222222222224
F1: 0.8460620797873615
Accuracy: 0.7450537634408603


Support Vector Machine

In [25]:
from sklearn import svm

In [26]:
# Support vector classifier with a linear kernel and regularisation C=1.
svm_classifier = svm.SVC(C=1, kernel='linear')

In [27]:
# 10-fold cross-validation of the linear SVM on every metric in `scoring`.
svm_scores = cross_validate(
    svm_classifier, X, y,
    scoring=scoring,
    cv=10,
)

In [28]:
# Report the mean of each cross-validated metric for the linear SVM.
for template, key in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % svm_scores[key].mean())

Precision: 0.7377594152232639
Recall: 0.9865612648221344
Sensitivity: 0.9865612648221344
Specificity: 0.025
F1: 0.8439835763365174
Accuracy: 0.7319354838709677


Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf_classifier = RandomForestClassifier(random_state=0)

In [31]:
# 10-fold cross-validation of the random forest on every metric in `scoring`.
rf_scores = cross_validate(
    rf_classifier, X, y,
    scoring=scoring,
    cv=10,
)

In [32]:
# Report the mean of each cross-validated metric for the random forest.
for template, key in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % rf_scores[key].mean())

Precision: 0.7727295304530244
Recall: 0.8363636363636363
Sensitivity: 0.8363636363636363
Specificity: 0.29027777777777775
F1: 0.7970221459407589
Accuracy: 0.6931182795698925
