In [90]:
import pandas as pd

In [91]:
# Load the Haberman survival data. Column names are supplied explicitly via
# `names`, so every line of the file is read as a data row, and
# index_col=False stops pandas from using the first column as the index.
data = pd.read_csv(
    'haberman.csv',
    index_col=False,
    names=[
        'age',
        'operation_year',
        'positive_axillary_nodes_detected',
        'survival_status',
    ],
)

In [92]:
# Preview the loaded frame; pandas elides the middle rows in the rendered output.
data

Unnamed: 0,age,operation_year,positive_axillary_nodes_detected,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [95]:
# Feature matrix: the three numeric patient attributes.
X = data[['age', 'operation_year', 'positive_axillary_nodes_detected']]
# Target vector: the survival class label.
y = data['survival_status']

In [96]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score

In [97]:
# Class-specific recall scorers: label 1 is treated as the positive class
# (sensitivity) and label 2 as the negative class (specificity).
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=2)

# Metrics evaluated on every cross-validation fold. The string entries are
# scikit-learn's built-in scorers; 'recall' and 'sensitivity' report the same
# values in the recorded outputs of this notebook.
scoring = dict(
    accuracy='accuracy',
    precision='precision',
    recall='recall',
    sensitivity=sensitivity,
    specificity=specificity,
    f1='f1',
)

Decision Tree

In [98]:
from sklearn.tree import DecisionTreeClassifier

In [99]:
# Decision tree with default hyperparameters; the seed is fixed so reruns
# build the same tree.
dt_classifier = DecisionTreeClassifier(random_state=0)

In [100]:
# 10-fold cross-validation of the decision tree, scored with every metric in
# `scoring`.
# NOTE(review): an integer `cv` with a classifier yields unshuffled stratified
# folds, and the displayed rows look sorted by age -- confirm whether the folds
# should be shuffled (any change must be applied to all models alike).
dt_scores = cross_validate(
    estimator=dt_classifier,
    X=X,
    y=y,
    scoring=scoring,
    cv=10,
)
dt_scores

{'fit_time': array([0.00400043, 0.00199866, 0.00199962, 0.00300002, 0.0060029 ,
        0.00399828, 0.00299859, 0.00500202, 0.00300026, 0.00300121]),
 'score_time': array([0.00599861, 0.00699997, 0.00600147, 0.00599885, 0.00900102,
        0.00801516, 0.00800252, 0.00699735, 0.00800157, 0.00699854]),
 'test_accuracy': array([0.74193548, 0.38709677, 0.25806452, 0.58064516, 0.61290323,
        0.77419355, 0.76666667, 0.66666667, 0.63333333, 0.6       ]),
 'test_precision': array([0.74193548, 0.61111111, 0.5       , 0.77777778, 0.82352941,
        0.85714286, 0.8       , 0.75      , 0.72      , 0.77777778]),
 'test_recall': array([1.        , 0.47826087, 0.26086957, 0.60869565, 0.60869565,
        0.81818182, 0.90909091, 0.81818182, 0.81818182, 0.63636364]),
 'test_sensitivity': array([1.        , 0.47826087, 0.26086957, 0.60869565, 0.60869565,
        0.81818182, 0.90909091, 0.81818182, 0.81818182, 0.63636364]),
 'test_specificity': array([0.        , 0.125     , 0.25      , 0.5       , 

In [101]:
# Fold-averaged decision tree metrics, printed as raw fractions.
for template, metric in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % dt_scores[metric].mean())


Precision: 0.7359274419445196
Recall: 0.6956521739130436
Sensitivity: 0.6956521739130436
Specificity: 0.3416666666666667
F1: 0.7051060464404446
Accuracy: 0.6021505376344086


In [102]:
# Fold-averaged decision tree metrics, printed as percentages (two decimals).
for template, metric in (
    ('Precision: %0.2f', 'test_precision'),
    ('Recall:  %0.2f', 'test_recall'),
    ('Sensitivity:  %0.2f', 'test_sensitivity'),
    ('Specificity: %0.2f', 'test_specificity'),
    ('F1: %0.2f', 'test_f1'),
    ('Accuracy: %0.2f', 'test_accuracy'),
):
    print(template % (dt_scores[metric].mean() * 100))

Precision: 73.59
Recall:  69.57
Sensitivity:  69.57
Specificity: 34.17
F1: 70.51
Accuracy: 60.22


Naive Bayes

In [103]:
from sklearn.naive_bayes import GaussianNB

In [104]:
# Gaussian Naive Bayes with all hyperparameters left at their defaults.
gnb_classifier = GaussianNB()

In [105]:
# 10-fold cross-validation of Gaussian Naive Bayes, scored with every metric
# in `scoring`; the resulting dict is displayed as the cell output.
gnb_scores = cross_validate(
    estimator=gnb_classifier,
    X=X,
    y=y,
    scoring=scoring,
    cv=10,
)
gnb_scores

{'fit_time': array([0.00399971, 0.00300097, 0.00300026, 0.00499964, 0.00200176,
        0.00899911, 0.00599837, 0.00400043, 0.00300097, 0.00399995]),
 'score_time': array([0.00900078, 0.00900006, 0.00700188, 0.00700045, 0.03100181,
        0.00800323, 0.00600076, 0.00900054, 0.00699997, 0.00600171]),
 'test_accuracy': array([0.74193548, 0.80645161, 0.70967742, 0.74193548, 0.74193548,
        0.67741935, 0.8       , 0.8       , 0.76666667, 0.73333333]),
 'test_precision': array([0.77777778, 0.79310345, 0.76923077, 0.74193548, 0.74193548,
        0.73076923, 0.80769231, 0.80769231, 0.8       , 0.73333333]),
 'test_recall': array([0.91304348, 1.        , 0.86956522, 1.        , 1.        ,
        0.86363636, 0.95454545, 0.95454545, 0.90909091, 1.        ]),
 'test_sensitivity': array([0.91304348, 1.        , 0.86956522, 1.        , 1.        ,
        0.86363636, 0.95454545, 0.95454545, 0.90909091, 1.        ]),
 'test_specificity': array([0.25      , 0.25      , 0.25      , 0.        , 

In [106]:
# Fold-averaged Naive Bayes metrics, printed as raw fractions.
for template, metric in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % gnb_scores[metric].mean())

Precision: 0.7703470142513524
Recall: 0.9464426877470355
Sensitivity: 0.9464426877470355
Specificity: 0.20972222222222223
F1: 0.848352996153908
Accuracy: 0.7519354838709678


In [107]:
# Fold-averaged Naive Bayes metrics, printed as percentages (two decimals).
for template, metric in (
    ('Precision: %0.2f', 'test_precision'),
    ('Recall:  %0.2f', 'test_recall'),
    ('Sensitivity:  %0.2f', 'test_sensitivity'),
    ('Specificity: %0.2f', 'test_specificity'),
    ('F1: %0.2f', 'test_f1'),
    ('Accuracy: %0.2f', 'test_accuracy'),
):
    print(template % (gnb_scores[metric].mean() * 100))

Precision: 77.03
Recall:  94.64
Sensitivity:  94.64
Specificity: 20.97
F1: 84.84
Accuracy: 75.19


Logistic Regression

In [108]:
from sklearn.linear_model import LogisticRegression

In [109]:
# Logistic regression with default hyperparameters and a fixed seed.
# NOTE(review): random_state is only used by some solvers, so it may have no
# effect with the default one -- confirm against the installed sklearn version.
lr_classifier = LogisticRegression(random_state=0)

In [110]:
# 10-fold cross-validation of logistic regression, scored with every metric
# in `scoring`; the resulting dict is displayed as the cell output.
lr_scores = cross_validate(
    estimator=lr_classifier,
    X=X,
    y=y,
    scoring=scoring,
    cv=10,
)
lr_scores

{'fit_time': array([0.01400185, 0.01100349, 0.01199961, 0.01200032, 0.01000118,
        0.01200056, 0.01099992, 0.00999928, 0.00900054, 0.0130012 ]),
 'score_time': array([0.0069983 , 0.00699997, 0.0070014 , 0.00900054, 0.00699878,
        0.01300359, 0.01200128, 0.00800228, 0.00900054, 0.00600028]),
 'test_accuracy': array([0.74193548, 0.80645161, 0.77419355, 0.74193548, 0.74193548,
        0.67741935, 0.73333333, 0.76666667, 0.76666667, 0.7       ]),
 'test_precision': array([0.75862069, 0.79310345, 0.78571429, 0.74193548, 0.74193548,
        0.73076923, 0.75      , 0.77777778, 0.8       , 0.72413793]),
 'test_recall': array([0.95652174, 1.        , 0.95652174, 1.        , 1.        ,
        0.86363636, 0.95454545, 0.95454545, 0.90909091, 0.95454545]),
 'test_sensitivity': array([0.95652174, 1.        , 0.95652174, 1.        , 1.        ,
        0.86363636, 0.95454545, 0.95454545, 0.90909091, 0.95454545]),
 'test_specificity': array([0.125     , 0.25      , 0.25      , 0.        , 

In [111]:
# Fold-averaged logistic regression metrics, printed as raw fractions.
for template, metric in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % lr_scores[metric].mean())

Precision: 0.7603994330968747
Recall: 0.9549407114624506
Sensitivity: 0.9549407114624506
Specificity: 0.15972222222222224
F1: 0.8460620797873615
Accuracy: 0.7450537634408603


In [112]:
# Fold-averaged logistic regression metrics, printed as percentages
# (two decimals).
for template, metric in (
    ('Precision: %0.2f', 'test_precision'),
    ('Recall:  %0.2f', 'test_recall'),
    ('Sensitivity:  %0.2f', 'test_sensitivity'),
    ('Specificity: %0.2f', 'test_specificity'),
    ('F1: %0.2f', 'test_f1'),
    ('Accuracy: %0.2f', 'test_accuracy'),
):
    print(template % (lr_scores[metric].mean() * 100))

Precision: 76.04
Recall:  95.49
Sensitivity:  95.49
Specificity: 15.97
F1: 84.61
Accuracy: 74.51


Support Vector Machine

In [113]:
from sklearn import svm

In [114]:
# Support vector classifier with a linear kernel; every other hyperparameter
# is left at its default.
svm_classifier = svm.SVC(kernel='linear')

In [115]:
# 10-fold cross-validation of the linear SVM, scored with every metric in
# `scoring`; the resulting dict is displayed as the cell output.
svm_scores = cross_validate(
    estimator=svm_classifier,
    X=X,
    y=y,
    scoring=scoring,
    cv=10,
)
svm_scores

{'fit_time': array([0.04000235, 0.02200055, 0.056005  , 0.07500362, 0.05300426,
        0.05200338, 0.02600455, 0.05000377, 0.05700421, 0.04300332]),
 'score_time': array([0.00700068, 0.00799966, 0.00900197, 0.00800085, 0.00600052,
        0.00900245, 0.00699854, 0.00600076, 0.00600171, 0.00500035]),
 'test_accuracy': array([0.77419355, 0.74193548, 0.74193548, 0.74193548, 0.74193548,
        0.67741935, 0.73333333, 0.73333333, 0.7       , 0.73333333]),
 'test_precision': array([0.78571429, 0.74193548, 0.74193548, 0.74193548, 0.74193548,
        0.7       , 0.73333333, 0.73333333, 0.72413793, 0.73333333]),
 'test_recall': array([0.95652174, 1.        , 1.        , 1.        , 1.        ,
        0.95454545, 1.        , 1.        , 0.95454545, 1.        ]),
 'test_sensitivity': array([0.95652174, 1.        , 1.        , 1.        , 1.        ,
        0.95454545, 1.        , 1.        , 0.95454545, 1.        ]),
 'test_specificity': array([0.25, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 

In [116]:
# Fold-averaged SVM metrics, printed as raw fractions.
for template, metric in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % svm_scores[metric].mean())

Precision: 0.7377594152232639
Recall: 0.9865612648221344
Sensitivity: 0.9865612648221344
Specificity: 0.025
F1: 0.8439835763365174
Accuracy: 0.7319354838709677


In [117]:
# Fold-averaged SVM metrics, printed as percentages (two decimals).
for template, metric in (
    ('Precision: %0.2f', 'test_precision'),
    ('Recall:  %0.2f', 'test_recall'),
    ('Sensitivity:  %0.2f', 'test_sensitivity'),
    ('Specificity: %0.2f', 'test_specificity'),
    ('F1: %0.2f', 'test_f1'),
    ('Accuracy: %0.2f', 'test_accuracy'),
):
    print(template % (svm_scores[metric].mean() * 100))

Precision: 73.78
Recall:  98.66
Sensitivity:  98.66
Specificity: 2.50
F1: 84.40
Accuracy: 73.19


Random Forest

In [118]:
from sklearn.ensemble import RandomForestClassifier

In [119]:
# Random forest with default hyperparameters; the seed is fixed so reruns
# build the same ensemble.
rf_classifier = RandomForestClassifier(random_state=0)

In [120]:
# 10-fold cross-validation of the random forest, scored with every metric in
# `scoring`; the resulting dict is displayed as the cell output.
rf_scores = cross_validate(
    estimator=rf_classifier,
    X=X,
    y=y,
    scoring=scoring,
    cv=10,
)
rf_scores

{'fit_time': array([0.16201091, 0.17701602, 0.14201236, 0.14401174, 0.13801169,
        0.14801192, 0.14801216, 0.14701223, 0.14201093, 0.14101195]),
 'score_time': array([0.0150013 , 0.01899981, 0.01700091, 0.01700139, 0.01500106,
        0.01800132, 0.01500106, 0.01700091, 0.01700139, 0.01600051]),
 'test_accuracy': array([0.74193548, 0.58064516, 0.58064516, 0.61290323, 0.77419355,
        0.77419355, 0.76666667, 0.66666667, 0.73333333, 0.7       ]),
 'test_precision': array([0.74193548, 0.69230769, 0.77777778, 0.76190476, 0.80769231,
        0.89473684, 0.75862069, 0.75      , 0.81818182, 0.72413793]),
 'test_recall': array([1.        , 0.7826087 , 0.60869565, 0.69565217, 0.91304348,
        0.77272727, 1.        , 0.81818182, 0.81818182, 0.95454545]),
 'test_sensitivity': array([1.        , 0.7826087 , 0.60869565, 0.69565217, 0.91304348,
        0.77272727, 1.        , 0.81818182, 0.81818182, 0.95454545]),
 'test_specificity': array([0.        , 0.        , 0.5       , 0.375     , 

In [121]:
# Fold-averaged random forest metrics, printed as raw fractions.
for template, metric in (
    ('Precision: %s', 'test_precision'),
    ('Recall: %s', 'test_recall'),
    ('Sensitivity: %s', 'test_sensitivity'),
    ('Specificity: %s', 'test_specificity'),
    ('F1: %s', 'test_f1'),
    ('Accuracy: %s', 'test_accuracy'),
):
    print(template % rf_scores[metric].mean())

Precision: 0.7727295304530244
Recall: 0.8363636363636363
Sensitivity: 0.8363636363636363
Specificity: 0.29027777777777775
F1: 0.7970221459407589
Accuracy: 0.6931182795698925


In [122]:
# Fold-averaged random forest metrics, printed as percentages (two decimals).
for template, metric in (
    ('Precision: %0.2f', 'test_precision'),
    ('Recall:  %0.2f', 'test_recall'),
    ('Sensitivity:  %0.2f', 'test_sensitivity'),
    ('Specificity: %0.2f', 'test_specificity'),
    ('F1: %0.2f', 'test_f1'),
    ('Accuracy: %0.2f', 'test_accuracy'),
):
    print(template % (rf_scores[metric].mean() * 100))

Precision: 77.27
Recall:  83.64
Sensitivity:  83.64
Specificity: 29.03
F1: 79.70
Accuracy: 69.31


In [123]:
# 1. What classification/regression problem are you trying to address? Kenz
# 2. Explain the content of the DATASET used. What is its purpose? Mention the source of your dataset. Marben
# 3. What are the classification methods tested? - 5 algorithms Marben
# 4. What method is used to determine the efficiency of each model? Since we use k-fold cross validation, explain how it works. Kenz
# 5. Code - Ako
# 6. Determine which is superior among the identified classifiers tested and present the performance measures being used in the experiment. Ako

# The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

# 1. Age of patient at time of operation (numerical)
# 2. Patient's year of operation (year - 1900, numerical)
# 3. Number of positive axillary nodes detected (numerical)
# 4. Survival status (class attribute)
# -- 1 = the patient survived 5 years or longer
# -- 2 = the patient died within 5 years

