In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score 
from sklearn.model_selection import  StratifiedShuffleSplit

In [3]:
# Load the raw measurements table from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer='./database.csv')

Unnamed: 0,xCentroid,yCentroid,arcLength,area,eccentricity,diagnosis
0,125,491,103.941125,573.0,0.87,notDegenerative
1,124,431,168.610172,1004.5,0.86,notDegenerative
2,29,453,98.08326,444.0,0.9,notDegenerative
3,110,150,97.012193,347.5,0.84,notDegenerative
4,135,216,95.597979,207.0,0.73,degenerative


In [4]:
# Encode the textual target as a binary integer label (1 = degenerative).
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current releases).
# Any label outside the mapping becomes NaN and makes astype(int) raise,
# which surfaces unexpected categories instead of hiding them.
data['diagnosis'] = data['diagnosis'].map({
    'degenerative': 1,
    'notDegenerative': 0
}).astype(int)

# Scale arcLength and area by their column maximum so the largest value is 1.
# NOTE(review): this cell mutates `data` in place, so it is not idempotent;
# re-running it without reloading the CSV will fail on the label mapping.
maxArcLength = data['arcLength'].max()
data['arcLength'] = data['arcLength'] / maxArcLength
maxArea = data['area'].max()
data['area'] = data['area'] / maxArea
data.head()

Unnamed: 0,xCentroid,yCentroid,arcLength,area,eccentricity,diagnosis
0,125,491,0.616458,0.570433,0.87,0
1,124,431,1.0,1.0,0.86,0
2,29,453,0.581716,0.442011,0.9,0
3,110,150,0.575364,0.345943,0.84,0
4,135,216,0.566976,0.206073,0.73,1


In [10]:
# Every column except the target is a model input. Compare with `!=`:
# `x not in 'diagnosis'` is a substring test against the string, so it would
# also drop any column whose name happens to be a substring of "diagnosis"
# and raises TypeError for non-string column labels.
featuresColumns = [x for x in data.columns if x != 'diagnosis']

# NOTE(review): an integer test_size is an absolute number of samples, so only
# 2 rows are held out here; confirm this is intended rather than a fraction
# such as 0.2.
stf = StratifiedShuffleSplit(n_splits=1, test_size=2, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['diagnosis']))

# split() yields positional row indices, so select with .iloc; label-based
# .loc only gives the same rows when the frame has a default RangeIndex.
xTrain = data[featuresColumns].iloc[trainIdx]
yTrain = data['diagnosis'].iloc[trainIdx]

xTest = data[featuresColumns].iloc[testIdx]
yTest = data['diagnosis'].iloc[testIdx]

In [11]:
# Fit an unconstrained decision tree on the training split; the fixed seed
# keeps the fitted tree reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=xTrain, y=yTrain)

DecisionTreeClassifier(random_state=42)

In [12]:
# Show the fitted tree's size as a (node_count, max_depth) tuple.
tuple(getattr(dt.tree_, attr) for attr in ('node_count', 'max_depth'))

(3, 1)

In [13]:
def measureError(yTrue, yGuess, label):
    """Summarize binary-classification quality as a named pandas Series.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels.
    yGuess : array-like
        Model outputs compared against ``yTrue``. The same values feed every
        metric, including ROC AUC.
    label : hashable
        Used as the ``name`` of the returned Series (e.g. 'train' or 'test').

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'recall' and 'roc'.
    """
    return pd.Series(
        {
            'accuracy': accuracy_score(y_true=yTrue, y_pred=yGuess),
            'precision': precision_score(y_true=yTrue, y_pred=yGuess),
            'recall': recall_score(y_true=yTrue, y_pred=yGuess),
            # roc_auc_score takes `y_score`, not `y_pred`; passing `y_pred`
            # as a keyword raises TypeError.
            'roc': roc_auc_score(y_true=yTrue, y_score=yGuess),
        },
        name=label,
    )