In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score 
from sklearn.model_selection import  StratifiedShuffleSplit

In [2]:
# Load the raw feature table from the CSV next to the notebook.
data = pd.read_csv('./database.csv')
# A bare `data.shape` on a non-final line is evaluated and thrown away by the
# notebook, so print it explicitly to actually show (rows, columns).
print(data.shape)
data.tail()

Unnamed: 0,label,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity,dicompatientid,previouscancerdiagnosis
72,580a2665-d0e8-4d36-81c2-c543eafa9671,1006,631.771645,1.545455,140.037773,444.460239,35.78937,0.33623,0.332172,-1.542073,0.815709,3,0.5
73,6ccb78b2-44e4-4aae-ab8b-23cde8585db9,342,150.953319,0.5,228.98538,441.038012,20.867389,0.527778,0.496996,-0.096871,0.874032,4,1.0
74,6ccb78b2-44e4-4aae-ab8b-23cde8585db9,324,155.539105,0.473684,28.179012,451.783951,20.310825,0.473684,0.53924,0.1368,0.887465,4,1.0
75,8eb39b0d-4c10-4b1f-9083-c8f48666a48c,154,54.870058,1.153846,160.519481,440.194805,14.002817,0.789744,0.379374,1.20565,0.598403,4,1.0
76,6ccb78b2-44e4-4aae-ab8b-23cde8585db9,139,56.870058,0.8,220.359712,432.71223,13.303394,0.772222,0.488278,-0.220644,0.638744,4,1.0


In [3]:

# Binary target: each known label UUID is mapped to class 0 or 1.
labelToClass = {
    '580a2665-d0e8-4d36-81c2-c543eafa9671': 0,
    '8eb39b0d-4c10-4b1f-9083-c8f48666a48c': 1,
    '6ccb78b2-44e4-4aae-ab8b-23cde8585db9': 0,
    'e3b044b1-c1db-4d71-9c08-a9c760730fd5': 0
}
mappedLabel = data['label'].map(labelToClass)
# Any value missing from the mapping becomes NaN; fail with an explicit message
# instead of an opaque integer-cast error.
unmappedRows = mappedLabel.isna()
if unmappedRows.any():
    unknownLabels = sorted(data.loc[unmappedRows, 'label'].astype(str).unique())
    raise ValueError(f'Unmapped label values: {unknownLabels}')
# `np.int` was removed in NumPy 1.24; the builtin `int` is its replacement.
data['label'] = mappedLabel.astype(int)

# Min-max scale each listed feature to [0, 1]. The min and max are taken over
# the whole frame (i.e. before any train/test split). `eccentricity` is
# intentionally left out, matching the columns that were scaled before.
minMaxColumns = [
    'centroidx', 'centroidy', 'orientation', 'perimeter', 'meanintensity',
    'extent', 'aspectratio', 'area', 'equivalentdiameter',
]
for column in minMaxColumns:
    columnMin = data[column].min()
    columnMax = data[column].max()
    data[column] = (data[column] - columnMin) / (columnMax - columnMin)

# Drop the patient-level columns so they are not used as features downstream.
data = data.drop(columns=['dicompatientid', 'previouscancerdiagnosis'])
data.head()

Unnamed: 0,label,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity
0,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
1,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
2,0,0.334669,0.229176,0.04,1.0,0.894392,0.54224,0.435063,0.684463,0.464267,0.874032
3,0,0.130261,0.086087,0.28,0.040752,0.857569,0.30873,0.720855,0.668596,0.545255,0.602152
4,1,0.1002,0.072654,0.44,0.470218,0.208083,0.261825,0.700916,0.334637,0.793856,0.418157


In [4]:
# Every column except the target is a feature. Use equality here: writing
# `x not in 'label'` is a substring test against the string 'label' and would
# also drop any column whose name happens to be a substring of it (e.g. 'lab').
featuresColumns = [x for x in data.columns if x != 'label']

# Single stratified split; an integer test_size is an absolute number of rows.
stf = StratifiedShuffleSplit(n_splits=1, test_size=7, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['label']))

# split() yields positional row indices, so select with .iloc; label-based
# .loc only gives the same rows when the frame has a default RangeIndex.
xTrain = data[featuresColumns].iloc[trainIdx]
yTrain = data['label'].iloc[trainIdx]

xTest = data[featuresColumns].iloc[testIdx]
yTest = data['label'].iloc[testIdx]

In [5]:
# Train a decision tree with default hyperparameters and a fixed random_state.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=xTrain, y=yTrain)

DecisionTreeClassifier(random_state=42)

In [6]:
# Size of the fitted tree as a (node count, maximum depth) pair.
(dt.tree_.node_count, dt.tree_.max_depth)

(11, 4)

In [7]:
def measureErrors(yTrue, yGuess, label):
    """Summarise classification quality as a named pandas Series.

    Parameters
    ----------
    yTrue : true class labels.
    yGuess : predicted class labels. The same values are also handed to
        ``roc_auc_score`` as its score argument, so the ROC figure is
        computed from hard predictions rather than probabilities.
    label : name assigned to the returned Series.

    Returns
    -------
    pd.Series with entries 'accuracy', 'precision', 'recall' and 'roc'.
    """
    scores = {}
    scores['accuracy'] = accuracy_score(y_true=yTrue, y_pred=yGuess)
    scores['precision'] = precision_score(y_true=yTrue, y_pred=yGuess)
    scores['recall'] = recall_score(y_true=yTrue, y_pred=yGuess)
    scores['roc'] = roc_auc_score(y_true=yTrue, y_score=yGuess)
    return pd.Series(scores, name=label)

In [8]:
# Predict on both partitions with the fitted tree.
trainPrediction = dt.predict(xTrain)
testPrediction = dt.predict(xTest)

# One metrics column per partition, placed side by side for comparison.
trainErrors = measureErrors(yTrain, trainPrediction, 'train')
testErrors = measureErrors(yTest, testPrediction, 'test')
trainAndTestErrors = pd.concat([trainErrors, testErrors], axis=1)

trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,1.0
precision,1.0,1.0
recall,1.0,1.0
roc,1.0,1.0
