In [11]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score 
from sklearn.model_selection import  StratifiedShuffleSplit

In [12]:
# Load the per-region feature table from the CSV next to the notebook.
data = pd.read_csv('./database.csv')

# A bare `data.shape` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is rendered), so print it explicitly.
print(data.shape)

# Preview the first rows; as the last expression this is the cell's output.
data.head()

Unnamed: 0,label,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity
0,e3b044b1-c1db-4d71-9c08-a9c760730fd5,46,23.071068,1.142857,175.391304,235.934783,7.65304,0.821429,0.385848,1.160695,0.624644
1,e3b044b1-c1db-4d71-9c08-a9c760730fd5,15,11.414214,1.0,150.6,237.4,4.370194,0.9375,0.371765,0.785398,0.489898
2,e3b044b1-c1db-4d71-9c08-a9c760730fd5,65,27.313708,1.25,65.769231,221.476923,9.097284,0.8125,0.398492,1.239514,0.669442
3,e3b044b1-c1db-4d71-9c08-a9c760730fd5,756,171.4386,0.767857,77.374339,450.34127,31.025298,0.313953,0.447692,0.647034,0.969795
4,e3b044b1-c1db-4d71-9c08-a9c760730fd5,14,10.828427,1.0,98.5,426.5,4.222008,0.875,0.368347,-0.785398,0.67082


In [13]:

# Collapse the four identifiers found in `label` into a binary target:
# only '8eb39b0d-...' becomes class 1, the other three become class 0.
labelToClass = {
    '580a2665-d0e8-4d36-81c2-c543eafa9671': 0,
    '8eb39b0d-4c10-4b1f-9083-c8f48666a48c': 1,
    '6ccb78b2-44e4-4aae-ab8b-23cde8585db9': 0,
    'e3b044b1-c1db-4d71-9c08-a9c760730fd5': 0
}
mappedLabel = data.label.map(labelToClass)

# Values missing from the mapping come back as NaN and cannot be cast to an
# integer dtype; fail with a readable message instead of a cast error.
# This also fires when the cell is re-run on an already encoded `data`,
# because the labels are then 0/1 rather than the identifiers above.
if mappedLabel.isna().any():
    unknownLabels = data.label[mappedLabel.isna()].unique().tolist()
    raise ValueError(f'labels without a class mapping: {unknownLabels}')

# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin yields the same dtype and works on every version.
data.label = mappedLabel.astype(int)

# Min-max rescale these feature columns to [0, 1] in place.
# `eccentricity` is not in the list and is therefore left as loaded.
# NOTE(review): min/max are taken over every row of `data`, i.e. before the
# train/test split performed later in the notebook.
minMaxColumns = [
    'centroidx',
    'centroidy',
    'orientation',
    'perimeter',
    'meanintensity',
    'extent',
    'aspectratio',
    'area',
    'equivalentdiameter',
]
for columnName in minMaxColumns:
    column = data[columnName]
    data[columnName] = (column - column.min()) / (column.max() - column.min())

data.head()

Unnamed: 0,label,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity
0,0,0.005508,0.016201,0.502741,0.732475,0.278592,0.049251,0.739709,0.062515,0.863889,0.624644
1,0,0.001015,0.00367,0.419865,0.609141,0.283081,0.013011,0.908898,0.029129,0.739329,0.489898
2,0,0.008262,0.020761,0.564898,0.187115,0.234289,0.065195,0.726695,0.092487,0.890049,0.669442
3,0,0.108422,0.175688,0.28519,0.244849,0.935589,0.307263,0.0,0.20912,0.693407,0.969795
4,0,0.00087,0.00304,0.419865,0.349948,0.862533,0.011376,0.817797,0.021028,0.217988,0.67082


In [14]:
# Every column except the target is a model input.
# The comparison must be `!=`: `x not in 'label'` is a substring test against
# the string 'label', so it would also drop a column named 'a', 'l' or 'lab'.
featuresColumns = [x for x in data.columns if x != 'label']

# An integer `test_size` is an absolute row count: hold out 10 rows,
# stratified on the label, with a fixed seed for reproducibility.
stf = StratifiedShuffleSplit(n_splits=1, test_size=10, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['label']))

# `split` yields positional row indices, so select with `.iloc`.
# `.loc` is label-based and only matches while `data` has the default 0..n-1 index.
xTrain = data[featuresColumns].iloc[trainIdx]
yTrain = data['label'].iloc[trainIdx]

xTest = data[featuresColumns].iloc[testIdx]
yTest = data['label'].iloc[testIdx]

In [15]:
# Decision tree with library-default hyperparameters; the seed is fixed so
# that repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)

# Train on the training partition; `fit` returns the estimator, which is
# what this cell displays.
dt.fit(X=xTrain, y=yTrain)

DecisionTreeClassifier(random_state=42)

In [16]:
# Size of the fitted tree, shown as (number of nodes, maximum depth).
(dt.tree_.node_count, dt.tree_.max_depth)

(19, 5)

In [17]:
def measureErrors(yTrue, yGuess, label):
    """Score one set of predictions and return the metrics as a named Series.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth targets.
    yGuess : array-like
        Predictions to evaluate. The same values are handed to every metric,
        including `roc_auc_score`, so the reported AUC is computed from
        whatever the caller passes here.
    label : hashable
        Used as the name of the returned Series (e.g. 'train' or 'test').

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'recall' and 'roc', in that order.
    """
    # Metric name -> scoring function; each is called as scorer(yTrue, yGuess).
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'roc': roc_auc_score,
    }
    scores = {metric: scorer(yTrue, yGuess) for metric, scorer in scorers.items()}
    return pd.Series(scores, name=label)

In [18]:
# Predict class labels for both partitions with the fitted tree.
trainPrediction = dt.predict(xTrain)
testPrediction = dt.predict(xTest)

# One Series of metrics per partition, placed side by side as columns.
trainAndTestErrors = pd.concat(
    [
        measureErrors(yTrain, trainPrediction, 'train'),
        measureErrors(yTest, testPrediction, 'test'),
    ],
    axis='columns',
)

trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,0.9
precision,1.0,0.833333
recall,1.0,1.0
roc,1.0,0.9
