In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score 
from sklearn.model_selection import  StratifiedShuffleSplit

In [2]:
data = pd.read_csv('./database.csv')
data.shape
data.head()

Unnamed: 0,label,category,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity
0,6ccb78b2-44e4-4aae-ab8b-23cde8585db9,post,284,154.296465,0.5,229.035211,441.15493,19.015784,0.438272,0.702085,-0.088274,0.871399
1,e3b044b1-c1db-4d71-9c08-a9c760730fd5,post,117,69.112698,0.8,36.401709,428.777778,12.205287,0.65,0.688989,0.171587,0.622435
2,8eb39b0d-4c10-4b1f-9083-c8f48666a48c,post,22,14.828427,1.0,105.090909,390.772727,5.292567,0.88,0.586809,0.588003,0.357979
3,8eb39b0d-4c10-4b1f-9083-c8f48666a48c,post,146,44.970563,1.153846,133.191781,250.157534,13.634257,0.748718,0.624335,1.490833,0.525384
4,8eb39b0d-4c10-4b1f-9083-c8f48666a48c,post,12,12.0,0.6,132.0,262.0,3.90882,0.8,0.534641,0.0,0.801784


In [3]:

data.label = data.label.map({
    '580a2665-d0e8-4d36-81c2-c543eafa9671': 0,
    '8eb39b0d-4c10-4b1f-9083-c8f48666a48c': 1,
    '6ccb78b2-44e4-4aae-ab8b-23cde8585db9': 0,
    'e3b044b1-c1db-4d71-9c08-a9c760730fd5': 0
}).astype(np.int)
data.category = data.category.map({
    'ant': 0,
    'post': 0.5,
    'cra': 1
}).astype(np.int)
data.centroidx= (data.centroidx - data.centroidx.min()) / (data.centroidx.max() - data.centroidx.min())
data.centroidy= (data.centroidy - data.centroidy.min()) / (data.centroidy.max() - data.centroidy.min())
data.orientation = (data.orientation  - data.orientation.min()) / (data.orientation.max() - data.orientation.min())
data.perimeter= (data.perimeter - data.perimeter.min()) / (data.perimeter.max() - data.perimeter.min())
data.meanintensity= (data.meanintensity - data.meanintensity.min()) / (data.meanintensity.max() - data.meanintensity.min())
data.extent= (data.extent - data.extent.min()) / (data.extent.max() - data.extent.min())
data.aspectratio= (data.aspectratio - data.aspectratio.min()) / (data.aspectratio.max() - data.aspectratio.min())
data.area = (data.area - data.area.min()) / (data.area.max() - data.area.min())
data.equivalentdiameter = (data.equivalentdiameter - data.equivalentdiameter.min()) / (data.equivalentdiameter.max() - data.equivalentdiameter.min())

data.head()

Unnamed: 0,label,category,area,perimeter,aspectratio,centroidx,centroidy,equivalentdiameter,extent,meanintensity,orientation,eccentricity
0,0,post,0.055904,0.156502,0.160305,1.0,0.907492,0.207785,0.195797,0.95117,0.466075,0.871399
1,0,post,0.022078,0.065376,0.336641,0.041041,0.869607,0.118358,0.498919,0.919137,0.549704,0.622435
2,1,post,0.002836,0.007305,0.454198,0.382986,0.753278,0.027588,0.828201,0.669193,0.683716,0.357979
3,1,post,0.027952,0.03955,0.544627,0.522877,0.322873,0.137121,0.64025,0.760985,0.974266,0.525384
4,1,post,0.00081,0.004279,0.219084,0.516944,0.359122,0.009418,0.713668,0.541582,0.494484,0.801784


In [4]:
featuresColumns = [x for x in data.columns if x not in 'label']
stf = StratifiedShuffleSplit(n_splits=1, test_size=10, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['label']))

xTrain = data.loc[trainIdx, featuresColumns]
yTrain = data.loc[trainIdx, 'label']

xTest = data.loc[testIdx, featuresColumns]
yTest = data.loc[testIdx, 'label']

In [5]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xTrain, yTrain)

ValueError: could not convert string to float: 'ant'

In [16]:
dt.tree_.node_count, dt.tree_.max_depth

(19, 5)

In [17]:
def measureErrors(yTrue, yGuess, label):
    return pd.Series({
        'accuracy': accuracy_score(y_true=yTrue, y_pred=yGuess),
        'precision': precision_score(y_true=yTrue, y_pred=yGuess),
        'recall': recall_score(y_true=yTrue, y_pred=yGuess),
        'roc': roc_auc_score(yTrue,yGuess)
        },name=label
    )

In [18]:
trainPrediction = dt.predict(xTrain)
testPrediction = dt.predict(xTest)

trainAndTestErrors = pd.concat([measureErrors(yTrain, trainPrediction, 'train'), measureErrors(yTest, testPrediction, 'test')], axis=1)

trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,0.9
precision,1.0,0.833333
recall,1.0,1.0
roc,1.0,0.9
