In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the feature table from the CSV next to the notebook.
data = pd.read_csv('./database.csv')

# A bare `data.shape` in the middle of a cell is evaluated and thrown away
# (only the last expression is rendered), so print it explicitly.
print(data.shape)

# Show the last rows as the cell's rich output.
data.tail()

Unnamed: 0,label,area,perimeter,aspectRatio,centroidX,centroidY,equivalentDiameter,extent,meanIntensity,orientation,eccentricity,dicomPatientId,previousCancerDiagnosis
72,0,1006,631.771645,1.545455,140.037773,444.460239,35.78937,0.33623,0.332172,-1.542073,0.815709,3,0.5
73,0,342,150.953319,0.5,228.98538,441.038012,20.867389,0.527778,0.496996,-0.096871,0.874032,4,1.0
74,0,324,155.539105,0.473684,28.179012,451.783951,20.310825,0.473684,0.53924,0.1368,0.887465,4,1.0
75,1,154,54.870058,1.153846,160.519481,440.194805,14.002817,0.789744,0.379374,1.20565,0.598403,4,1.0
76,0,139,56.870058,0.8,220.359712,432.71223,13.303394,0.772222,0.488278,-0.220644,0.638744,4,1.0


In [3]:
# Columns that are min-max rescaled to the [0, 1] range.
# 'eccentricity' is not in this list and is therefore left untouched.
columnsToScale = [
    'centroidX',
    'centroidY',
    'orientation',
    'perimeter',
    'meanIntensity',
    'extent',
    'aspectRatio',
    'area',
    'equivalentDiameter',
]

# One loop instead of nine copy-pasted statements. Bracket assignment is used
# because attribute-style assignment (`data.col = ...`) only works for columns
# that already exist and is easy to get silently wrong.
for column in columnsToScale:
    columnMin = data[column].min()
    columnMax = data[column].max()
    # NOTE(review): a constant column (max == min) would divide by zero here,
    # exactly as in the per-column statements this loop replaces.
    data[column] = (data[column] - columnMin) / (columnMax - columnMin)

# These two columns are removed from the frame before the feature list is built.
data = data.drop(columns=['previousCancerDiagnosis', 'dicomPatientId'])
data.head()

Unnamed: 0,label,area,perimeter,aspectRatio,centroidX,centroidY,equivalentDiameter,extent,meanIntensity,orientation,eccentricity
0,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
1,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
2,0,0.334669,0.229176,0.04,1.0,0.894392,0.54224,0.435063,0.684463,0.464267,0.874032
3,0,0.130261,0.086087,0.28,0.040752,0.857569,0.30873,0.720855,0.668596,0.545255,0.602152
4,1,0.1002,0.072654,0.44,0.470218,0.208083,0.261825,0.700916,0.334637,0.793856,0.418157


In [4]:
# Every column except the target is a feature.
# `!=` is required here: `x not in 'label'` is a *substring* test on the string
# 'label', which would also exclude columns named e.g. 'a', 'lab' or 'el'.
featuresColumns = [column for column in data.columns if column != 'label']

# Single stratified split with 23 rows held out for testing; the seed is fixed
# so the split is reproducible.
stf = StratifiedShuffleSplit(n_splits=1, test_size=23, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['label']))

# split() yields positional row indices, so select rows with .iloc.
# (.loc would only be equivalent while the frame has a default RangeIndex.)
xTrain = data.iloc[trainIdx][featuresColumns]
yTrain = data.iloc[trainIdx]['label']

xTest = data.iloc[testIdx][featuresColumns]
yTest = data.iloc[testIdx]['label']

In [5]:
# Fit a decision tree with a fixed seed on the training partition.
# fit() returns the estimator itself, so it can be chained into the assignment.
dt = DecisionTreeClassifier(random_state=2).fit(xTrain, yTrain)
dt

DecisionTreeClassifier(random_state=2)

In [6]:
# Size of the fitted tree as a (node count, maximum depth) pair.
(
    dt.tree_.node_count,
    dt.tree_.max_depth,
)

(9, 4)

In [7]:
def measureErrors(yTrue, yGuess, label):
    """Collect classification scores for one set of predictions.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels.
    yGuess : array-like
        Predicted labels; also passed to ``roc_auc_score`` as its score argument.
    label : str
        Name given to the returned Series (e.g. 'train' or 'test').

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'recall' and 'roc', in that order,
        with ``name`` set to ``label``.
    """
    accuracy = accuracy_score(y_true=yTrue, y_pred=yGuess)
    precision = precision_score(y_true=yTrue, y_pred=yGuess)
    recall = recall_score(y_true=yTrue, y_pred=yGuess)
    roc = roc_auc_score(yTrue, yGuess)

    scores = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'roc': roc,
    }
    return pd.Series(scores, name=label)

In [8]:
# Predict on both partitions with the in-memory tree.
trainPrediction, testPrediction = dt.predict(xTrain), dt.predict(xTest)

# One column of scores per partition, side by side.
trainAndTestErrors = pd.concat(
    [
        measureErrors(yTrain, trainPrediction, 'train'),
        measureErrors(yTest, testPrediction, 'test'),
    ],
    axis=1,
)

trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,0.869565
precision,1.0,0.777778
recall,1.0,0.875
roc,1.0,0.870833


In [11]:
# Persist the fitted tree to 'dt.sav'. The context manager guarantees the file
# is flushed and closed even if pickling fails; a bare open() left the handle
# dangling.
with open('dt.sav', 'wb') as modelFile:
    pickle.dump(dt, modelFile)

In [10]:

# Restore the tree written to 'dt.sav'; the file must already exist.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open('dt.sav', 'rb') as modelFile:
    loaded_model = pickle.load(modelFile)


FileNotFoundError: [Errno 2] No such file or directory: 'dt.sav'

In [58]:
# Raw (unscaled) measurements for a single sample, listed in featuresColumns order.
predictData = [24,15.6568542495,1.2,128.5,318,5.5279063915000002,0.80000000000000004,0.43202614379999998,1.3211229659999999,0.64881682090000004]

# The tree was fitted on min-max scaled features, so a raw sample has to go
# through the same scaling first. The statistics are recomputed from the CSV
# because `data` itself has been overwritten with the scaled values.
scaledColumns = ['centroidX', 'centroidY', 'orientation', 'perimeter', 'meanIntensity',
                 'extent', 'aspectRatio', 'area', 'equivalentDiameter']
rawFeatures = pd.read_csv('./database.csv')[scaledColumns]
featureMin, featureMax = rawFeatures.min(), rawFeatures.max()

# A frame with named columns keeps the feature order explicit and matches the
# frame the model was fitted on (a bare list carries no feature names).
sample = pd.DataFrame([predictData], columns=featuresColumns, dtype=float)
sample[scaledColumns] = (sample[scaledColumns] - featureMin) / (featureMax - featureMin)

prediction = loaded_model.predict(sample)
prob = loaded_model.predict_proba(sample)
print(prediction[0], prob[0])

area                  191.000000
perimeter             107.497475
aspectRatio             0.809524
centroidX             150.581152
centroidY             389.151832
equivalentDiameter     15.594510
extent                  0.535014
meanIntensity           0.328796
orientation             0.604204
eccentricity            0.864884
Name: 12, dtype: float64 1
1 [0. 1.]


In [59]:
# Re-score both partitions, this time with the model restored from disk.
trainPrediction, testPrediction = loaded_model.predict(xTrain), loaded_model.predict(xTest)

# One column of scores per partition, side by side.
trainAndTestErrors = pd.concat(
    [
        measureErrors(yTrain, trainPrediction, 'train'),
        measureErrors(yTest, testPrediction, 'test'),
    ],
    axis=1,
)
trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,0.85
precision,1.0,0.75
recall,1.0,0.857143
roc,1.0,0.851648
