### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import model_selection
from Perceptron import Perceptron

### Constants

In [2]:
# --- Input data & run mode ---
PATH_FILE_DS_CANCER = 'file/ds-breast-cancer-wisconsin/wdbc.data'
IS_DEBUG = False  # passed as `reproduceble` to getCrossValidationData (fixed split seed when True)

# --- Repeated random-split evaluation ---
CROSS_VAL_TEST_SIZE = 0.3  # fraction of each class held out for testing
CROSS_VAL_TIMES = 20       # number of independent split/train/test repetitions

# --- Class encoding: raw string label in the file -> numeric label ---
CLASS_BAD_STR, CLASS_BAD_NUM = 'M', 0
CLASS_GOOD_STR, CLASS_GOOD_NUM = 'B', 1

# --- Column names ---
COLUMN_ID = 'id'
# NOTE(review): this column stores CLASS_BAD_NUM (0) for bad rows and
# CLASS_GOOD_NUM (1) for good rows, so a value of 1 means "good" despite the
# name 'is_bad'. Left unchanged because other cells may rely on the name.
COLUMN_CLASS = 'is_bad'

## Build DataSet

In [3]:
# Import file
# The WDBC data file has no header row. Without header=None pandas would use
# the first sample as column names, and the rename below would then silently
# discard that sample.
cancerDF = pd.read_csv(PATH_FILE_DS_CANCER, header=None)

# Name columns: id, class label, then f_1 .. f_N for every remaining column
columnNames = [COLUMN_ID, COLUMN_CLASS] + ['f_' + str(i + 1) for i in range(0, cancerDF.columns.size - 2)]
cancerDF.columns = columnNames

# Clear data
# Encode the string label as a number: CLASS_BAD_STR -> CLASS_BAD_NUM, every
# other value -> CLASS_GOOD_NUM.
cancerDF[COLUMN_CLASS] = cancerDF[COLUMN_CLASS].apply(lambda c: (CLASS_BAD_NUM if c == CLASS_BAD_STR else CLASS_GOOD_NUM))
# Convert column by column (DataFrame.applymap is deprecated in recent pandas);
# values that cannot be parsed become NaN ...
cancerDF = cancerDF.apply(pd.to_numeric, errors='coerce')
# ... and any row containing a NaN is dropped.
cancerDF = cancerDF[ cancerDF.notnull().all(axis=1) ]

# Separate labels & features clean data
# Feature frames exclude both the id and the class column.
allGoodFeatures = cancerDF[ cancerDF[COLUMN_CLASS] == CLASS_GOOD_NUM ].drop([COLUMN_ID, COLUMN_CLASS], axis=1)
allBadFeatures = cancerDF[ cancerDF[COLUMN_CLASS] == CLASS_BAD_NUM ].drop([COLUMN_ID, COLUMN_CLASS], axis=1)

# Constant label columns of shape (n_rows, 1), one per class
allGoodLabels = np.full((allGoodFeatures.shape[0], 1), CLASS_GOOD_NUM)
allBadLabels = np.full((allBadFeatures.shape[0], 1), CLASS_BAD_NUM)


## Utils

In [4]:

def getCrossValidationData(features: pd.DataFrame, labels: pd.DataFrame, testSize: float, reproduceble = False) -> dict:
    '''
        Randomly split features and labels into train and test partitions.

        Thin wrapper around sklearn's model_selection.train_test_split that
        converts every returned partition to a numpy array.

        Parameters:
            features:     samples to split, one row per sample.
            labels:       labels aligned row-by-row with `features`
                          (this notebook passes numpy arrays here).
            testSize:     forwarded to train_test_split as `test_size`.
            reproduceble: when truthy the split uses random_state=100, so
                          every call yields the same partition; otherwise
                          random_state is None.

        Returns:
            dict with the keys 'xTrain', 'xTest', 'yTrain', 'yTest', each
            mapped to a numpy array.
    '''

    seed = None
    if reproduceble:
        seed = 100

    # train_test_split returns [xTrain, xTest, yTrain, yTest] in this order
    partitions = model_selection.train_test_split(features, labels, test_size=testSize, random_state=seed)
    partitionNames = ('xTrain', 'xTest', 'yTrain', 'yTest')
    return {name: np.array(part) for name, part in zip(partitionNames, partitions)}


## Train

In [5]:

# One accuracy value per split/train/test repetition
accuracyPerRun = []

for i in range(CROSS_VAL_TIMES):

    # Split each class separately with the same test fraction.
    # Each result is { 'xTrain': ..., 'xTest': ..., 'yTrain': ..., 'yTest': ... }
    goodData = getCrossValidationData(allGoodFeatures, allGoodLabels, CROSS_VAL_TEST_SIZE, reproduceble=IS_DEBUG)
    badData = getCrossValidationData(allBadFeatures, allBadLabels, CROSS_VAL_TEST_SIZE, reproduceble=IS_DEBUG)

    # Train: stack good rows first, then bad rows (same order for X and Y)
    featuresTrain = np.append(goodData['xTrain'], badData['xTrain'], axis=0)
    labelsTrain = np.append(goodData['yTrain'], badData['yTrain'], axis=0)

    perceptron = Perceptron(
        X=featuresTrain,
        Y=labelsTrain,
        tolerance=0.01,
        step=.5,
        maxIterations=500,
        actvFunc='step',
    )
    w = perceptron.train()

    # Test: same good-then-bad stacking so predictions line up with labels
    featuresTest = np.append(goodData['xTest'], badData['xTest'], axis=0)
    labelsTest = np.append(goodData['yTest'], badData['yTest'], axis=0)
    prediction = np.apply_along_axis(perceptron.getPrediction, 1, featuresTest)

    # Flatten the (n, 1) label column before the element-wise comparison
    test = prediction == labelsTest.reshape(-1)
    accuracyPerRun.append(test.mean())

accuracies = np.array(accuracyPerRun)

print('accuracies:', accuracies.mean(), accuracies)
print('stdDeviation:', accuracies.std())

accuracies: 0.9026162790697676 [0.9127907  0.93023256 0.87209302 0.91860465 0.93023256 0.91860465
 0.93604651 0.90697674 0.87790698 0.90116279 0.9244186  0.91860465
 0.85465116 0.93023256 0.87790698 0.89534884 0.8255814  0.9244186
 0.86046512 0.93604651]
stdDeviation: 0.030481699285746414
