In [1]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')

random_seed = 42 #set random state to this variable (when applicable) so results can be reproduced

clfList = []

Populating the interactive namespace from numpy and matplotlib


# Loading Dataset and splitting

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.metrics import accuracy_score

In [3]:
#Creating partial slices of the existing train set 
X_train2 = X_train[:249]
y_train2 = y_train[:249]

X_train3 = X_train[250:]
y_train3 = y_train[250:]

In [4]:
class Miner:
    def __init__(self, clf):
        self.clf = clf
        self.performance = 0
        self.correctPredictions = 0
        self.predictionsMade = 0
        self.currentPred = -1
    def getPerformance(self):
        return float(self.correctPredictions / self.predictionsMade)
    def getClf(self):
        return self.clf
    def setCurrentPred(self, pred):
        self.currentPred = pred
    def predict(self, data):
        self.setCurrentPred(self.clf.predict(data))
        return self.currentPred
    def predict_proba(self, data):
        temp = self.clf.predict_proba(data)
        if temp[0][0] > temp[0][1]:
            self.setCurrentPred(0)
        else:
            self.setCurrentPred(1)
        return temp
    def updatePrediction(self, val):
        self.predictionsMade = self.predictionsMade + 1
        if self.currentPred == val:
            self.correctPredictions = self.correctPredictions + 1
    
    

## Decision Tree Classifier

In [5]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=random_seed)
dt = dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(dt))

0.9298245614035088


In [6]:
dt = DecisionTreeClassifier(random_state=random_seed)
dt = dt.fit(X_train2, y_train2)

y_pred = dt.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(dt))

0.9122807017543859


In [7]:
dt = DecisionTreeClassifier(random_state=random_seed)
dt = dt.fit(X_train3, y_train3)

y_pred = dt.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(dt))

0.9181286549707602


## Logistic Regression Classifier

In [8]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=random_seed)
lr = lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(lr))

0.9649122807017544


In [9]:
lr = LogisticRegression(random_state=random_seed)
lr = lr.fit(X_train2, y_train2)

y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(lr))

0.9590643274853801


In [10]:
lr = LogisticRegression(random_state=random_seed)
lr = lr.fit(X_train3, y_train3)

y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(lr))

0.9590643274853801


## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=random_seed)
rf = rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(rf))

0.9590643274853801


In [12]:
rf = RandomForestClassifier(random_state=random_seed)
rf = rf.fit(X_train2, y_train2)

y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(rf))

0.9298245614035088


In [13]:
rf = RandomForestClassifier(random_state=random_seed)
rf = rf.fit(X_train3, y_train3)

y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(rf))

0.935672514619883


## Gaussian Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb = nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(nb))

0.9239766081871345


In [15]:
nb = GaussianNB()
nb = nb.fit(X_train2, y_train2)

y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(nb))

0.9298245614035088


In [16]:
nb = GaussianNB()
nb = nb.fit(X_train3, y_train3)

y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(nb))

0.9122807017543859


## Support Vector Classifier

In [17]:
from sklearn.svm import SVC

svc = SVC(kernel='linear', probability=True, random_state=random_seed)
svc = svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(svc))

0.9590643274853801


In [18]:
svc = SVC(kernel='linear', probability=True, random_state=random_seed)
svc = svc.fit(X_train2, y_train2)

y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(svc))

0.9239766081871345


In [19]:
svc = SVC(kernel='linear', probability=True, random_state=random_seed)
svc = svc.fit(X_train3, y_train3)

y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(svc))

0.9707602339181286


## Nearest Neighbors Classifier

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn = knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(knn))

0.9473684210526315


In [21]:
knn = KNeighborsClassifier()
knn = knn.fit(X_train2, y_train2)

y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(knn))

0.9590643274853801


In [22]:
knn = KNeighborsClassifier()
knn = knn.fit(X_train3, y_train3)

y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

clfList.append(Miner(knn))

0.9473684210526315


In [23]:
import heapq, random

class Chain:
    def __init__(self):
        self.blockList = []
    def addToChain(self, block):
        self.blockList.append(block)
    def getBlockList(self):
        return self.blockList
    
class Block:
    def __init__(self, data, label):
        self.data = data
        self.label = label

def pickRandomValidator(minerList):
    validator = random.choice(minerList)
    minerList.remove(validator)
    return validator

def pickBestValidator(minerList):
    accList = []
    for clf in minerList:
        accList.append(clf.getPerformance())
    validator = minerList[accList.index(max(accList))]
    minerList.remove(validator)
    return validator

def pickFromTopValidator(minerList):
    topList = []
    for clf in minerList:
        if len(topList) < 5:
            topList.append(clf)
        else:
            for cand in topList:
                if clf.getPerformance() > cand.getPerformance():
                    topList.append(clf)
                    topList.remove(cand)
    validator = random.choice(topList)
    minerList.remove(validator)
    return validator


def validateData(minerList, validator, data):
    mProba = [0, 0]
    for clf in minerList:
        mProba = mProba + clf.predict_proba(data)
    mProba = mProba / len(minerList)

    if mProba[0][0] > mProba[0][1]:
        consensus = 0
    else:
        consensus = 1
        
    vPred = validator.predict(data)
    for clf in minerList:
        clf.updatePrediction(vPred)
    if consensus == vPred:
        return consensus
    else:
        validator = None
        return -1



## RANDOM VALIDATOR PER ROUND

In [24]:
test = Chain() #Instantiate a test blockchain

failcount = 0

In [25]:
#Rerun this cell to simulate using random data points from the test set
for i in range(0,10000):
    randomData = random.choice(X_test)
    validator = pickRandomValidator(clfList)
    consensusPred = validateData(clfList, validator, randomData.reshape(1,-1))
    if consensusPred > -1:
        temp = Block(randomData, consensusPred)
        test.addToChain(temp)
    else:
        failcount = failcount + 1
    clfList.append(validator)

print("Validation Failed " + str(failcount) + " times.")
print("Final block height: " + str(len(test.getBlockList())))

Validation Failed 474 times.
Final block height: 9526


## PICK BEST PERFORMING VALIDATOR 

#### (Assumes that the models already have a prediction history)

In [26]:
test2 = Chain() #Instantiate a test blockchain
validator2 = None

failcount2 = 0

In [27]:
#Rerun this cell to simulate using random data points from the test set
for i in range(0,10000):
    randomData = random.choice(X_test)
    if validator2 is None:
        validator2 = pickBestValidator(clfList)
    consensusPred = validateData(clfList, validator2, randomData.reshape(1,-1))
    if consensusPred > -1:
        temp = Block(randomData, consensusPred)
        test2.addToChain(temp)
    else:
        clfList.append(validator2)
        validator2 = None
        failcount2 = failcount2 + 1
        
print("Validation Failed " + str(failcount2) + " times.")
print("Final block height: " + str(len(test2.getBlockList())))

Validation Failed 356 times.
Final block height: 9644


## PICK RANDOM VALIDATOR FROM TOP 5 PERFORMING MODELS

In [28]:
test3 = Chain() #Instantiate a test blockchain
validator3 = None

failcount3 = 0

In [29]:
#Rerun this cell to simulate using random data points from the test set
for i in range(0,10000):
    randomData = random.choice(X_test)
    if validator3 is None:
        validator3 = pickFromTopValidator(clfList)
    consensusPred = validateData(clfList, validator3, randomData.reshape(1,-1))
    if consensusPred > -1:
        temp = Block(randomData, consensusPred)
        test3.addToChain(temp)
    else:
        clfList.append(validator2)
        validator3 = None
        failcount3 = failcount3 + 1
        
print("Validation Failed " + str(failcount3) + " times.")
print("Final block height: " + str(len(test3.getBlockList())))

Validation Failed 359 times.
Final block height: 9641
