In [1]:
import os
import math
import operator
import numpy as np
import random
from nbayes import naiveBayes
from nbayes import discretize
from mldata import parse_c45

In [2]:
# Load the three datasets by name and convert each one to a float NumPy array.
# parse_c45 comes from the project-local mldata module; presumably it reads
# C4.5-format data files — confirm against mldata.
# The helpers defined in this notebook read the last column (index -1) as the
# class label, and search() reads column 0 as a row ID.
data1 = np.array(parse_c45('voting').to_float())
data2 = np.array(parse_c45('spam').to_float())
data3 = np.array(parse_c45('volcanoes').to_float())

In [206]:
def stratCrossValid(data):
    """Split ``data`` into five class-stratified folds.

    Works for both discrete and continuous features: only the last column
    (the label) is inspected. Rows whose label equals 1.0 form the positive
    subset, all other rows the negative subset. Each subset is shuffled
    and cut into five consecutive pieces; fold ``k`` is the ``k``-th
    positive piece followed by the ``k``-th negative piece. The first four
    pieces of a subset hold ``len(subset) // 5`` rows each and the fifth
    takes whatever remains.

    Parameters
    ----------
    data : 2-D array-like
        One example per row, label in the last column. The input itself is
        never modified (the subsets are copies).

    Returns
    -------
    tuple of five numpy.ndarray
        ``(fold1, fold2, fold3, fold4, fold5)``.

    Notes
    -----
    * Calls ``np.random.seed(12345)`` on every invocation, so the split is
      reproducible -- and the global NumPy RNG state is reset as a side
      effect.
    * A dataset containing a single class is supported: the missing class
      simply contributes zero rows to every fold.
    """
    n_folds = 5
    data = np.asarray(data)

    np.random.seed(12345) # set random seed to 12345

    if len(data) == 0:
        # Nothing to split; hand back five empty arrays.
        return tuple(np.array([]) for _ in range(n_folds))

    # Boolean-mask indexing returns copies, so shuffling below cannot touch
    # the caller's array, and an absent class still yields a (0, n_cols)
    # array that concatenates cleanly with the other class.
    is_positive = data[:, -1] == 1.0
    subset1 = data[is_positive]
    subset0 = data[~is_positive]

    # Shuffle order matters for reproducibility: positives first, then negatives.
    np.random.shuffle(subset1)
    np.random.shuffle(subset0)

    line1 = len(subset1) // n_folds
    line2 = len(subset0) // n_folds

    folds = []
    for k in range(n_folds):
        if k == n_folds - 1:
            # The last fold absorbs the remainder of each subset.
            pos_part = subset1[line1 * k:]
            neg_part = subset0[line2 * k:]
        else:
            pos_part = subset1[line1 * k:line1 * (k + 1)]
            neg_part = subset0[line2 * k:line2 * (k + 1)]
        folds.append(np.concatenate((pos_part, neg_part), axis=0))

    return tuple(folds)

def getLabelCounts(data):
    """Count positive and negative labels in a 1-D sequence of labels.

    An entry counts as positive when it equals 1.0; every other entry
    counts as negative.

    Returns
    -------
    (negCounts, posCounts) : tuple of int
        Note the order: negatives first, positives second.
    """
    total = len(data)
    positives = sum(1 for idx in range(total) if data[idx] == 1.0)
    # Whatever is not positive is negative by definition.
    return total - positives, positives


def baggingHelper(testData, trainingData, algorithm, iterations):
    """Run ``iterations`` bootstrap rounds and collect the raw predictions.

    In each round a bootstrap sample of ``len(trainingData)`` rows is drawn
    from ``trainingData`` with replacement (one ``random.choice`` call per
    row), and every row of ``testData`` is passed to
    ``naiveBayes(-1, 5, sample, row)``.

    Parameters
    ----------
    testData : sequence of rows
        Rows to classify.
    trainingData : sequence of rows
        Pool the bootstrap samples are drawn from.
    algorithm : str
        Only ``"nbayes"`` is handled; any other value produces no rounds.
    iterations : int
        Number of bootstrap rounds.

    Returns
    -------
    numpy.ndarray
        One row per round and one column per test row, holding whatever
        ``naiveBayes`` returned. Empty when ``algorithm`` is not
        ``"nbayes"``.
    """
    output = []
    if algorithm == "nbayes":
        for _ in range(iterations):
            # Build the bootstrap sample once per round. It was previously
            # re-copied with np.array() for every single test row.
            train = np.array(
                [random.choice(trainingData) for _ in range(len(trainingData))]
            )
            # NOTE(review): the meaning of the first two naiveBayes
            # arguments (-1 and 5) is not visible from this file.
            output.append(
                [naiveBayes(-1, 5, train, testData[k]) for k in range(len(testData))]
            )
    return np.array(output)

def bagging(testData, trainingData, algorithm, iterations):
    """Majority-vote the per-round predictions produced by baggingHelper.

    For every test row the votes equal to 1 and the votes equal to 0 are
    tallied over all rounds (any other value is ignored). The row is
    labelled 1 or 0 according to the larger tally; a tie is broken with
    ``random.randint(0, 1)``.

    Returns a 1-D numpy array with one label per test row, or an empty
    array when ``algorithm`` is not ``"nbayes"``.
    """
    votes = baggingHelper(testData, trainingData, algorithm, iterations)
    decisions = []

    if algorithm == "nbayes":
        for col in range(len(testData)):
            ballots = [votes[rnd, col] for rnd in range(iterations)]
            in_favour = sum(1 for b in ballots if b == 1)
            against = sum(1 for b in ballots if b == 0)
            if in_favour == against:
                # Tie: coin flip.
                decisions.append(random.randint(0, 1))
            else:
                decisions.append(1 if in_favour > against else 0)

    return np.array(decisions)
        
def calcAccuracy(preds, data):
    """Return the fraction of rows in ``data`` that were predicted correctly.

    Parameters
    ----------
    preds : sequence
        Predicted labels; ``preds[i]`` is the prediction for row ``i`` of
        ``data``.
    data : 2-D numpy.ndarray
        Examples with the true label in the last column.

    Returns
    -------
    float
        ``(number of i with preds[i] == data[i, -1]) / len(data)``, or
        ``0.0`` when ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return 0.0

    # Compare every prediction. The previous loop ran over
    # range(1, len(preds) - 1) and so never scored the first and last
    # predictions, under-reporting accuracy.
    correctCounts = sum(1 for i in range(len(preds)) if preds[i] == data[i, -1])

    # float() keeps true division even under Python 2 semantics.
    return float(correctCounts) / total

def search(ID, data):
    """Look up the last-column value of the first row whose column 0 equals ``ID``.

    ``data`` is scanned top to bottom and the scan stops at the first
    match. Returns ``-1`` when no row matches.
    """
    matches = (data[row, -1] for row in range(len(data)) if data[row, 0] == ID)
    # next() pulls only the first hit, so later rows are never examined.
    return next(matches, -1)

In [135]:
# Smoke test of baggingHelper: 5 bootstrap rounds drawn from data3 rows
# 1100-1199, predicting data3 rows 1000-1049. The result has one row per
# round and one column per test row.
test = baggingHelper(data3[1000:1050], data3[1100:1200], "nbayes", 5)
test

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])

In [207]:
# Majority-vote predictions over 5 bagging rounds: bootstrap samples come
# from data2 rows 500-999, predictions are made for data2 rows 100-299.
test2 = bagging(data2[100:300], data2[500:1000], "nbayes", 5)
test2

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
# This cell has no execution count, i.e. it was not run in the saved notebook.
# NOTE(review): every other naiveBayes call in this notebook passes four
# arguments (..., training data, row to classify); this one passes three.
# Confirm the signature allows omitting the last argument.
test3 = naiveBayes(-1, 3, data1[:100])
test3

In [208]:
# Score the bagged predictions against the labels (last column) of the same
# data2 slice that test2 was predicted for.
calcAccuracy(test2, data2[100:300])

0.665

In [114]:
# Check that random.choice on a 2-D array returns one whole row; baggingHelper
# draws its bootstrap samples this way.
random.choice(data2[100:200])

array([1.31000000e+02, 2.67167600e+03, 9.00000000e+00, 7.24116000e+05,
       5.84000000e+00, 5.00000000e+01, 2.00000000e+00, 9.50000000e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.28268800e+00, 8.95040000e-02,
       1.11042000e-01, 4.87910000e-02, 1.39730000e+04, 4.62699432e+03,
       1.00000000e+00])

In [58]:
# Two copies of data1 slices (first 100 rows, and rows 300 onward).
# Neither name is referenced by any other cell visible in this notebook.
qq = np.array(data1[:100])
yy = np.array(data1[300:])

In [171]:
# Classify row 559 of data3, passing all of data3 in the argument position
# that baggingHelper uses for training data, so the classified row is itself
# part of that data.
sample = naiveBayes(-1, 5, data3, data3[559])
sample

1

In [None]:
# First row of the baggingHelper output: the first round's prediction for
# every test row. (Cell has no execution count: not run in the saved notebook.)
test[0]

In [160]:
# Inspect one raw row of data3. Row 1010 is index 10 of the data3[1000:1050]
# slice used as test data for `test`.
data3[1010]

array([1010.,    2.,  113.,  136.,  125.,  134.,  131.,   98.,   94.,
         98.,   97.,  108.,   97.,  100.,  100.,   81.,   94.,  107.,
        106.,  111.,  102.,  135.,  113.,  101.,  105.,   98.,   97.,
        102.,  103.,  102.,   95.,   98.,  110.,  110.,  114.,  113.,
        147.,  135.,  106.,  109.,   92.,   93.,   97.,  104.,  114.,
        101.,   90.,   94.,  104.,  115.,  107.,  133.,  151.,  104.,
         95.,  108.,  106.,  103.,   98.,  101.,  112.,  103.,  100.,
        100.,  114.,  103.,  105.,  148.,  128.,  106.,  100.,   88.,
         96.,   97.,   89.,  100.,  115.,  101.,  103.,  115.,  119.,
        118.,  144.,  151.,  124.,   97.,   89.,   97.,   94.,  102.,
         90.,   91.,  103.,  110.,  127.,  116.,  109.,  147.,  158.,
        142.,  120.,  103.,   96.,   96.,   96.,   78.,   92.,  119.,
        106.,  130.,  120.,  130.,  142.,  127.,  123.,  139.,  108.,
         98.,   86.,   89.,  100.,   92.,  121.,  117.,  166.,  136.,
        129.,  129.,

In [None]:
# Prediction made in the second bagging round (row 1) for the first test row.
test[1][0]