In [2]:
import os
import math
import operator
import numpy as np
import random
from weighted_nbayes import weightedNaiveBayes
from mldata import parse_c45

In [3]:
data1 = np.array(parse_c45('voting').to_float())
data2 = np.array(parse_c45('spam').to_float())
data3 = np.array(parse_c45('volcanoes').to_float())

In [145]:
def stratCrossValid(data): # stratified 5-fold-validation for both discrete and continuous cases
    subset0 = []
    subset1 = []
    fold1 = []
    fold2 = []
    fold3 = []
    fold4 = []
    fold5 = []
    for i in range(0, len(data)):
        if  1.0 == data[i,-1]:
            subset1.append(data[i])
        else:
            subset0.append(data[i])
    subset1 = np.array(subset1)
    subset0 = np.array(subset0)
    
    np.random.seed(12345) # set random seed to 12345
    np.random.shuffle(subset1)
    np.random.shuffle(subset0)  
    line1 = int(len(subset1) / 5)
    line2 = int(len(subset0) / 5)
    
    temp1 = subset1[0:line1]
    temp2 = subset0[0:line2]
    fold1 = np.concatenate((temp1,temp2), axis = 0)
    temp3 = subset1[line1:line1 * 2]
    temp4 = subset0[line2:line2 * 2]
    fold2 = np.concatenate((temp3,temp4), axis = 0)
    temp5 = subset1[line1 * 2:line1 * 3]
    temp6 = subset0[line2 * 2:line2 * 3]
    fold3 = np.concatenate((temp5,temp6), axis = 0)
    temp7 = subset1[line1 * 3:line1 * 4]
    temp8 = subset0[line2 * 3:line2 * 4]
    fold4 = np.concatenate((temp7,temp8), axis = 0)
    temp9 = subset1[line1 * 4:len(subset1)]
    temp10 = subset0[line2 * 4:len(subset0)]
    fold5 = np.concatenate((temp9,temp10), axis = 0)
    
    return fold1,fold2,fold3,fold4,fold5

def getLabelCounts(data):
    posCounts = 0
    negCounts = 0

    for i in range(0, len(data)):
        if  data[i] == 1.0:
            posCounts += 1 # number of positive labels
        else:
            negCounts += 1 # number of negative labels
    return negCounts, posCounts

def getLabel(index, data):

    if data[index,-1] == 1:
        return 1
    else:
        return -1


def boosting(testData, trainingData, algorithm, iterations):
    output = []
    weights = np.zeros(len(trainingData)) + 1 / len(trainingData)
    votingWeights = np.zeros(iterations)
    
    if (algorithm == "nbayes"):
        for i in range(0, iterations):
            temp = []
            for j in range(0, len(trainingData)):
                temp.append(weightedNaiveBayes(3,3,trainingData,trainingData[j],weights))
            temp = np.array(temp)
            weightedError = calcError(weights, temp, trainingData)
            
            
            if (weightedError == 0 or weightedError >= 0.5):
                break
            else:
                votingWeights[i] = 0.5 * np.log((1 - weightedError) / weightedError)
                new_weights = []
                for k in range(0, len(trainingData)):
                     new_weights.append(weights[k] * np.exp(-votingWeights[i] * getLabel(k, trainingData) * temp[k]))
                for m in range(0, len(trainingData)):
                    weights[m] = new_weights[m] / np.sum(new_weights)
            

        fx = 0
        needConvert = []
        for n in range(0, len(testData)):
            for g in range(0, iterations):
                fx = fx + votingWeights[g] / np.sum(votingWeights) * weightedNaiveBayes(3,3,trainingData,testData[n],weights)
            needConvert.append(fx)
            
        output.append(1)
        for p in range(1,len(needConvert)):
            if (needConvert[p] - needConvert[p - 1] >= 0.99):
                output.append(1)
            else:
                output.append(-1)
            
    
    output = np.array(output)
    return output
        
def calcAccuracy(preds, data): #
    correctCounts = 0.0
    for i in range(0, len(preds)):
        if preds[i] == data[i,-1] == 1:
            correctCounts = correctCounts + 1
        elif preds[i] == -1 and data[i,-1] == 0:
            correctCounts = correctCounts + 1
    accuracy = correctCounts / len(data)
    
    return accuracy

def calcError(weights, preds, data): #
    error = 0.0
    for i in range(0, len(preds)):
        if preds[i] == -1 and data[i,-1] == 1:
            error = error + weights[i] * 1
        elif preds[i] == 1 and data[i,-1] == 0:
            error = error + weights[i] * 1
    
    return error

def applyWeights(data, weight):
    output = []
    for i in range(0,len(data)):
        output.append(weightedNaiveBayes(3,3,data,data[i],weight))
    output = np.array(output)
    return output

In [8]:
weight = np.zeros(440) + 1/440
len(weight)

440

In [152]:
test = boosting(data3[:10], data3[900:1000], "nbayes", 5)

In [147]:
weightedNaiveBayes(3,3,data3[:50],data3[51],weight)

1

In [150]:
test

array([ 1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [84]:
len(test)

20

In [153]:
calcAccuracy(test, data3[:10])

1.0

In [160]:
data3[1010]

array([1010.,    2.,  113.,  136.,  125.,  134.,  131.,   98.,   94.,
         98.,   97.,  108.,   97.,  100.,  100.,   81.,   94.,  107.,
        106.,  111.,  102.,  135.,  113.,  101.,  105.,   98.,   97.,
        102.,  103.,  102.,   95.,   98.,  110.,  110.,  114.,  113.,
        147.,  135.,  106.,  109.,   92.,   93.,   97.,  104.,  114.,
        101.,   90.,   94.,  104.,  115.,  107.,  133.,  151.,  104.,
         95.,  108.,  106.,  103.,   98.,  101.,  112.,  103.,  100.,
        100.,  114.,  103.,  105.,  148.,  128.,  106.,  100.,   88.,
         96.,   97.,   89.,  100.,  115.,  101.,  103.,  115.,  119.,
        118.,  144.,  151.,  124.,   97.,   89.,   97.,   94.,  102.,
         90.,   91.,  103.,  110.,  127.,  116.,  109.,  147.,  158.,
        142.,  120.,  103.,   96.,   96.,   96.,   78.,   92.,  119.,
        106.,  130.,  120.,  130.,  142.,  127.,  123.,  139.,  108.,
         98.,   86.,   89.,  100.,   92.,  121.,  117.,  166.,  136.,
        129.,  129.,

In [None]:
test[1][0]