In [487]:
import pandas as pd
import numpy as np
import math

### Misc functions

In [488]:
#imputes the data by calculatig the mean given a class and replacing all values in that class
#that have a feature of "?" with the conditional mean. We round the mean to 0 or 1 when the data
#is categorical
def impute (data, categorical):
    class_dfs = []
    classes = np.unique(data['target'])
    #goes through each label to calculate the conditional mean
    for label in classes:
        #change the "?" to nan so we can compute the mean skipping the nans
        data_with_label = data[data['target']==label].replace('?', np.nan).astype('float')
        means = data_with_label.mean(skipna=True)
        #goes through each feature to replace empty vals with conditional means
        #rounds the mean if the data is categorical
        for feature in data.columns:
            if categorical:
                replacement_val = round(means[feature])
            else:
                replacement_val = means[feature]
            data_with_label[feature] = data_with_label[feature].apply(lambda x: replacement_val if math.isnan(x) else x)
        class_dfs.append(data_with_label)
    return pd.concat(class_dfs).reset_index().drop(columns='index')

def train_test (data, testing_size):
    testing = data.sample(frac=testing_size)
    training = data.drop(list(testing.index))
    #we also want to seperate x and y for each set
    y_training = (training['target'])#, dtype=np.int)
    #drop target so its not considered a feature
    x_training = training.drop(columns= 'target').values

    y_testing = (testing['target'])
    x_testing = testing.drop(columns= 'target').values
    return (x_training, y_training, x_testing, y_testing)

#one hot encodes each sample with a set vector size
#I could have made the vector size dynamic but each dataset that required this had featurs that all had the same max val
def one_hot (x, vector_size):
    #round the data to bucket
    x_rounded = x.astype(int)
    x_one_hot = []
    #for loop to add each feature to the one hot vector
    for feature in x_rounded:
        feature_vector = []
        #goes through each feature and places the 1 in the right place, the rest are 0s
        #this would fail for 0 so i added a special case for it
        for feature_val in feature:
            if feature_val ==0:
                one_hot_feature = [1] + [0]*(vector_size-1)
            else:
                one_hot_feature = [0]*(feature_val-1) + [1] + [0]*(vector_size-feature_val)
            feature_vector += one_hot_feature
        x_one_hot.append(np.asarray(feature_vector, dtype=np.int))
    return np.array(x_one_hot)


### Naive Bayes (Gaussian)

In [489]:
#function that calculates the class frequency and conditional mean and variance for each feature
#returns the above statistics in a dictionary
def fit_gaussian_naive_bayes(x_train, y_train):
    classes = np.unique(y_train)
    class_statistics = {}
    #goes through each class, calculates the stats and adds them to the dictionary
    for c in classes:
        class_statistics[c] = {}
        training_c = x_train[c == y_train]
        
        class_frequency = len(training_c)/len(y_train)
        mean = training_c.mean(axis=0)
        variance = training_c.var(axis=0)
        
        class_statistics[c]['class_frequency'] = class_frequency
        class_statistics[c]['mean'] = list(mean)
        class_statistics[c]['variance'] = list(variance)
    return(class_statistics)

#takes in the fitted stats and the testing data, calculates the gaussian log likelihood for each class, 
#computes the posterior, and returns the class with the max posterior
def predict_gaussian_naive_bayes(stats, x_test):
    classes = list(stats.keys())
    predicted = []
    #goes through each sample in the testing data
    for sample in x_test:
        posteriors = []
        #iterates through each class to calculate the posterior given the sample
        for c in classes:
            prior = stats[c]['class_frequency']
            conditionals = []
            #goes through each feature and calculates the conditional for that feature for the class
            for index in range(0,len(sample)):
                conditionals.append(gaussian_log_likelihood(stats, c, sample[index], index))
            #sums all the individual conditionals logged conditionals to get the overall conditional
            conditional = np.sum(conditionals)
            #calculates the posterior
            posterior = prior + conditional
            posteriors.append(posterior)
        #chooses the max posterior and class and returns them
        predicted.append((classes[np.argmax(posteriors)], max(posteriors)))
    return predicted

#returns the log likelihood for each feature given a class, using the fitted stats
def gaussian_log_likelihood(class_stats, c, x, idx):
    mean = class_stats[c]['mean'][idx]
    variance = class_stats[c]['variance'][idx]
    return np.log((np.exp(-((x-mean)**2)/(2*variance)))/(np.sqrt(2*np.pi*variance)))

### Naive Bayes (Bernoulli)


In [490]:
#function that calculates the class frequency and the probability of a feature given the class
#adds this data to a dictionary
def fit_bernoulli_naive_bayes(x_train, y_train):
    classes = np.unique(y_train)
    class_statistics = {}
    #goes through each class to calculate the above stats
    for c in classes:
        class_statistics[c] = {}
        training_c = x_train[c == y_train]
        class_frequency = len(training_c)/len(y_train)
        probability_given_class =[]
        #goes through each feature calculating the probability of the feature given the class
        for feature in range(0, len(training_c[0])):
            count = (sum([item[feature] for item in training_c]))
            probability_given_class.append(count/sum([item[feature] for item in x_train]))
        #adds the stats to a dictionary
        class_statistics[c]['class_frequency'] = class_frequency
        class_statistics[c]['probability_given_class'] = probability_given_class
    return(class_statistics)

#function that takes in the fitted stats and the training samples and predicts the samples by taking the 
#argmax of the posteriors given a bernoulli distribution
def predict_bernoulli_naive_bayes(stats, x_train):
    classes = list(stats.keys())
    predicted = []
    #iterates through each sample to predict them
    for sample in x_train:
        posteriors = []
        #calculates the posterior for each class
        for c in classes:
            prior = stats[c]['class_frequency']
            conditionals = []
            #goes through each feature to calculate the individual log likelihoods
            for index in range(0,len(sample)):
                conditionals.append(bernoulli_log_likelihood(stats, c, sample[index], index))
            #adds up the conditionals to make the overall for the class
            conditional = np.sum(conditionals)
            posterior = prior + conditional
            posteriors.append(posterior)
        #predicts the class by taking the argmax of the posterior list
        predicted.append(classes[np.argmax(posteriors)])
    return predicted

#function that computes the bernoulli log likelihood of a feature given a class and its fitted stats
def bernoulli_log_likelihood(class_stats, c, x, idx):
    probability = class_stats[c]['probability_given_class'][idx]
    return np.log(probability)*x + np.log(1-probability)*(1-x)

### Winnow-2

In [491]:
#function that takes in training data, a threshold and a learning rate to calculate the fitted weights for each feature
def fit_winnow2(x_train, y_train, threshold = 0.5, learning_rate = 2):
    #initialize the weights to 1
    weights = np.ones(len(x_train[0]))
    y_train = np.array(y_train)
    #goes through each sample to adjust the weights
    for index, sample in enumerate(x_train):
        #calculate our f(x) by multiplying each sample by its respective weight
        fx = np.sum(np.multiply(sample, weights))
        #if the sample is above the threshold, predict 1. If not, predict 0.
        if fx > threshold:
            y_predict = 1
        else:
            y_predict = 0
        #if the prediction is incorrect and the prediction is 0 (actual is 1) we should promote
        #if the prediction is incorrect and the prediction is 1 (actual is 0) we should demote
        if y_predict!=y_train[index]:
            if y_predict == 0:
                weights = promote(sample, weights, learning_rate)
            else:
                weights = demote(sample, weights, learning_rate)
    return weights

#takes in the weights and and feature values, if the feature is 1 we promote by multiplying the weight by the learning rate
def promote(sample, weights, learning_rate):
    new_weights = []
    for index, weight in enumerate(weights):
        if sample[index] == 1:
            new_weights.append(weight*learning_rate)
        else:
            new_weights.append(weight)
    return(np.asarray(new_weights, dtype=np.float64))
#takes in the weights and and feature values, if the feature is 1 we demote by dividing the weight by the learning rate
def demote(sample, weights, learning_rate):
    new_weights = []
    for index, weight in enumerate(weights):
        if sample[index] == 1:
            new_weights.append(weight/learning_rate)
        else:
            new_weights.append(weight)
    return(np.asarray(new_weights, dtype=np.float64))

#takes in the testing data, the fitted weights and the threshold. using the fitted weights we make a prediction for each sample
def predict_winnow2(x_test, weights, threshold):
    predictions = []
    for index, sample in enumerate(x_test):
        #computes f(x) by multiplying the features of the sample by their respective fitted weights
        fx = np.sum(np.multiply(sample, weights))
        #if f(x) is above the threshold, return 1 and the distance from the threshold
        #if it is below, return 0 and set the distance from the threshold to 0 because this will not be used for multiclass
        if fx > threshold:
            predictions.append((1, fx-threshold))
        else:
            predictions.append((0, 0))
    return predictions

# Breast Cancer

In [492]:
#importing breast cancer data and removing the id column
#"Missing Attribute Values: Denoted by "?"" so we need to impute those
breast_cancer = pd.read_csv('breast-cancer-wisconsin.data', sep=",", header=None)
breast_cancer.columns = ["id", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
               "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "target"]
breast_cancer = breast_cancer.drop(columns='id')

In [493]:
#our data labels 2 for benign and 4 for malignant, so let's relabel this to 0 and 1 respectively
breast_cancer['target'] = breast_cancer['target'].replace(2, 0).replace(4, 1)

#the missing attributes are numerical, so we impute the data by 
#finding the mean for that meature given the class
breast_cancer = impute(breast_cancer, False)

In [494]:
#Now we have to split the data into 2/3 training and 1/3 testing
breast_cancer_x_train,breast_cancer_y_train, breast_cancer_x_test, breast_cancer_y_test = train_test (breast_cancer, 1/3)


In [495]:
#Now what we have our training data, we can learn naive bayes
breast_cancer_class_stats = fit_gaussian_naive_bayes(breast_cancer_x_train, breast_cancer_y_train)


In [496]:
#lets now get our predictions for our testing data
breast_cancer_naive_bayes_predicted_data = predict_gaussian_naive_bayes(breast_cancer_class_stats, breast_cancer_x_test)
#grab only the predictions (not the metric)
breast_cancer_naive_bayes_y_predict = [x[0] for x in breast_cancer_naive_bayes_predicted_data]


In [497]:
#compute the accuracy
breast_cancer_naive_bayes_results = [breast_cancer_naive_bayes_y_predict[index]==actual for index, actual in enumerate(breast_cancer_y_test)]
print(breast_cancer_naive_bayes_results.count(True)/len(breast_cancer_naive_bayes_results))

0.9613733905579399


In [498]:
#Next we go on to winnow-2 but we need to one hot encode the x values before training
#We know that the max of each columns is 10, so we use that as out vector size
breast_cancer_x_train_one_hot = one_hot(breast_cancer_x_train, 10)
breast_cancer_x_test_one_hot = one_hot(breast_cancer_x_test, 10)

In [499]:
#fit the winnow-2 algorithm
breast_cancer_weights = fit_winnow2(breast_cancer_x_train_one_hot, breast_cancer_y_train,0.5,2)

In [500]:
#predict the test data and extract the predictions from the prediction data
breast_cancer_winnow_predict_data = predict_winnow2(breast_cancer_x_test_one_hot, weights, 0.5)
breast_cancer_winnow_y_predict = [x[0] for x in breast_cancer_winnow_predict_data]


In [501]:
#compute the accuracy
breast_cancer_winnow_results = [breast_cancer_winnow_y_predict[index]==actual for index, actual in enumerate(breast_cancer_y_test)]
print(breast_cancer_winnow_results.count(True)/len(breast_cancer_winnow_results))


0.9484978540772532


# Iris

In [502]:
#importing iris data
iris = pd.read_csv('iris.data', sep=",", header=None)
iris.columns = ["sepal length in cm", "sepal width in cm", "petal length in cm", "petal width in cm", "target"]


In [503]:
#the iris data set has 3 labels so we must separate this into three classification problems
iris_x_train,iris_y_train, iris_x_test, iris_y_test = train_test(iris, 1/3)
#first classification testing for Iris-setosa
iris_setosa_y_train = iris_y_train.replace('Iris-setosa', 1).replace('Iris-versicolor', 0).replace('Iris-virginica', 0)
iris_setosa_y_test = iris_y_test.replace('Iris-setosa', 1).replace('Iris-versicolor', 0).replace('Iris-virginica', 0)
iris_setosa_class_stats = fit_gaussian_naive_bayes(iris_x_train, iris_setosa_y_train)
iris_setosa_naive_bayes_predicted_data = predict_gaussian_naive_bayes(iris_setosa_class_stats, iris_x_test)

#second classification testing for Iris-versicolor
iris_versicolor_y_train = iris_y_train.replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 0)
iris_versicolor_y_test = iris_y_test.replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 0)
iris_versicolor_class_stats = fit_gaussian_naive_bayes(iris_x_train, iris_versicolor_y_train)
iris_versicolor_naive_bayes_predicted_data = predict_gaussian_naive_bayes(iris_versicolor_class_stats, iris_x_test)

#third classification testing for Iris-virginica
iris_virginica_y_train = iris_y_train.replace('Iris-setosa', 0).replace('Iris-versicolor', 0).replace('Iris-virginica', 1)
iris_virginica_y_test = iris_y_test.replace('Iris-setosa', 0).replace('Iris-versicolor', 0).replace('Iris-virginica', 1)
iris_virginica_class_stats = fit_gaussian_naive_bayes(iris_x_train, iris_virginica_y_train)
iris_virginica_naive_bayes_predicted_data = predict_gaussian_naive_bayes(iris_virginica_class_stats, iris_x_test)



In [504]:
#using the multi-class binary classification approach
iris_naive_bayes_y_predict = []
#using the prediction data, we label the sample with the class that has the highest posterior
for index in range(0, len(iris_virginica_naive_bayes_predicted_data)):
    posteriors = [iris_setosa_naive_bayes_predicted_data[index][1], iris_versicolor_naive_bayes_predicted_data[index][1], iris_virginica_naive_bayes_predicted_data[index][1]]
    iris_naive_bayes_y_predict.append(np.argmax(posteriors))

#compute the accuracy
iris_y_test_number_values = iris_y_test.replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 2)
iris_naive_bayes_results = [iris_naive_bayes_y_predict[index]==actual for index, actual in enumerate(iris_y_test_number_values)]
print(iris_naive_bayes_results.count(True)/len(iris_naive_bayes_results))


0.74


In [505]:
#using multiclass
iris_y_train_multi = iris_y_train.replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 2)
iris_y_test_multi = iris_y_test.replace('Iris-setosa', 0).replace('Iris-versicolor', 1).replace('Iris-virginica', 2)
iris_class_stats = fit_gaussian_naive_bayes(iris_x_train, iris_y_train_multi)
iris_naive_bayes_predicted_data = predict_gaussian_naive_bayes(iris_class_stats, iris_x_test)

In [506]:
#using multi-class classification
iris_naive_bayes_y_predict = [x[0] for x in iris_naive_bayes_predicted_data]

iris_naive_bayes_results_multi = [iris_naive_bayes_y_predict[index]==actual for index, actual in enumerate(iris_y_test_multi)]
print(iris_naive_bayes_results_multi.count(True)/len(iris_naive_bayes_results_multi))


0.94


In [507]:
#Next we go on to winnow-2 but we need to one hot encode the x values before training
#We know that the max of each columns is 8, so we use that as out vector size
iris_x_train_one_hot = one_hot(iris_x_train, 8)
iris_x_test_one_hot = one_hot(iris_x_test, 8)

In [508]:
#we already have our y values ready so we can begin to train
#iris setosa
iris_setosa_weights = fit_winnow2(iris_x_train_one_hot, iris_setosa_y_train,0.5,2)
iris_setosa_winnow_predict_data = predict_winnow2(iris_x_test_one_hot, iris_setosa_weights, 0.5)

#iris versicolor
iris_versicolor_weights = fit_winnow2(iris_x_train_one_hot, iris_versicolor_y_train,0.5,2)
iris_versicolor_winnow_predict_data = predict_winnow2(iris_x_test_one_hot, iris_versicolor_weights, 0.5)

#iris virginica
iris_virginica_weights = fit_winnow2(iris_x_train_one_hot, iris_virginica_y_train,0.5,2)
iris_virginica_winnow_predict_data = predict_winnow2(iris_x_test_one_hot, iris_virginica_weights, 0.5)

iris_winnow_y_predict = []
#using the prediction data, we label the sample with the class that has the furthest distance from the threshold
for index in range(0, len(iris_setosa_winnow_predict_data)):
    posteriors = [iris_setosa_winnow_predict_data[index][1], iris_versicolor_winnow_predict_data[index][1], iris_virginica_winnow_predict_data[index][1]]
    iris_winnow_y_predict.append(np.argmax(posteriors))

#compute the accuracy
iris_winnow_results = [iris_winnow_y_predict[index]==actual for index, actual in enumerate(iris_y_test_number_values)]
print(iris_winnow_results.count(True)/len(iris_winnow_results))


0.68


# House Votes

In [509]:
#importing house vote data
#Missing Attribute Values: Denoted by "?"
house_vote = pd.read_csv('house-votes-84.data', sep=",", header=None)
house_vote.columns = ["target", "handicapped-infants", "water-project-cost-sharing", "adoption-of-the-budget-resolution", "physician-fee-freeze",
               "el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban", "aid-to-nicaraguan-contras", "mx-missile", "immigration",
               "synfuels-corporation-cutback", "education-spending", "superfund-right-to-sue", "crime", "duty-free-exports", "export-administration-act-south-africa"]

In [510]:
#our labels are democrat and republican, so let's relabel them to 0 and 1 respectively
house_vote['target'] = house_vote['target'].replace('democrat', 0).replace('republican', 1)

#labels are represented as "y" or "n", so we change this to "1" and "0" respectively
house_vote = house_vote.replace("y", 1).replace("n", 0)
#we also have some "?" for missing data. Given that this data is categorical, let's take the mean given the class and round it to 0 or 1 for imputation
house_vote = impute(house_vote, True)

In [511]:
#split the data into training and testing
house_vote_x_train,house_vote_y_train, house_vote_x_test, house_vote_y_test = train_test(house_vote, 1/3)


In [512]:
#fit the stats using bernoulli thisw time because our data is categorical with 0 and 1 vals
class_stats = fit_bernoulli_naive_bayes(house_vote_x_train, house_vote_y_train)
#use the bernoulli prediction
y_predict = predict_bernoulli_naive_bayes(class_stats, house_vote_x_test)

In [513]:
#compute the accuracy
iris_winnow_results = [y_predict[index]==actual for index, actual in enumerate(house_vote_y_test)]
print(iris_winnow_results.count(True)/len(iris_winnow_results))


0.896551724137931


In [514]:
#no more preprocessing for winnow this time because the data is categorical with 0 and 1 data 
house_vote_weights = fit_winnow2(house_vote_x_train, house_vote_y_train,0.5,2)
house_vote_winnow_predict_data = predict_winnow2(house_vote_x_test, house_vote_weights, 0.5)
house_vote_winnow_y_predict = [x[0] for x in house_vote_winnow_predict_data]
#compute the accuracy
house_vote_winnow_results = [house_vote_winnow_y_predict[index]==actual for index, actual in enumerate(house_vote_y_test)]
print(house_vote_winnow_results.count(True)/len(house_vote_winnow_results))

0.8206896551724138


In [357]:
breast_cancer[breast_cancer['target']==0].mean()

Clump Thickness                2.956332
Uniformity of Cell Size        1.325328
Uniformity of Cell Shape       1.443231
Marginal Adhesion              1.364629
Single Epithelial Cell Size    2.120087
Bare Nuclei                    1.346847
Bland Chromatin                2.100437
Normal Nucleoli                1.290393
Mitoses                        1.063319
target                         0.000000
dtype: float64

In [358]:
breast_cancer[breast_cancer['target']==1].mean()

Clump Thickness                7.195021
Uniformity of Cell Size        6.572614
Uniformity of Cell Shape       6.560166
Marginal Adhesion              5.547718
Single Epithelial Cell Size    5.298755
Bare Nuclei                    7.627615
Bland Chromatin                5.979253
Normal Nucleoli                5.863071
Mitoses                        2.589212
target                         1.000000
dtype: float64