# 1. Standardization of Data

In [1]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
import numpy as np

samples, labels = load_wine(return_X_y = True)
#print(labels)

means = np.array([])
stDev = np.array([])
for i in range(0,len(samples[0])):
    means = np.append(means, sum(samples[:,i]/float(len(samples))))
    stDev= np.append(stDev, np.std(samples[:,i]))

samplesCentered = samples - means
covX = np.cov(samplesCentered.transpose())
vals, vects = np.linalg.eig(covX)
inds = vals.argsort()[-2:][::-1]    # 2 dimensions

#print(vals)

u1 = np.reshape(vects[:,inds[0]],(13,1))
u2 = np.reshape(vects[:,inds[1]],(13,1))
U = np.concatenate((u1,u2),1)
#print("Matrix U")
#print(U)

#print(U.shape)
#print(samples.shape)

samples2d = np.matmul(samplesCentered, U)
#samples2dT = samples2d.transpose()

fig, ax = plt.subplots(2,1,figsize = (5,10))
i = 0
for sample in samples2d:
    if labels[i] == 0:
        ax[0].plot(sample[0],sample[1], 'rs')
    elif labels[i] == 1:
        ax[0].plot(sample[0],sample[1], 'g^')
    elif labels[i] ==2:
        ax[0].plot(sample[0],sample[1], 'bo')
    i += 1

ax[0].set_title('Centered Data')
#print(samples2d.shape)

samplesUniform = samplesCentered
#print(len(samplesUniform))
for i in range(0,len(samplesUniform)):
    samplesUniform[i] = np.divide(samplesUniform[i],stDev)


covX = np.cov(samplesUniform.transpose())
vals, vects = np.linalg.eig(covX)
inds = vals.argsort()[-2:][::-1]    # 2 dimensions

u1 = np.reshape(vects[:,inds[0]],(13,1))
u2 = np.reshape(vects[:,inds[1]],(13,1))
U = np.concatenate((u1,u2),1)
samples2dU = np.matmul(samplesCentered, U)

i =0
for sample in samples2dU:
    if labels[i] == 0:
        ax[1].plot(sample[0],sample[1], 'rs')
    elif labels[i] == 1:
        ax[1].plot(sample[0],sample[1], 'g^')
    elif labels[i] ==2:
        ax[1].plot(sample[0],sample[1], 'bo')
    i += 1

ax[1].set_title('Centered and Normalized Data')


Text(0.5,1,'Centered and Normalized Data')

As given by the two plots above, using centralized and normalized data provides for significantly better classification of the data. In the first graph, the centered data causes there to be significant overlap between the blue circles and green triangles. In contrast, there is very slight overlap when using the normalized and centered data.

# 2. Naive Bayes Classification

In [2]:
from sklearn.naive_bayes import GaussianNB as Gauss
import random
import math

def prob(x, mean, stdev): # Using Gaussian Distribution
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def classProb(classAttributes, unknown): #Get the probabilities that an unknown vector of features is each class
    probabilities = [1.0,1.0,1.0]
    for attr in classAttributes:
        for i in range(len(attr)):
            mean = attr[i][0]
            std = attr[i][0]
            probabilities[i] *= prob(unknown[i], mean, std)
    return probabilities

def getClass(classAttributes, unknown):
    probs = classProb(classAttributes, unknown)
    return probs.index(max(probs))
    
def getAttr(data, labels):
    class1 = []
    class2 = []
    class3 = []

    for i in range(0,len(labels)):
        if labels[i] == 0:
            class1.append(data[i])        
        elif labels[i] ==1:
            class2.append(data[i])
        elif labels[i] ==2:
            class3.append(data[i])
            
    classAttributes = [] #(mean, std)
    class1 = np.array(class1)
    class2 = np.array(class2)
    class3 = np.array(class3)

    for classes in [class1, class2, class3]:
        means = np.array([])
        stDev = np.array([])
        for i in range(0,len(classes[0])):
            means = np.append(means, sum(classes[:,i]/float(len(classes))))
            stDev= np.append(stDev, np.std(classes[:,i]))
        
        classAttributes.append((means, stDev))
    
    return classAttributes

In [8]:
samplesCentered = samples2d
samplesUniform = samples2dU
shuffleIndex = random.sample(range(178), 178)
samplesCopy = samples
samplesUCopy = samplesUniform
samplesCCopy = samplesCentered

for i in range(len(samples)):
    samples[i] = samplesCopy[shuffleIndex[i]]
    samplesCentered[i] = samplesCCopy[shuffleIndex[i]]
    samplesUniform[i] = samplesUCopy[shuffleIndex[i]]
    labels[i] = labels[shuffleIndex[i]]


labelsList = []
samplesList = []
samplesCList = []
samplesUList = []
indices = [0, 35, 70, 106, 142, 178]
for i in range(len(indices)-1):
    ind1 = indices[i]
    ind2 = indices[i+1]
    labelsList.append(labels[ind1:ind2])
    samplesList.append(samples[ind1:ind2])
    samplesCList.append(samplesCentered[ind1:ind2])
    samplesUList.append(samplesUniform[ind1:ind2])
    
resultsC = []
resultsU = []
resultsG = []
clf = Gauss()
for i in range(0,5): # have each partitioned data be validation
    samplesCListTrain = samplesCList.copy()
    samplesUListTrain = samplesUList.copy()
    labelsTrain = labelsList.copy()
    valC = samplesCListTrain.pop(i)
    valU = samplesUListTrain.pop(i)
    valLabels = labelsTrain.pop(i)
    
    predictionsC = []
    predictionsU = []
    predictionsG = []
    trainC = np.concatenate(samplesCListTrain)
    trainU = np.concatenate(samplesUListTrain)
    trainLabels = np.concatenate(labelsTrain)
    
    attrCTest = getAttr(trainC, trainLabels)
    attrUTest = getAttr(trainU, trainLabels)
    clf.fit(trainU, trainLabels)
    
    #Using 0-1 loss to determine errors
    errorC = 0
    errorU = 0
    errorG = 0 
    for i in range(len(valC)):
        predictC = getClass(attrCTest, valC[i])
        predictU = getClass(attrUTest, valU[i])
        predictG = clf.predict(valU[i].reshape((1,len(valU[i]))))

        if predictC != int(valLabels[i]):
            errorC += 1
        if predictU != int(valLabels[i]):
            errorU += 1
        if predictG != int(valLabels[i]):
            errorG += 1
    
    resultsC.append(errorC)
    resultsU.append(errorU)
    resultsG.append(errorG)
     

print('Errors given by using Naive Bayes for K = 5')
print('Centered Data: ', resultsC, '  Mean:', np.mean(resultsC), '   Variance:', np.var(resultsC))
print('Standardized Data:', resultsU, '  Mean:', np.mean(resultsU), '   Variance:', np.var(resultsU))
print('Using scikitlearn Gaussian: ', resultsG, '  Mean:', np.mean(resultsG), '   Variance:', np.var(resultsG))



Errors given by using Naive Bayes for K = 5
Centered Data:  [17, 14, 13, 12, 18]   Mean: 14.8    Variance: 5.36
Standardized Data: [17, 14, 13, 12, 18]   Mean: 14.8    Variance: 5.36
Using scikitlearn Gaussian:  [0, 0, 0, 1, 0]   Mean: 0.2    Variance: 0.16000000000000003


In the second part, the Naive Bayes Classification was created by calculating the probability that each sample's features is a certain classes feature. From there, the entire proability that a sample is in a certain class can be determined. Based on the ouput and using 0-1 loss to determine the error, the mean and variance of the error can be found between the 5 different validation sets used for K=5 cross validation. 