In [7]:
import numpy as np
from random import sample
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
class BayesianClassifier:

    '''
    Bayesian Clasifier
    
    ----------
    Paramaters
    ----------
    
    - x: data to be classified
    - clusters:
     
    ------
    Return
    ------
    Index which shows what cluster x belongs to
    '''
    
    def fit(self, data_X, data_y):
        '''
        Fit function. 
        ----------
        Paramaters
        ----------
        - data_X: Data Matrix training set.
        - data_y: Vector of classes. data_X[i,:] class is
            data_y[i].
        '''
        
        # Prepare data. We need data in clusters
        classes_set = list(set(data_y))
        clusters = [[] for i in classes_set]
        
        # If i-data has class j, introduces it in cluster j
        for i in range(0, len(data_y)):
            clusters[classes_set.index(data_y[i])].append(data_X[i])
        
        k = len(clusters)
        clusters = np.array([np.array(c) for c in clusters])
        
        d = len(data_X[0]) #Dimensions
        
        self.sizeCluster = []
        self.covarianceCluster = []
        self.meanCluster = []
        self.N = len(data_X)

        for i in range (0, k):
            #Compute size of cluster
            self.sizeCluster.append(len(clusters[i]))

        

        
        
        for i in range (0, k):

            #Compute mean of cluster
            self.meanCluster.append([np.average([clusters[i][:,j]]) for j in range(0, d)])

            #Compute the stimated covariance matrix of cluster
            aux = np.zeros((d,d))
            for j in range (0, len(clusters[i])):
                nDisper = np.subtract(clusters[i][j], self.meanCluster[i])
                product = np.outer(nDisper,nDisper)
                aux = aux + product
            aux = aux / self.sizeCluster[i]
            self.covarianceCluster.append(aux)
            
        self.inverse = [np.linalg.inv(i) for i in self.covarianceCluster]   
        
        
    def predict(self, x):
        '''
        Predict function.
        ----------
        Paramaters
        ----------
        - test_X: Data Matrix. Function will predict the
            class for each data test_X[i,:].

        ------
        Return
        ------
        - test_y: Vector of classes. test_X[i,:] class is
            test_y[i].
        '''
        solutions = []
        #Compute the goal function for the cluster. 
        #We will compute the function in multiple steps

        #First we find the distance between point x and cluster's mean
        distances = [np.subtract(x, i) for i in self.meanCluster]

        solutions = [np.dot(np.dot(distances[i], self.inverse[i]),distances[i]) for i in range(0, len(distances))]

        solutions = [solutions[i] + np.log(np.linalg.norm(self.covarianceCluster[i]) -2*np.log(self.sizeCluster[i]/self.N)                                       for i in range(0, len(solutions)))]

        #Now we can calculate the whole objective funciont
        #Np.linalg.norm gives the Frobenius norm for a matrix
        #solutions.append((aux2 + np.log(np.linalg.norm(self.covarianceCluster[i]) -2*np.log(self.sizeCluster[i]/n))))

        #We return the index which minices our goal function
        return solutions.index(min(solutions))

In [32]:
def clusterPlot(x,k,clusters, xAxis, yAxis,):
    '''
    Draw all members of a cluster list and the point x in the cluster k.
    
    ----------
    Paramaters
    ----------
    - x = single point
    - k = cluster which x belongs to
    - clusters: Clusters list and each cluster is a data Matrix of points
    - xAxis: coordinate x in graphic.
    - yAxis: coordinate y in graphic.
    '''
    for i in range (0, len(clusters)):
        c = clusters[i]
        p = plt.plot(c[:,xAxis], c[:,yAxis], 'o')
        if (k == i):
            plt.plot(x[xAxis], x[yAxis],'x', color = p[-1].get_color()) #Last used color, so it belongs to i-cluster
    plt.show()

In [33]:
data = pd.read_csv('seeds_dataset.txt')
data = data.values
X = data[:,:-1]
y = data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

model = BayesianClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test[0])

NameError: name 'i' is not defined

In [3]:
[1,2,3,1].index(1)

0