# Winery classification with the multivariate Gaussian

In this notebook, I am going to classify wines using 13 features. There are 178 wines, each described by the same 13 features. We are going to classify each wine according to the winery it was made in. There are 3 wineries (i.e. 3 classes/labels). I will divide the data into a training set of 130 points and a test set of 48 points.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, multivariate_normal

In [9]:
# Read the comma-separated wine data into a 2-D array.
# Column 0 is used below as the label; columns 1..13 are used as the features.
data = np.loadtxt('wine.data.txt', delimiter=',')

# Display names for the 13 feature columns, in column order.
featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Indices of the features fed to the classifier (here: all 13).
features = list(range(13))

# Shuffle the 178 row indices with a fixed seed so the split is reproducible,
# then use the first 130 shuffled rows for training and the last 48 for testing.
np.random.seed(0)
perm = np.random.permutation(178)
trainx, trainy = data[perm[:130], 1:14], data[perm[:130], 0]
testx, testy = data[perm[130:178], 1:14], data[perm[130:178], 0]

In [10]:
def fit_generative_model(x, y, k=3):
    """Fit one multivariate Gaussian per class, together with the class priors.

    Parameters
    ----------
    x : array-like, shape (n, d)
        Feature matrix, one row per sample.
    y : array-like, shape (n,)
        Class label of each row of ``x``. Labels are expected to be
        1, 2, ..., k.
    k : int, optional
        Number of classes (default 3).

    Returns
    -------
    mu : ndarray, shape (k+1, d)
        ``mu[j]`` is the mean feature vector of class ``j``.
    sigma : ndarray, shape (k+1, d, d)
        ``sigma[j]`` is the biased (divide-by-N) covariance matrix of class ``j``.
    pi : ndarray, shape (k+1,)
        ``pi[j]`` is the fraction of samples whose label is ``j``.

    Index 0 of every returned array is left at zero so that the arrays can
    be indexed directly with the 1-based class labels.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    d = x.shape[1]  # number of features
    mu = np.zeros((k + 1, d))
    sigma = np.zeros((k + 1, d, d))
    pi = np.zeros(k + 1)

    for label in range(1, k + 1):
        indices = (y == label)
        members = x[indices, :]
        mu[label] = np.mean(members, axis=0)
        # rowvar=False: columns are variables; bias=True: normalise by N, not N-1.
        sigma[label] = np.cov(members, rowvar=False, bias=True)
        pi[label] = np.count_nonzero(indices) / float(len(y))
    return mu, sigma, pi

In [11]:
# Fit a Gaussian generative model to the training data:
# mu holds the per-class means, sigma the per-class covariance matrices and
# pi the per-class fractions of training points (all indexed by label 1..3).
mu, sigma, pi = fit_generative_model(trainx, trainy)

In [17]:
# This function returns predictions for each wine in the test data and computes the errors
def test_model(mu, sigma, pi, features, testx, testy):
    n_test=len(testx)
    score=np.zeros((n_test,4))
    
    for i in range(0,n_test):
        for label in range(1,4):
            score[i,label]=np.log(pi[label])+ \
            multivariate_normal.logpdf(testx[i,features], mean=mu[label,:],cov=sigma[label,:,:])
    
    
    predictions=np.argmax(score[:,1:4], axis=1)+1
        
    errors = np.sum(predictions!=testy)
    percent_error=errors/n_test*100
    print("Errors: " + str(errors) + "/" + str(n_test))
    print(percent_error)
    
    return(predictions, percent_error)
                                  
                                  
    

In [18]:
# Classify the held-out test wines with the fitted model; test_model prints
# the number of errors and the error percentage shown below.
predictions, percent_error = test_model(mu, sigma, pi, features, testx, testy)

Errors: 2/48
4.166666666666666
