In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer


In [3]:
# Load the scikit-learn breast cancer dataset and wrap the features in a
# DataFrame with named columns.
breast_cancer = load_breast_cancer()
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)

# Plain (features, target) arrays, then a seeded 80/20 train/test split.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [4]:
# Estimate phi, mu and sigma (class prior, class mean and class covariance).
def GDA_fit(X_train, y_train):
    """Fit one Gaussian per class (Gaussian Discriminant Analysis).

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like of shape (n_samples,)
        Class labels. Labels may take any values; classes are ordered the
        way ``np.unique`` sorts them, so index ``i`` of every returned array
        refers to the i-th sorted label. For labels ``0..k-1`` the index is
        the label itself.

    Returns
    -------
    phi : ndarray of shape (n_classes,)
        Fraction of training samples belonging to each class.
    mu : ndarray of shape (n_classes, n_features)
        Per-class feature means.
    sigma : ndarray of shape (n_classes, n_features, n_features)
        Per-class sample covariance matrices (``np.cov`` defaults, i.e.
        normalised by ``n - 1``). A class with a single sample therefore
        yields NaN entries.
    """
    # Coerce up front so DataFrames / lists index the same way as ndarrays.
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)

    classes = np.unique(y_train)
    n_classes = len(classes)
    n_features = X_train.shape[1]
    size_train = len(y_train)

    mu = np.zeros((n_classes, n_features))
    phi = np.zeros(n_classes)
    sigma = np.zeros((n_classes, n_features, n_features))

    # Iterate over the labels actually present rather than assuming they are
    # exactly 0..n_classes-1; otherwise a label set such as {1, 3} would
    # produce empty masks and NaN parameters.
    for i, label in enumerate(classes):
        indices = (y_train == label)  # boolean mask selecting this class
        class_samples = X_train[indices]

        phi[i] = np.count_nonzero(indices) / size_train
        mu[i] = class_samples.mean(axis=0)
        sigma[i] = np.cov(class_samples, rowvar=False)

    return phi, mu, sigma
# Fit the class priors, means and covariances on the training split.
phi, mu, sigma = GDA_fit(X_train, y_train)

def multivariate_gaussian(X_val, mu, sigma):
    """Evaluate the multivariate normal density p(x | mu, sigma).

    The density is computed in log-space (``slogdet`` plus a linear solve)
    and exponentiated only at the end. Forming ``(2*pi)**d * det(sigma)``
    directly overflows or underflows for moderate ``d``, which turns the
    result into 0 or NaN even when the true density is representable.

    Parameters
    ----------
    X_val : array-like of shape (d,) or (..., d)
        A single point, or a batch of points with features on the last axis.
    mu : array-like of shape (d,)
        Mean vector.
    sigma : array-like of shape (d, d)
        Covariance matrix.

    Returns
    -------
    score : float or ndarray of shape ``X_val.shape[:-1]``
        Density at each point (a scalar for a single 1-D point).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``sigma`` is singular or has a non-positive determinant.
    """
    X_val = np.asarray(X_val, dtype=float)
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)

    # Feature count comes from the last axis so batches are supported too.
    d = X_val.shape[-1]

    sign, log_det = np.linalg.slogdet(sigma)
    if sign <= 0:
        raise np.linalg.LinAlgError(
            "sigma must have a strictly positive determinant"
        )

    diff = X_val - mu
    flat = diff.reshape(-1, d)
    # Solve sigma @ z = diff instead of forming the explicit inverse.
    solved = np.linalg.solve(sigma, flat.T).T
    # Squared Mahalanobis distance of every point.
    fac = np.einsum('ij,ij->i', flat, solved).reshape(diff.shape[:-1])

    log_score = -0.5 * (fac + d * np.log(2 * np.pi) + log_det)
    return np.exp(log_score)

def _gaussian_log_density(X, mean, cov):
    """Return log N(x | mean, cov) for every row x of the 2-D array X."""
    n_features = X.shape[1]
    sign, log_det = np.linalg.slogdet(cov)
    if sign <= 0:
        raise np.linalg.LinAlgError(
            "covariance must have a strictly positive determinant"
        )
    centered = X - mean
    # Solve cov @ z = centered rather than inverting cov explicitly.
    solved = np.linalg.solve(cov, centered.T).T
    mahalanobis_sq = np.einsum('ij,ij->i', centered, solved)
    return -0.5 * (mahalanobis_sq + n_features * np.log(2 * np.pi) + log_det)


def GDA_predict(X_test, phi, mu, sigma):
    """Predict the most probable class index for every row of ``X_test``.

    Each class is scored with ``log p(x | class) + log phi[class]``. Working
    in log-space keeps the comparison meaningful when the raw densities
    would all underflow to 0, in which case ``argmax`` would always return
    class 0.

    Parameters
    ----------
    X_test : array-like of shape (n_test, n_features)
        Points to classify.
    phi : array-like of shape (n_classes,)
        Class priors.
    mu : array-like of shape (n_classes, n_features)
        Class means.
    sigma : array-like of shape (n_classes, n_features, n_features)
        Class covariance matrices.

    Returns
    -------
    predictions : ndarray of shape (n_test,)
        Index along the class axis of ``phi`` / ``mu`` / ``sigma`` with the
        highest score for each test point.

    Raises
    ------
    numpy.linalg.LinAlgError
        If any class covariance is singular or has a non-positive
        determinant.
    """
    X_test = np.asarray(X_test, dtype=float)
    phi = np.asarray(phi, dtype=float)

    # The class count comes from the fitted parameters, not from any global
    # training labels.
    n_class = len(phi)
    n_test = X_test.shape[0]

    # log(0) = -inf is the intended score for a class with zero prior.
    with np.errstate(divide='ignore'):
        log_prior = np.log(phi)

    log_scores = np.empty((n_test, n_class))
    for label in range(n_class):
        # One vectorised evaluation per class; the covariance factorisation
        # is done once instead of once per test row.
        log_scores[:, label] = (
            _gaussian_log_density(X_test, mu[label], sigma[label])
            + log_prior[label]
        )

    predictions = np.argmax(log_scores, axis=1)
    return predictions
# Classify the held-out split and report the share of correct predictions.
y_pred = GDA_predict(X_test, phi, mu, sigma)
accuracy = (y_pred == y_test).sum()  # number of correct predictions
accuracy_percent = round(accuracy / len(y_test) * 100, 2)
print(f"accuracy = {accuracy_percent}")


accuracy = 95.61
