In [1]:
#Import modules used for various functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings("ignore")

In [2]:
class GaussianAnalysis:
    
    def __init__(self, x, y, tx, ty):      
        self.x = x
        self.y = y
        self.phi = None
        self.u0 = None
        self.u1 = None
        self.sigma = None
        self.thetaT = None
        self.theta0 = None
        self.tx = tx
        self.ty = ty
    
    #To find phi parameter
    def find_phi(self):
        self.phi = np.mean(self.y)
    
    #To find mean parameters - u0 and u1
    def mew(self, n):
        
        mew_n = np.zeros([1, 2], float)
        den = 0
        for i in range(self.y.shape[0]):
            if self.y[i] == n:
                mew_n += self.x[i]
                den += 1
        return (mew_n / den).T
    
    #To find covariance
    def find_sigma(self):
        
        sigma = np.zeros((2, 2))
        for i in range(self.y.shape[0]):
            col = np.reshape(self.x[i,:], (self.x.shape[1], 1))
            if self.y[i] == 1:
                sigma += np.dot((col - self.u0), (col - self.u0).T)
            else:
                sigma += np.dot((col - self.u1), (col - self.u1).T)
        sigma /= self.y.shape[0]
        return sigma
    
    #To find theta parameters of the equivalent sigmoid using the formulae found
    def find_parameters(self):
        
        diff_u = self.u1 - self.u0
        sigma_inv = np.linalg.inv(self.sigma)
        self.thetaT = np.dot(sigma_inv, diff_u)
        phi_fact = self.phi/(1 - self.phi)
        const_fact = math.log(phi_fact)
        M1 = np.dot(sigma_inv, self.u0)
        M2 = np.dot(sigma_inv, self.u1)
        self.theta0 = (np.dot(self.u0.T, M1) - np.dot(self.u1.T, M2))*0.5 + const_fact
        
    #To compute sigmoid
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
       
    #Using the final value of Gaussian distribution analysis parameters and returning the expected output
    def GDA_calculation(self, x, y):
        
        predicted = []
        for i in range(y.shape[0]):
            x_i = np.reshape(x[i,:], (x.shape[1], 1))
            parameter = np.dot((self.thetaT).T, x_i) + self.theta0
            prob = self.sigmoid(parameter)
            if prob >= 0.5:
                predicted.append(1)
            else:
                predicted.append(0)
        return predicted
            
    #Calculating TP, TN, FP, FN and finding accuracy
    def accuracy(self, predicted, expected):
        TP= TN= FP= FN = 0
        for i in range(expected.shape[0]):
            if predicted[i]== 1:
                if expected[i] == 1:
                    TP+=1
                else:
                    FP+=1
            else:
                if expected[i] == 0:
                    TN+=1
                else:
                    FN+=1
        accuracy = (TP+TN)/(TP+TN+FP+FN)
        return accuracy

In [3]:
class dataPrepare(GaussianAnalysis):
    
    #Prepare the data set values:
    def __init__(self, train, test):
        
        #Features(x) and Target(y) from training data set
        x = np.array(train[['x_1', 'x_2']], dtype=float)
        y = np.array(train[['y']], dtype=float)
        
        #Features(tx) and Target(ty) from test data set
        tx = np.array(test[['x_1', 'x_2']], dtype=float)
        ty = np.array(test[['y']], dtype=float)
        
        super().__init__(x, y, tx, ty)
    
    def callFunctions(self):
        
        self.find_phi()
        self.u0 = self.mew(0)
        self.u1 = self.mew(1)
        self.sigma = self.find_sigma()
        self.find_parameters()
        predicted = self.GDA_calculation(self.x, self.y)
        trainingAccuracy = self.accuracy(predicted, self.y)
        print('Accuracy of model over training set = ', trainingAccuracy*100,"%")
        
        predicted = self.GDA_calculation(self.tx, self.ty)
        testAccuracy = self.accuracy(predicted, self.ty)
        print('Accuracy of model over test set = ', testAccuracy*100,"%")

In [4]:
#DATA SET 1

train1 = pd.read_csv("ds1_train.csv")
test1 = pd.read_csv("ds1_test.csv")

gda1 = dataPrepare(train1, test1)
gda1.callFunctions()

Accuracy of model over training set =  86.625 %
Accuracy of model over test set =  83.0 %


In [5]:
#DATA SET 2

train2 = pd.read_csv("ds2_train.csv")
test2 = pd.read_csv("ds2_test.csv")

gda2 = dataPrepare(train2, test2)
gda2.callFunctions()

Accuracy of model over training set =  91.375 %
Accuracy of model over test set =  91.0 %
