## TP2 : Classification using Linear & Quadratic Discriminant Analysis

First think of configuring your notebook :

In [1]:
import csv
# import os
from pylab import *
import numpy as np
from numpy import linalg as la


## Reading synthetic data
Load the training and test data sets `synth_train.txt` and
`synth_test.txt` already used for k-NN. Targets belong to $\{1,2\}$ and entries belong to $\mathbb{R}^2$.
We have 100 training data samples and 200 test samples.

* the 1st column contains the label of the class of the sample,
* columns 2 & 3 contain the coordinates of each sample in 2D.

In [2]:
# Each file is a whitespace-delimited table: column 0 is the class label,
# columns 1 and 2 are the 2-D coordinates of the sample.
train, test = (np.loadtxt(path) for path in ('synth_train.txt', 'synth_test.txt'))

## Recall about the main steps of discriminant analysis:
* estimation of weights `pi_1` and `pi_2` for each class,
* estimation of empirical means `mu_1` and `mu_2` for each class, 
* estimation of empirical covariance matrices  `sigma_1` and `sigma_2`,
* computation of the common averaged covariance `sigma` (average of intra-class covariances),
* computation of log-probabilities of belonging to each class,
* decision of classification,
* display results.


## TO DO : linear & quadratic discriminant analysis (LDA & QDA)
1. Implement a classifier using LDA of the data set. 
2. Then implement QDA classification.
3. In each case (LDA & QDA) show the decision boundary and
compute the error rate respectively for the training set and the test set. 
4. Compare and comment on your results with LDA and QDA.
5. You may also compare your results to K nearest neighbours.

_Indication 1 : matrices `sigma` are of size 2x2.
More generally, be careful of the sizes of vectors and matrices you
manipulate._

_Indication 2 : to display the regions of decision, you may use:_


First, we implement two classes, `LDA` and `QDA`, to create two classifiers:

In [91]:
from tkinter import Y
import pandas as pd
import plotly.express as px


class LDA():
    """Linear discriminant analysis for the two-class, two-feature `synth` dataset.

    The classifier is built from a training array and a test array whose columns
    are (class label, x1, x2). Class labels are expected to be 1 and 2.
    Every estimator is recomputed from ``train_df`` on each call; nothing is cached.
    """

    # Class labels, in the order used by every per-class estimator:
    # entry/row ``i`` of pi, mu and the score matrix always refers to ``_CLASSES[i]``.
    _CLASSES = (1, 2)
    # Names of the feature columns.
    _FEATURES = ['x1', 'x2']

    def __init__(self, train, test):
        self.train_df = pd.DataFrame(train,columns = ['classe', 'x1', 'x2'])
        self.test_df = pd.DataFrame(test,columns = ['classe', 'x1', 'x2'])
        # Used in the title and file name of the decision-boundary figure.
        self.type = "LDA"

    def _class_features(self, label)->np.ndarray:
        """Return the training features of the samples whose class equals ``label``, one row per sample."""
        in_class = self.train_df.classe == label
        return self.train_df.loc[in_class, self._FEATURES].to_numpy(dtype=float)

    def get_pi_estimators(self)->list:
        """Returns the pi estimators (empirical class proportions) for each class.

        The proportions are returned in class order (class 1 first, then class 2),
        so that ``pi[i]`` matches row ``i`` of ``get_mu_estimators()`` regardless of
        which class is the most frequent.

        Returns:
            list: list of floats
        """
        labels = self.train_df.classe.to_numpy()
        return [float(np.mean(labels == c)) for c in self._CLASSES]

    def get_mu_estimators(self)->np.ndarray:
        """Returns the mu estimators for each class.

        Returns:
            np.ndarray: an array where each line returns the vector mu (estimator) for the concerned class
        """
        return np.vstack([self._class_features(c).mean(axis=0) for c in self._CLASSES])

    def get_sigma_estimators(self)->np.ndarray:
        """Returns the mean sigma estimator.

        The within-class scatter matrices are summed over the classes and divided
        by the total number of training samples.

        Returns:
            np.ndarray: Sigma array in dimension 2x2
        """
        mu = self.get_mu_estimators()
        n_features = len(self._FEATURES)
        scatter = np.zeros((n_features, n_features))
        for i, c in enumerate(self._CLASSES):
            centered = self._class_features(c) - mu[i]
            # Sum over the samples of (x - mu)(x - mu)^T, done as one matrix product.
            scatter += centered.T @ centered
        return scatter / self.train_df.shape[0]

    def get_log_probabilities(self, df:pd.DataFrame)->np.ndarray:
        """Compute the log_probability for the entries.

        For each class k the linear score is
        ``log(pi_k) + x^T Sigma^-1 mu_k - 1/2 mu_k^T Sigma^-1 mu_k``.

        Args:
            df (pd.DataFrame): DataFrame with the columns x1 and x2

        Returns:
            np.ndarray: Matrix in dim Nx2 where N is the shape of entries
        """
        pi = np.asarray(self.get_pi_estimators(), dtype=float)
        mu = self.get_mu_estimators()
        sigma = self.get_sigma_estimators()
        x = df[self._FEATURES].to_numpy(dtype=float)
        # Invert the shared covariance once instead of once per sample and class.
        sigma_inv_mu = la.inv(sigma) @ mu.T
        # Part of the score that does not depend on x, one value per class.
        offsets = np.log(pi) - 0.5 * np.sum(mu.T * sigma_inv_mu, axis=0)
        return x @ sigma_inv_mu + offsets

    def classification(self, train=True)->np.ndarray:
        """Returns the classification using discriminant analysis.

        Args:
            train (bool, optional): Use the trainset if True and the testset if not. Defaults to True.

        Returns:
            np.ndarray: Vector with class id for each entry.
        """
        df = self.train_df if train else self.test_df
        prediction = self.get_log_probabilities(df)
        # Columns are ordered as classes 1, 2, so the winning column index + 1 is the class id.
        return np.argmax(prediction, axis=1) + 1

    def error_rate(self, train=True)->float:
        """Returns the fraction of misclassified samples.

        Args:
            train (bool, optional): Use the trainset if True and the testset if not. Defaults to True.

        Returns:
            float: Misclassification rate between 0 and 1.
        """
        df = self.train_df if train else self.test_df
        predicted = self.classification(train=train)
        return float(np.mean(predicted != df.classe.to_numpy()))

    def plot_decision_boundary(self):
        """Plot the decision boundary

        A regular 100x100 grid is classified, shown as a scatter plot coloured by
        predicted class and saved as a PNG in the ``figures`` directory.
        """
        import os

        Nx1=100 # number of samples for display
        Nx2=100
        x1=np.linspace(-2.5,1.5,Nx1)  # sampling of the x1 axis
        x2=np.linspace(-0.5,3.5,Nx2)  # sampling of the x2 axis
        [X1,X2]=np.meshgrid(x1,x2)
        df = pd.DataFrame({'x1': X1.flatten('F'), 'x2': X2.flatten('F')})
        prediction = self.get_log_probabilities(df)
        classe = list(np.argmax(prediction, axis=1) + 1)
        df['classe'] = [f'classe {i}' for i in classe]
        fig = px.scatter(df, x="x1", y="x2", color = "classe", title=f"Decision boundary with {self.type}")
        # Make sure the output directory exists so the export works on a fresh checkout.
        os.makedirs("figures", exist_ok=True)
        fig.write_image(f"figures/decision_boundary_with_{self.type}.png")
        fig.show()
        
    

        
            
        

In [92]:
class QDA(LDA):
    """Quadratic discriminant analysis for the two-class, two-feature `synth` dataset.

    Same construction as ``LDA`` (a train and a test array with columns class, x1, x2),
    but each class keeps its own covariance matrix, which adds a quadratic term
    in x to the class scores.
    """
    def __init__(self, train, test):
        super().__init__(train, test)
        # Used in the title and file name of the decision-boundary figure.
        self.type = "QDA"

    def get_sigma_estimators(self)->list:
        """Returns the sigma estimator for each class.

        Each matrix is the scatter of the class around its mean divided by the
        number of training samples of that class.

        Returns:
            list: List of matrix in dim 2x2
        """
        mu = self.get_mu_estimators()
        classes = [1,2]
        sigma_list = []
        for i, c in enumerate(classes):
            in_class = self.train_df.classe == c
            samples = self.train_df.loc[in_class, ['x1', 'x2']].to_numpy(dtype=float)
            centered = samples - mu[i]
            # Sum over the samples of (x - mu)(x - mu)^T, done as one matrix product.
            sigma_list.append(centered.T @ centered / samples.shape[0])

        return sigma_list

    def get_log_probabilities(self, df):
        """Compute the log_probability for the entries.

        For each class k the quadratic score is
        ``log(pi_k) - 1/2 log|Sigma_k| - 1/2 (x - mu_k)^T Sigma_k^-1 (x - mu_k)``.

        Args:
            df (pd.DataFrame): DataFrame with the columns x1 and x2

        Returns:
            np.ndarray: Matrix in dim Nx2 where N is the shape of entries
        """
        pi = self.get_pi_estimators()
        mu = self.get_mu_estimators()
        sigma = self.get_sigma_estimators()
        x = df[['x1', 'x2']].to_numpy(dtype=float)
        prediction = np.zeros((len(df), mu.shape[0]))
        for j in range(mu.shape[0]):
            # Per-class quantities are computed once, not once per sample.
            sigma_inv = la.inv(sigma[j])
            log_det = np.log(la.det(sigma[j]))
            centered = x - mu[j]
            # Row-wise quadratic form (x - mu)^T Sigma^-1 (x - mu) for every sample.
            mahalanobis = np.sum((centered @ sigma_inv) * centered, axis=1)
            prediction[:, j] = np.log(pi[j]) - 0.5 * log_det - 0.5 * mahalanobis
        return prediction

Now, we initialize our classifiers :

In [93]:
# Build one classifier of each kind from the same train/test arrays.
lda_classifier, qda_classifier = (model(train, test) for model in (LDA, QDA))

And then we can plot the decision boundary : 

In [94]:
# Classify a regular grid of points with LDA and scatter-plot them coloured by predicted class.
lda_classifier.plot_decision_boundary()

In [95]:
# Same grid and plot as for LDA, but classified with the QDA scores.
qda_classifier.plot_decision_boundary()

We can observe a huge difference between LDA and QDA. With LDA, the boundary between classes 1 and 2 is a linear function of x, whereas with QDA the boundary is no longer linear (because of the quadratic term in x that appears when we compute the log-probability). This is exactly what we pointed out in class.

In [97]:
# Misclassification rate of each hand-written classifier, on the training set and then on the test set.
error_rate_LDA_train, error_rate_LDA_test = (
    lda_classifier.error_rate(train=use_train) for use_train in (True, False)
)
error_rate_QDA_train, error_rate_QDA_test = (
    qda_classifier.error_rate(train=use_train) for use_train in (True, False)
)

# One row per classifier, one column per data set.
df_error_rate = pd.DataFrame(
    [
        [error_rate_LDA_train, error_rate_LDA_test],
        [error_rate_QDA_train, error_rate_QDA_test],
    ],
    index=['LDA', 'QDA'],
    columns=['train', 'test'],
)
df_error_rate

Unnamed: 0,train,test
LDA,0.03,0.055
QDA,0.05,0.05


For the training dataset, we get better results with the LDA but for the test dataset it is the opposite.

In [99]:
# Display the per-class covariance estimates used by QDA (one 2x2 matrix per class).
qda_classifier.get_sigma_estimators()

[array([[0.51664642, 0.30792087],
        [0.30792087, 0.48765965]]),
 array([[0.24283742, 0.01935998],
        [0.01935998, 0.20883663]])]

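If we look at the two covariance matrices, we notice how different they are. LDA assumes that both classes share a single covariance matrix; since that assumption is clearly violated here, LDA suffers from high bias.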

In [100]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours baseline: features are columns 1-2, labels are column 0.
neigh = KNeighborsClassifier(n_neighbors=3).fit(train[:, 1:], train[:, 0])

# Error rate = 1 - accuracy, on the training set first and then on the test set.
error_rate_K_train, error_rate_K_test = [
    1 - np.count_nonzero(neigh.predict(samples[:, 1:]) == samples[:, 0]) / samples.shape[0]
    for samples in (train, test)
]

# Add the baseline as a new row of the comparison table and display it.
df_error_rate.loc['K_neighbors'] = [error_rate_K_train, error_rate_K_test]
df_error_rate

Unnamed: 0,train,test
LDA,0.03,0.055
QDA,0.05,0.05
K_neighbors,0.03,0.045


In [103]:
# Regular grid over the region of interest; X1 and X2 are reused by the later plotting cells.
Nx1, Nx2 = 100, 100  # number of samples for display along each axis
x1 = np.linspace(-2.5, 1.5, Nx1)  # sampling of the x1 axis
x2 = np.linspace(-0.5, 3.5, Nx2)  # sampling of the x2 axis
X1, X2 = np.meshgrid(x1, x2)

# Flatten the grid in column-major order and predict a class for every point.
grid_x1, grid_x2 = X1.flatten('F'), X2.flatten('F')
classe = list(neigh.predict(np.vstack((grid_x1, grid_x2)).T))

# String labels so that the classes get discrete colours in the legend.
df_k_neighbors_boundary = pd.DataFrame({'x1': grid_x1, 'x2': grid_x2})
df_k_neighbors_boundary['classe'] = [f'classe {i}' for i in classe]

fig = px.scatter(
    df_k_neighbors_boundary,
    x="x1",
    y="x2",
    color="classe",
    title="Decision boundary with K_neighbors",
)
fig.write_image(f"figures/decision_boundary_with_K_neighbors.png")
fig.show()
        

## TO DO : LDA & QDA using scikit-learn module

The module `scikit-learn` is dedicated to machine learning algorithms. Many of them are available in a simple manner. For LDA and QDA, have a look at the tutorial available at http://scikit-learn.org/stable/modules/lda_qda.html 

**Warning** : you may have a critical view of the way LDA and QDA are illustrated in the proposed example...




In [44]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Features are columns 1-2 of the training array, labels are column 0.
train_features, train_labels = train[:, 1:], train[:, 0]

# store_covariance=True asks scikit-learn to keep the estimated covariance matrices.
skl_LDA_classifier = LinearDiscriminantAnalysis(store_covariance=True)
skl_QDA_classifier = QuadraticDiscriminantAnalysis(store_covariance=True)

skl_LDA_classifier.fit(train_features, train_labels)
skl_QDA_classifier.fit(train_features, train_labels)



In [104]:
# Classify every point of the display grid (X1, X2 from the earlier meshgrid cell) with scikit-learn's LDA.
grid_x1, grid_x2 = X1.flatten('F'), X2.flatten('F')
classe = list(skl_LDA_classifier.predict(np.vstack((grid_x1, grid_x2)).T))

# String labels so that the classes get discrete colours in the legend.
df_skl_LDA_boundary = pd.DataFrame({'x1': grid_x1, 'x2': grid_x2})
df_skl_LDA_boundary['classe'] = [f'classe {i}' for i in classe]

fig = px.scatter(
    df_skl_LDA_boundary,
    x="x1",
    y="x2",
    color="classe",
    title="Decision boundary with LDA from sklearn",
)
fig.write_image(f"figures/decision_boundary_LDA_sklearn.png")
fig.show()

In [105]:
# Classify every point of the display grid (X1, X2 from the earlier meshgrid cell) with scikit-learn's QDA.
df_skl_QDA_boundary = pd.DataFrame({'x1': X1.flatten('F'), 'x2': X2.flatten('F')})
classe = list(skl_QDA_classifier.predict(np.vstack((X1.flatten('F'), X2.flatten('F'))).T))
# String labels so that the classes get discrete colours in the legend.
df_skl_QDA_boundary['classe'] = [f'classe {i}' for i in classe]
# Title fixed: the original read "Decision boundary with with QDA from sklearn".
fig = px.scatter(df_skl_QDA_boundary, x="x1", y="x2", color = "classe", title="Decision boundary with QDA from sklearn")
fig.write_image("figures/decision_boundary_QDA_sklearn.png")
fig.show()

In [106]:
# Error rate = 1 - accuracy of scikit-learn's LDA, on the training set first and then on the test set.
error_rate_LDA_skl_train, error_rate_LDA_skl_test = [
    1 - np.count_nonzero(skl_LDA_classifier.predict(samples[:, 1:]) == samples[:, 0]) / samples.shape[0]
    for samples in (train, test)
]
df_error_rate.loc['LDA with sklearn'] = [error_rate_LDA_skl_train, error_rate_LDA_skl_test]

# Same computation for scikit-learn's QDA.
error_rate_QDA_skl_train, error_rate_QDA_skl_test = [
    1 - np.count_nonzero(skl_QDA_classifier.predict(samples[:, 1:]) == samples[:, 0]) / samples.shape[0]
    for samples in (train, test)
]
df_error_rate.loc['QDA with sklearn'] = [error_rate_QDA_skl_train, error_rate_QDA_skl_test]

In [107]:
# Display the full comparison table (hand-written, k-NN and scikit-learn classifiers).
df_error_rate

Unnamed: 0,train,test
LDA,0.03,0.055
QDA,0.05,0.05
K_neighbors,0.03,0.045
LDA with sklearn,0.03,0.055
QDA with sklearn,0.05,0.05


We get the same results as for our implementation, which is logical.

In [108]:
# Save the comparison table as a semicolon-separated CSV file in the working directory.
df_error_rate.to_csv('error_rate_comparison.csv', sep=';')