In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import jdc
from sklearn.model_selection import train_test_split

# 3.1 Preprocessing

In [72]:
class Preprocessing:
    def __init__(self, url, column_names_csv):
        self.url = url
        self.column_names_csv = column_names_csv
        self.dataset = None

In [73]:
%%add_to Preprocessing
def load_dataset(self):
    column_names_df = pd.read_csv(self.column_names_csv)
    column_names = column_names_df['Column_Names'].tolist()
    self.dataset = pd.read_csv(self.url, header=None, names=column_names)
    self.dataset['class'] = self.dataset['class'].map({'e': 0, 'p': 1})

In [74]:
%%add_to Preprocessing
def display_data_types(self):
    if self.dataset is not None:
        return self.dataset.dtypes
    else:
        return "Dataset not loaded."

In [75]:
%%add_to Preprocessing
def display_unique_values(self):
    if self.dataset is not None:
        for column in self.dataset.columns:
            unique_values = self.dataset[column].unique()
            print(f"Unique values for {column}:\n{unique_values}\n")
    else:
        print("Dataset not loaded.")

In [76]:
%%add_to Preprocessing
def encode_categorical_features(self):
    if self.dataset is not None:
        label_encoder = LabelEncoder()
        categorical_features = self.dataset.select_dtypes(include=['object']).columns
        for feature in categorical_features:
            self.dataset[feature] = label_encoder.fit_transform(self.dataset[feature])
    else:
        print("Dataset not loaded.")

In [77]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
column_names_csv = 'mushroom_feature_names.csv'

In [78]:
preprocessing = Preprocessing(url, column_names_csv)

# Q 3.1.1

In [79]:
preprocessing.load_dataset()

In [80]:
preprocessing.dataset

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,0,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,0,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,1,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


# Q 3.1.2

In [10]:
preprocessing.display_data_types()

class                        int64
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

# Q 3.1.3

Most of the features as generic text or string data.

<span style="color:red">Warning:</span>
: Unique values for stalk-root contains missing value

In [11]:
preprocessing.display_unique_values()

Unique values for class:
[1 0]

Unique values for cap-shape:
['x' 'b' 's' 'f' 'k' 'c']

Unique values for cap-surface:
['s' 'y' 'f' 'g']

Unique values for cap-color:
['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']

Unique values for bruises:
['t' 'f']

Unique values for odor:
['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']

Unique values for gill-attachment:
['f' 'a']

Unique values for gill-spacing:
['c' 'w']

Unique values for gill-size:
['n' 'b']

Unique values for gill-color:
['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']

Unique values for stalk-shape:
['e' 't']

Unique values for stalk-root:
['e' 'c' 'b' 'r' '?']

Unique values for stalk-surface-above-ring:
['s' 'f' 'k' 'y']

Unique values for stalk-surface-below-ring:
['s' 'f' 'y' 'k']

Unique values for stalk-color-above-ring:
['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']

Unique values for stalk-color-below-ring:
['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']

Unique values for veil-type:
['p']

Unique values for veil-color:
['w' 'n' 'o' 'y']

Unique valu

# Q 3.1.4

Mahalanobis distance:

$$ D(x) = \sqrt{ (x - \mu)^T \Sigma^{-1} (x - \mu) } $$

The Mahalanobis distance is typically calculated on numerical data.

When our features have a data type of string (text),
we cannot directly calculate the Mahalanobis distance without
converting the data to a numerical format first.

# Q 3.1.5

In [12]:
preprocessing.encode_categorical_features()

In [13]:
preprocessing.display_data_types()

class                       int64
cap-shape                   int32
cap-surface                 int32
cap-color                   int32
bruises                     int32
odor                        int32
gill-attachment             int32
gill-spacing                int32
gill-size                   int32
gill-color                  int32
stalk-shape                 int32
stalk-root                  int32
stalk-surface-above-ring    int32
stalk-surface-below-ring    int32
stalk-color-above-ring      int32
stalk-color-below-ring      int32
veil-type                   int32
veil-color                  int32
ring-number                 int32
ring-type                   int32
spore-print-color           int32
population                  int32
habitat                     int32
dtype: object

# Q 3.1.6

What is a nan value?

A "NaN" value stands for "Not-a-Number."

It is a special floating-point value used in computing to represent

__undefined__ or __unrepresentable__ values, often arising from mathematical operations or data processing.

In [14]:
preprocessing.dataset.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

# 3.2 Creating our model: LDA

# Q 3.2.1

In [15]:
dataset = preprocessing.dataset

In [16]:
X = dataset.drop(columns=['class'])
y = dataset['class']

In [22]:
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,3,2,4,0,5,0,0,0,11,0,...,2,5,5,0,1,1,4,0,1,2
8120,5,2,4,0,5,0,0,0,11,0,...,2,5,5,0,0,1,4,0,4,2
8121,2,2,4,0,5,0,0,0,5,0,...,2,5,5,0,1,1,4,0,1,2
8122,3,3,4,0,8,1,0,1,0,1,...,1,7,7,0,2,1,0,7,4,2


In [23]:
y

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int64

# Q 3.2.2

what is random state?

+ if you use the same random_state value in different runs of your program, you will get the same data split every time.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Q 3.2.3

what is the Bayes rule? It describes how to update the probability for a hypothesis (or an event) based on new evidence. 

$$ P(A|B) = \frac{P(A) \cdot P(B|A)}{P(B)} $$

How is it used in Bayesian classification? 

1. Training Phase:

       + Collect and preprocess your training data, which typically consists of labeled examples.

       + Calculate class probabilities (P(A)): For each class label, compute the prior probability of that class in the training data. This is often done by counting the number of occurrences of each class in the training set.

       + Calculate conditional probabilities (P(B∣A)): For each feature, compute the likelihood of observing that feature given a specific class. This involves calculating the conditional probability of each feature occurring within each class.

2. Testing or Prediction Phase:

       + Given a new data point (features), use Bayes' Rule to calculate the posterior probability (P(A∣B)) for each possible class.

       + To predict the class label for the new data point, select the class with the highest posterior probability. This is often referred to as the Maximum A Posteriori (MAP) decision rule.

What is each term of this rule called?

+ The __"prior probability"__ (P(A)) is the initial belief in the probability of a particular class.
+ The __"likelihood"__ (P(B∣A)) describes how likely the observed features are to occur given a specific class.
+ The __"evidence"__ (P(B)) represents the overall likelihood of observing the given features, regardless of the class.
+ The __"posterior probability"__ (P(A∣B)) is the updated belief in the probability of a particular class after observing the features.

# Q 3.2.4

What are the model parameters of LDA and QDA classifiers? These classifiers are based on different assumptions about the underlying data distribution,
and as a result, they have different sets of model parameters.

1. LDA (Linear Discriminant Analysis):
    LDA assumes that all classes have a common covariance matrix, and the decision boundaries between classes are linear.

    + Class Priors (ϕ_k): These are the prior probabilities of each class in the training data.
    + Class Means (μ_k): For each class, there is a vector of means representing the mean feature values for that class. These vectors capture the central tendency of feature values for each class.
    + Shared Covariance Matrix (Σ): This matrix characterizes the shape of the distribution of data points within each class which is why it's "linear" discriminant analysis.

2. QDA (Quadratic Discriminant Analysis): QDA, in contrast to LDA, allows each class to have its own covariance matrix and does not assume a common covariance matrix. 

    + Class Priors (ϕ_k): Similar to LDA, these represent the prior probabilities of each class in the training data.
    + Class Means (μ_k): QDA also calculates the mean feature values for each class. These vectors capture the central tendency of feature values for each class.
    + Class-Specific Covariance Matrices (Σ_k): Each class has its own covariance matrix, representing the shape of the data distribution for that class. This results in more flexibility in modeling the data distribution, making QDA "quadratic" discriminant analysis.

Between QDA and LDA, which one has potentially more parameters? why?

LDA typically has potentially fewer parameters compared to QDA because it assumes a shared covariance matrix among all classes, whereas QDA allows each class to have its own covariance matrix. 

# Q 3.2.5

___The probability density function (PDF)___ for a univariate (single-variable) Gaussian distribution 

$$ f(x| \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right) $$

___likelihood___ function for a Gaussian distribution

$$ L(\mu, \sigma^2 | x) = f(x| \mu, \sigma^2) $$


The estimated class for the pixel:

$$ \hat{y} = \arg\max_i L_i(\mu, \sigma^2 | x) $$

Estimating Probability:

From Bayes Theorem: 
$$ P(C_k | \mathbf{X}) = \frac{\prod_{k=1}^{n} L(C_k | \mathbf{x}_i)}{\sum_i \prod_i L(C_i | \mathbf{x})}$$


To calculate the parameters of Linear Discriminant Analysis (LDA) in an __iterative__ manner:
1. Initialize Parameters
2. Define the Objective Function
3. Compute Gradients
4. Update Parameters

**Objective Function for LDA:**
The log-likelihood function for LDA is given by:
$$ \ell(\boldsymbol{\mu}, \boldsymbol{\Sigma}) = \sum_{i=1}^{N} \sum_{j=1}^{K} \delta_{ij} \log p(\mathbf{x}_i | C_j, \boldsymbol{\mu}, \boldsymbol{\Sigma}_j) $$

1. **Gradient with respect to $ \boldsymbol{\mu}_j $:**
   $$ \frac{\partial \ell}{\partial \boldsymbol{\mu}_j} = \sum_{i=1}^{N} \delta_{ij} \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i - \boldsymbol{\mu}_j) $$

2. **Gradient with respect to $ \boldsymbol{\Sigma} $:**
   $$ \frac{\partial \ell}{\partial \boldsymbol{\Sigma}} = \sum_{i=1}^{N} \delta_{ij} \left( -\frac{1}{2} \boldsymbol{\Sigma}^{-1} + \frac{1}{2} \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i - \boldsymbol{\mu}_j)(\mathbf{x}_i - \boldsymbol{\mu}_j)^T \boldsymbol{\Sigma}^{-1} \right) $$


In [82]:
#initialize Parameters
mu_0 = np.mean(X[y == 0], axis=0)
mu_1 = np.mean(X[y == 1], axis=0)
Sigma = np.cov(X, rowvar=False)

If the determinant is zero, the matrix is singular, and its inverse does not exist.

In [83]:
np.linalg.det(Sigma)

0.0

In [67]:
Sigma = np.eye(X.shape[1]) * Sigma

In [70]:
np.linalg.det(Sigma)

0.0