# Exercise set 2

## Declaration of genereative AI usage

In [88]:
### Import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')



## Problem 8


In [77]:
### problem 8, task ai.

spam_test = pd.read_csv('data_E2/spam_test.csv')
spam_train = pd.read_csv('data_E2/spam_train.csv')
responce_train = spam_train['SPAM']
responce_test = spam_test['SPAM']
spam_train = spam_train.drop('SPAM', axis=1)
spam_test = spam_test.drop('SPAM', axis=1)


model = LogisticRegression(penalty='none')
model.fit(spam_train, responce_train)
coefficients = model.coef_
print(coefficients)

[[ -4.63387898 -16.15985276 -14.36460009  22.88863495  26.0635997 ]]




In [78]:
### problem 8, task aii.

pred = model.predict(spam_test)
accuracy_base ="{:.4f}".format(accuracy_score(responce_test, pred))

perplexity = np.exp(-np.mean(np.log(model.predict_proba(spam_test)[np.arange(len(responce_test)), responce_test])))

print('Accuracy: ', accuracy_base) 
print('Perplexity: ', perplexity)

Accuracy:  0.8800
Perplexity:  1.8897996260639411


### problem 8, task aiii.

$$
P(Y=1 \mid x) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_n x_n)}}
$$

Here:

- $P(Y=1 \mid x)$ is the probability that the outcome $Y$ is 1 given the covariate vector $x$.
- $e$ is the base of the natural logarithm.
- $\beta_0, \beta_1, \beta_2, \dots, \beta_n$ are the coefficients of the logistic regression model.
- $x_1, x_2, \dots, x_n$ are the elements of the covariate vector $x$.
- The expression $\beta_0 + \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_n x_n$ represents the linear combination of the covariates weighted by their respective coefficients.


In [79]:
### problem 8, task b.

c_vals = np.logspace(-3, 3, 7)
best_v_val = None
best_accuracy = 0

for c in c_vals:
    model = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    model.fit(spam_train, responce_train)
    pred = model.predict(spam_test)
    accuracy = accuracy_score(responce_test, pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_c_val = c

# Train with the best C
best_model = LogisticRegression(penalty='l1', C=best_c_val, solver='liblinear')
best_model.fit(spam_train, responce_train)

# Evaluate
y_pred = best_model.predict(spam_test)
accuracy ="{:.4f}".format(accuracy_score(responce_test, y_pred))
perplexity = np.exp(-np.mean(np.log(best_model.predict_proba(spam_test)[np.arange(len(responce_test)), responce_test])))

print("Best C:", best_c_val)
print("Coefficients:", best_model.coef_[0])
print("Baseline Accuracy:", accuracy_base)
print("Accuracy with lasso:", accuracy)
print("Perplexity:", perplexity)

Best C: 10.0
Coefficients: [ 0.         -2.07440282  0.          3.63006359  6.77996696]
Baseline Accuracy: 0.8800
Accuracy with lasso: 0.8800
Perplexity: 1.4067547873582138


### problem 8, task b (written).
the predicted class probabilities from the unregularized regressor in Task a are much higher than they are from the regulated task probablilities in task b

## Problem 9

## Problem 10

In [80]:
### problem 10, task a

train = pd.read_csv('data_E2/penguins_train.csv')
test = pd.read_csv('data_E2/penguins_test.csv')

train_res = train['species'].apply(lambda x: 'Adelie' if x == 'Adelie' else 'notAdelie')
test_res = test['species'].apply(lambda x: 'Adelie' if x == 'Adelie' else 'notAdelie')


adelie_data = train[train['species'] == 'Adelie']
not_adelie_data = train[train['species'] == 'notAdelie']

train = train.drop('species', axis=1)
test = test.drop('species', axis=1)

stats = []

for i in ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']:
    stats.append({
        'Feature': i,
        'Mean_Ad': adelie_data[i].mean(),
        'StdDev_Ad': adelie_data[i].std(),
        'Mean_notAd': not_adelie_data[i].mean(),
        'StdDev_notAd': not_adelie_data[i].std()
    })

stats = pd.DataFrame(stats)
stats = stats.set_index('Feature')
    
print(stats)

num_ad = len(adelie_data)
num_not_ad = len(not_adelie_data)

prob_ad = (num_ad + 1) / (num_ad+num_not_ad+2*1)
prob_not_ad = (num_not_ad + 1) / (num_ad+num_not_ad+2*1)

print()
print("Adelie probability:", prob_ad)
print("not Adelie probability:", prob_not_ad)





                    Mean_Ad   StdDev_Ad  Mean_notAd  StdDev_notAd
Feature                                                          
bill_length_mm       38.124    2.781528      47.818      3.599472
bill_depth_mm        18.336    1.204118      15.890      1.965441
flipper_length_mm   188.880    6.320074     211.300     11.792855
body_mass_g        3576.000  461.343148    4657.000    787.531017

Adelie probability: 0.33766233766233766
not Adelie probability: 0.6623376623376623


### problem 10, task b

$$
\hat{p} = \frac{1}{\sqrt{2\pi \times \text{StdDev}_{\text{Ad}}^i}} \exp\left(-\frac{(x_i - \text{Mean}_{\text{Ad}}^i)^2}{2 \times \text{StdDev}_{\text{Ad}}^2}\right)
$$


In [81]:
### Problem 10, task c:

def naive_bays_classifier(x, stats):
    p_ad = prob_ad
    p_not_ad = prob_not_ad
    for i in ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']:
        p_ad *= (1 / (np.sqrt(2 * np.pi) * stats.loc[i, 'StdDev_Ad'])) * np.exp(-((x[i] - stats.loc[i, 'Mean_Ad'])**2) / (2 * stats.loc[i, 'StdDev_Ad']**2))
        p_not_ad *= (1 / (np.sqrt(2 * np.pi) * stats.loc[i, 'StdDev_notAd'])) * np.exp(-((x[i] - stats.loc[i, 'Mean_notAd'])**2) / (2 * stats.loc[i, 'StdDev_notAd']**2))
    return 'Adelie' if p_ad > p_not_ad else 'notAdelie'

pred = test.apply(lambda x: naive_bays_classifier(x, stats), axis=1)
print(pred[0:3])

0    Adelie
1    Adelie
2    Adelie
dtype: object


## Problem 11

### Problem 11 task a: 

- According to the authors, is discriminative learning better than generative learning?
    - No, the autors argue that in contrast to the widely-held belief that "in almost all situations, discriminative learning is better than generative learning", as the training set size increases, one algorithm will imarge to be a better fit than the other. 

### Problem 11 task b: 

- Ng and Jordan denote by ℎ𝐺𝑒𝑛 and ℎ𝐷𝑖𝑠 two models chosen by optimizing different objectives. Which two families do the authors discuss, and what are the (ℎ𝐺𝑒𝑛, ℎ𝐷𝑖𝑠) pairs for those models? What objectives are being optimised?
    - the two families tha that the authors discuss are Gaussian/Normal and multinomial distributions. 
    - the (ℎ𝐺𝑒𝑛, ℎ𝐷𝑖𝑠) pairs For Gaussian/Normal distributions are Normal Discriminant Analysis (ℎ𝐺𝑒𝑛) and logistic regression (ℎ𝐷𝑖𝑠).
    - the (ℎ𝐺𝑒𝑛, ℎ𝐷𝑖𝑠) pairs For multinomial distributions are Naive Bayes classifier (ℎ𝐺𝑒𝑛) and logistic regression (ℎ𝐷𝑖𝑠).
    - hGen optimises the joint liklyhood of inputs and labels
    - hDis either optimises the conditional likelihood p(y|x) or the 0-1 training error

### Problem 11 task c: 
- Study Figure 1 in the paper. Explain what it suggests (see the last paragraph of the Introduction). Reflect on what this means for the families in Task b.
    - figure 1 suggests that while initially, the generative model (naive Bayes) performs better due to its faster approach to its asymptotic error, as the number of training examples increases, the discriminative model (logistic regression) catches up and surpasses the generative model due to its lower asymptotic error.
    - for the families in task b, this means that the size of hte dataset is an important metric for deciding which model to use.


## Problem 12

### problem 12 task a.

No, the naieves bayes assumption does not hold as: 
1. x1, x2 are not discrete. 
2. Naive Bayes assumes that x1 and x2 are independent. hoewever the probability of y is defined as a function that includes an interaction term between x1 and x2, thus x1 and x2 are not independant.

In [90]:
### problem 12 task b.

n_values = [2**i for i in range(3, 13)]
out_table = pd.DataFrame(columns=['n', 'NB', 'LR', 'LRi', 'OptimalBays', 'Dummy'])

out_table['n'] = n_values
out_table_accuracy = out_table.set_index('n')
out_table_perplexity = out_table.set_index('n')
test = pd.read_csv('data_E2/toy_test.csv')
test_res = test['y']
test = test.drop('y', axis=1)

def naive_bays_classifier(x, stats):
    p_0 = prob_0
    p_1 = prob_1
    for i in ['x1', 'x2']:
        p_0 *= (1 / (np.sqrt(2 * np.pi) * stats.loc[i, 'StdDev_0'])) * np.exp(-((x[i] - stats.loc[i, 'Mean_0'])**2) / (2 * stats.loc[i, 'StdDev_0']**2))
        p_1 *= (1 / (np.sqrt(2 * np.pi) * stats.loc[i, 'StdDev_1'])) * np.exp(-((x[i] - stats.loc[i, 'Mean_1'])**2) / (2 * stats.loc[i, 'StdDev_1']**2))
    return 0 if p_0 > p_1 else 1

for n in n_values:
    train = pd.read_csv('data_E2/toy_train_' + str(n) + '.csv')
    train_res = train['y']
    train = train.drop('y', axis=1)

    model = GaussianNB()
    model.fit(train, train_res)
    pred = model.predict(test)
    perplexity = np.exp(-np.mean(np.log(model.predict_proba(test)[np.arange(len(test_res)), test_res])))
    out_table_accuracy.loc[n, 'NB'] = accuracy_score(test_res, pred)
    out_table_perplexity.loc[n, 'NB'] = perplexity
    
    model = LogisticRegression(penalty='none')
    model.fit(train, train_res)
    pred = model.predict(test)
    perplexity = np.exp(-np.mean(np.log(model.predict_proba(test)[np.arange(len(test_res)), test_res])))
    out_table_accuracy.loc[n, 'LR'] = accuracy_score(test_res, pred)
    out_table_perplexity.loc[n, 'LR'] = perplexity

    model = LogisticRegression(penalty='l1', C=1, solver='liblinear')
    model.fit(train, train_res)
    pred = model.predict(test)
    perplexity = np.exp(-np.mean(np.log(model.predict_proba(test)[np.arange(len(test_res)), test_res])))
    out_table_accuracy.loc[n, 'LRi'] = accuracy_score(test_res, pred)
    out_table_perplexity.loc[n, 'LRi'] = perplexity

    stats = []
    for i in ['x1', 'x2']:
        stats.append({
            'Feature': i,
            'Mean_0': train[train_res == 0][i].mean(),
            'StdDev_0': train[train_res == 0][i].std(),
            'Mean_1': train[train_res == 1][i].mean(),
            'StdDev_1': train[train_res == 1][i].std()
        })
    stats = pd.DataFrame(stats)
    stats = stats.set_index('Feature')

    num_0 = len(train[train_res == 0])
    num_1 = len(train[train_res == 1])

    prob_0 = (num_0 + 1) / (num_0+num_1+2*1)
    prob_1 = (num_1 + 1) / (num_0+num_1+2*1)

    pred = test.apply(lambda x: naive_bays_classifier(x, stats), axis=1)
    perplexity = np.exp(-np.mean(np.log(model.predict_proba(test)[np.arange(len(test_res)), test_res])))

    out_table_accuracy.loc[n, 'OptimalBays'] = accuracy_score(test_res, pred)
    out_table_perplexity.loc[n, 'OptimalBays'] = perplexity



    model = DummyClassifier(strategy='most_frequent')
    model.fit(train, train_res)
    pred = model.predict(test)
    perplexity = np.exp(-np.mean(np.log(model.predict_proba(test)[np.arange(len(test_res)), test_res])))
    out_table_perplexity.loc[n, 'Dummy'] = perplexity
    out_table_accuracy.loc[n, 'Dummy'] = accuracy_score(test_res, pred)

print("accuracy table:")
print(out_table_accuracy)
print()
print("perplexity table:")
print(out_table_perplexity)

    



accuracy table:
          NB      LR     LRi OptimalBays   Dummy
n                                               
8     0.5973  0.6837  0.6957      0.5936  0.5688
16    0.6537  0.6334  0.6277       0.656  0.4312
32    0.7037  0.7337  0.7365      0.7046  0.5688
64    0.6843  0.7531  0.7506      0.6845  0.5688
128   0.7444  0.7533  0.7528      0.7447  0.5688
256   0.7507    0.75  0.7511      0.7507  0.5688
512   0.7525  0.7528  0.7532      0.7525  0.5688
1024  0.7476  0.7505  0.7506      0.7476  0.5688
2048  0.7518  0.7524  0.7527      0.7517  0.5688
4096  0.7513   0.751  0.7509      0.7514  0.5688

perplexity table:
             NB        LR       LRi OptimalBays Dummy
n                                                    
8     81.029145  1.804528   1.88989     1.88989   inf
16     2.557424  6.102342  2.221673    2.221673   inf
32     1.954326  1.717563  1.694754    1.694754   inf
64     1.892499  1.653345  1.667624    1.667624   inf
128    1.665193  1.657166  1.652694    1.652694   inf

## Problem 13

## Problem 14

## Problem 15

## Problem 16