# Linear discriminant analysis

In [3]:
import numpy as np
import pandas as pd

import seaborn as sns


from sklearn import preprocessing



%matplotlib inline
plt.style.use('seaborn-white')

## Data

In [4]:
# Read the Default data set straight from the course repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/kirenz/classification/main/_static/data/Default.csv'
)

# pd.factorize() returns a pair (integer codes, unique values); only the codes
# are kept, giving numeric versions of the two Yes/No columns.
df['default2'] = pd.factorize(df['default'])[0]
df['student2'] = pd.factorize(df['student'])[0]

# Preview the first three rows, including the two new encoded columns.
df.head(3)

Unnamed: 0,default,student,balance,income,default2,student2
0,No,No,729.526495,44361.625074,0,0
1,No,Yes,817.180407,12106.1347,0,1
2,No,No,1073.549164,31767.138947,0,0


In [6]:
# Predictors: account balance, income and the numerically encoded student flag.
feature_columns = ['balance', 'income', 'student2']
X = df.loc[:, feature_columns]

# Response: the numerically encoded default indicator.
y = df['default2']


True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
No,9645,254
Yes,22,79


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA on the full data set, then predict the class of those same
# observations (in-sample predictions).
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X, y)
y_pred = lda.predict(X)


In [7]:
# Pair each observation's true label with its LDA prediction and turn the
# 0/1 codes into readable 'No'/'Yes' labels.
label_names = {0: 'No', 1: 'Yes'}
df_ = pd.DataFrame(
    {
        'True default status': y,
        'Predicted default status': y_pred,
    }
).replace(to_replace=label_names)

# Count observations per (predicted, true) pair and pivot the true status
# into columns: rows are predictions, columns are the true status.
pair_counts = df_.groupby(['Predicted default status', 'True default status']).size()
pair_counts.unstack('True default status')

True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
No,9645,254
Yes,22,79


In [8]:
# Probability cut-off above which an observation is labelled a defaulter.
decision_prob = 0.2

# Posterior class probabilities from the LDA model fitted earlier in the
# notebook; refitting on the same X, y would only repeat that work.
# Column 1 holds the probability of class 1 ('Yes').
y_prob = lda.predict_proba(X)

# Build both columns directly as 'No'/'Yes' strings. The previous version
# compared probabilities to get booleans and then tried to relabel them with a
# replace() dict keyed by the *strings* 'True'/'False', which never matched, so
# the table rows were shown as False/True instead of No/Yes.
df_ = pd.DataFrame({
    'True default status': y.map({0: 'No', 1: 'Yes'}),
    'Predicted default status': np.where(y_prob[:, 1] > decision_prob, 'Yes', 'No'),
})

# Rows are predictions, columns are the true status.
df_.groupby(['Predicted default status','True default status']).size().unstack('True default status')

True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
False,9435,140
True,232,193


In [9]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fit QDA on the full data set and predict the class of those same
# observations (in-sample predictions).
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)
pred = qda.predict(X)

In [10]:
# Class prior probabilities held by the fitted QDA model, one entry per class.
# No priors were passed to the constructor above, so these are presumably the
# class proportions in y (scikit-learn's default) -- confirm against the docs.
qda.priors_

array([0.9667, 0.0333])

In [11]:
# Per-class means of the predictors held by the fitted QDA model:
# one row per class, one column per feature in X (balance, income, student2).
qda.means_

array([[8.03943750e+02, 3.35661666e+04, 2.91403745e-01],
       [1.74782169e+03, 3.20891471e+04, 3.81381381e-01]])

In [12]:
from sklearn.metrics import confusion_matrix

# confusion_matrix(y_true, y_pred) puts true classes on the rows; transposing
# gives predictions on the rows and true classes on the columns, matching the
# layout of the LDA tables above.
np.transpose(confusion_matrix(y, pred))

array([[9636,  239],
       [  31,   94]])

In [None]:
from sklearn.metrics import classification_report, precision_score

# `pred` holds the QDA predictions for every row of X, so it has to be scored
# against the full label vector `y`. The previous `y_test` was never defined in
# this notebook and would not line up with `pred` in any case.
print(classification_report(y, pred, digits=3))

In [None]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split

# The train/test variables used below were never created anywhere in the
# notebook, so this cell failed on a fresh kernel. Create a reproducible
# hold-out split here; it is stratified on y because defaults are rare and an
# unstratified split could leave very few positives in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

# 1-nearest-neighbour classifier: fit on the training part, evaluate on the
# held-out part.
# NOTE(review): the features are on very different scales (balance vs. income),
# which affects distance-based methods -- consider standardising first.
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
pred = knn.fit(X_train, y_train).predict(X_test)

# Transposed so rows are predictions and columns are true classes.
print(confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred, digits=3))