In [3]:
# https://machinelearningmastery.com/linear-discriminant-analysis-with-python/

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [5]:
# define dataset 
X,y = make_classification(n_samples=1000, n_features=10, n_classes=2, n_informative=10, n_redundant=0, 
                         n_repeated=0, random_state=4)


#### Basic LDA

In [6]:
# define model
model = LinearDiscriminantAnalysis()

In [7]:
# define cross-validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=3)

In [8]:
# define scores
scores =  cross_val_score(model, X,y, scoring='accuracy', cv=cv, n_jobs=-1)

In [9]:
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy: 0.819 (0.038)


We may decide to use the Linear Discriminant Analysis as our final model and make predictions on new data.

In [10]:
model.fit(X,y)

LinearDiscriminantAnalysis()

In [11]:
# define new data
row = [0.12777556,-3.64400522,-2.23268854,-1.82114386,1.75466361,0.1243966,1.03397657,2.35822076,1.01001752,0.56768485]
# make a prediction
yhat = model.predict([row])
# summarize prediction
print('Predicted Class: %d' % yhat)

Predicted Class: 1


#### LDA Hyperparameter 

In [2]:
X, y = make_classification(n_features=20, n_samples=10000, n_classes=2, n_redundant=0, 
                          n_repeated=0, random_state=6)
model = LinearDiscriminantAnalysis()
cv = RepeatedStratifiedKFold(n_splits=25, n_repeats=5, random_state=10)

In [3]:
# defind grid search cv parameter dict
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']

#Shrinkage adds a penalty to the model that acts as a type of regularizer, reducing the complexity of the model.
# this is how we regularize LDA
grid['shrinkage'] = np.arange(0,1,0.01)
gridCV = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy',
                      cv=cv, n_jobs=-1, verbose=2)
res = gridCV.fit(X,y)


Fitting 125 folds for each of 300 candidates, totalling 37500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 3164 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 7692 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 13532 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 19044 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 23260 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 28116 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 33628 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 37500 out of 37500 | elapsed:  1.5min finished


In [15]:
# summary
print('Mean Accuracy: %.3f' % res.best_score_)
print('Config: %s' % res.best_params_)

Mean Accuracy: 0.883
Config: {'solver': 'svd'}
