## Evaluate PCA with Logistic Regression Algorithm for Classification

- PCA (Principal Component Analysis) is a dimensionality reduction technique that can be used to significantly speed up your supervised learning algorithms.
- In this notebook we will evaluate PCA with Logistic Regression Algorithm for Classification.

In [1]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [2]:
# build a seeded synthetic classification dataset:
# 1000 samples x 20 features (10 informative, 10 redundant)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=7,
)

In [3]:
# define the pipeline: project onto 10 principal components, then fit
# a logistic regression classifier on the reduced features
# random_state is pinned so the projection is repeatable if PCA selects
# a randomized SVD solver (solver choice depends on the installed
# scikit-learn version and the data shape)
steps = [
    ('pca', PCA(n_components=10, random_state=1)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

In [4]:
# score the pipeline with repeated stratified k-fold cross-validation
# (10 folds, repeated 3 times, seeded splitter)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # use all available cores
)

# summarise the fold accuracies as mean (standard deviation)
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.824 (0.034)


## Compare with SVD and LDA

### Linear Discriminant Analysis (LDA)

In [5]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [6]:
# define the pipeline: reduce to a single LDA component, then classify
# with logistic regression
steps = [
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps)

# score the pipeline with repeated stratified k-fold cross-validation
# (10 folds, repeated 3 times, seeded splitter)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # use all available cores
)

# summarise the fold accuracies as mean (standard deviation)
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.825 (0.034)


### Singular Value Decomposition (SVD)

In [7]:
from sklearn.decomposition import TruncatedSVD

In [8]:
# define the pipeline: reduce to 10 SVD components, then classify with
# logistic regression
# TruncatedSVD uses a randomized solver by default, so its seed is pinned
# to keep the reported score repeatable across runs
steps = [
    ('svd', TruncatedSVD(n_components=10, random_state=1)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

# evaluate model with repeated stratified k-fold cross-validation
# (10 folds, repeated 3 times, seeded splitter)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance as mean (standard deviation) of the fold accuracies
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.824 (0.034)
