In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import mglearn

import warnings
warnings.filterwarnings('ignore')

# Dimensionality Reduction with PCA

In [2]:
from sklearn.datasets import load_wine
# Load the wine dataset; `wine.target` is used further down as the class labels.
wine = load_wine()
# X is the raw (unscaled) feature array that the next cells scale and decompose.
X = wine.data

In [3]:
# Standardize the features so that each one has the same variance;
# PCA is sensitive to the scale of its inputs.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit and transform the *same* array in one step, so the statistics are
# always learned from exactly the data being scaled (X is wine.data).
X_scaled = scaler.fit_transform(X)

In [4]:
from sklearn.decomposition import PCA

# PCA() with no n_components keeps every principal component;
# fit() returns the estimator itself, so construction and fitting can be chained.
pca3 = PCA().fit(X_scaled)

# Banner: the 24-character title padded with 20 dashes on each side (64 columns).
print('Explained variance ratio'.center(64, '-'))
# Fraction of the total variance captured by each component, largest first.
print(pca3.explained_variance_ratio_)

--------------------Explained variance ratio--------------------
[0.36198848 0.1920749  0.11123631 0.0706903  0.06563294 0.04935823
 0.04238679 0.02680749 0.02222153 0.01930019 0.01736836 0.01298233
 0.00795215]


In [5]:
# A float n_components in (0, 1) is a variance threshold, not a count:
# PCA keeps as many leading components as are needed to explain at least
# 95% of the variance of the scaled data.
pca4 = PCA(n_components=0.95).fit(X_scaled)

# Banner: the 24-character title padded with 20 dashes on each side (64 columns).
print('Explained variance ratio'.center(64, '-'))
# One ratio per retained component.
print(pca4.explained_variance_ratio_)

--------------------Explained variance ratio--------------------
[0.36198848 0.1920749  0.11123631 0.0706903  0.06563294 0.04935823
 0.04238679 0.02680749 0.02222153 0.01930019]


## PCA as a pre-processing step for classification

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 shuffle split with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(X_scaled, wine.target))
X_train, X_test = X_scaled[train_index], X_scaled[test_index]
y_train, y_test = wine.target[train_index], wine.target[test_index]

# Baseline: a decision tree trained on all of the scaled features.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

print("Accuracy on training set: {:.2f}".format(model.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(model.score(X_test, y_test)))

Accuracy on training set: 1.00
Accuracy on test set: 0.94


In [7]:
# Project the scaled data onto the components retained by pca4
# (ten of them, per the explained-variance ratios printed when it was fit).
X2_pca4 = pca4.transform(X_scaled)

# NOTE(review): pca4 was fit on all of X_scaled before any splitting, so the
# projection has already seen the rows that end up in the test set. Fitting
# the PCA on the training rows only would give a cleaner accuracy estimate.

# Reuse the splitter from the baseline cell; it yields one (train, test) pair.
train_index, test_index = next(split.split(X_scaled, wine.target))
X2_train, X2_test = X2_pca4[train_index], X2_pca4[test_index]
y2_train, y2_test = wine.target[train_index], wine.target[test_index]

# Same classifier as the baseline, now trained on the PCA-reduced features.
model = DecisionTreeClassifier(random_state=0)
model.fit(X2_train, y2_train)

print("Accuracy on training set: {:.2f}".format(model.score(X2_train, y2_train)))
print("Accuracy on test set: {:.2f}".format(model.score(X2_test, y2_test)))

Accuracy on training set: 1.00
Accuracy on test set: 0.92
