In [19]:
# Dimenionality reduction using Feature extraction
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
from sklearn.decomposition import PCA, KernelPCA, NMF, TruncatedSVD
from sklearn.datasets import make_circles 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from scipy.sparse import csr_matrix

In [2]:
# 9.1 Reducing features using principal components 
digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)

pca = PCA(n_components = 0.99, whiten = True)

features_pca = pca.fit_transform(features)

print("original number of features: ", features.shape[1])
print("reduced number of features: ", features_pca.shape[1])

# Principal component analysis (PCA) is a linear dimensionality reduction technique.
# it projects observations onto the principal components of the feature matrix that retian the most variance 


original number of features:  64
reduced number of features:  54


In [6]:
# 9.2 Reducing features when data is linearly inseparable

features, _ = make_circles(n_samples=1000, random_state = 1, noise = 0.1, factor = 0.1)

kpca = KernelPCA(kernel = "rbf", gamma = 15, n_components = 1)
features_kpca = kpca.fit_transform(features)

print("original number of features: ", features.shape[1])
print("reduced number of features: ", features_kpca.shape[1])

# PCA is able to reduce the dimensionality of the feature matrix 


original number of features:  2
reduced number of features:  1


In [10]:
# 9.3 reducing features by maximizing class separability 
iris = datasets.load_iris() 
features = iris.data
target = iris.target 
print(features.shape)

lda = LinearDiscriminantAnalysis(n_components = 1) 
features_lda = lda.fit(features, target).transform(features)

print("Original number of features: ", features.shape[1])
print("Reduced number of features: ", features_lda.shape[1])

# LDA (linear Discriminant Analysis) is a classification algo that reduce the dimensionality.
# In PCA were only interested in the component axes that maximize the variance in the data 
# In LDA have the additional goal of maximizing the differences between classes


(150, 4)
Original number of features:  4
Reduced number of features:  1


In [12]:
lda.explained_variance_ratio_

array([0.9912126])

In [15]:
lda = LinearDiscriminantAnalysis(n_components = 1) 
features_lda = lda.fit(features, target)

print("features_lda: ", features_lda)
lda_var_ratios = lda.explained_variance_ratio_
print("lda_var_ratios: ", lda_var_ratios)

def select_n_componets(var_ratio, goal_var):
    total_variance = 0.0 
    n_components = 0 
    for explained_variance in var_ratio:
        total_variance += explained_variance 

        n_components += 1

        if total_variance >= goal_var:
            break 

    return n_components

n_components = select_n_componets(lda_var_ratios, 0.95)
print("Number of components: ", n_components)

features_lda:  LinearDiscriminantAnalysis(n_components=1)
lda_var_ratios:  [0.9912126]
Number of components:  1


In [17]:
# 9.4 reducing features using matrix factorization 

digits = datasets.load_digits()
features = digits.data 
nmf = NMF(n_components = 10, random_state = 1)
features_nmf = nmf.fit_transform(features)

print("original number of features: ", features.shape[1])
print("reduced number of features: ", features_nmf.shape[1])

# NMF (non-negative matrix factorization) is a matrix factorization technique that 
# decomposes the feature matrix into two lower-rank matrices.
# The first matrix contains the principal components, and the second matrix contains 
# the weights of the principal components.




original number of features:  64
reduced number of features:  10




In [20]:
# 9.5 Reducin features on sparse data 

digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)
features_sparse = csr_matrix(features)

tsvd = TruncatedSVD(n_components = 10)
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)

print("original number of features: ", features_sparse.shape[1])
print("reduced number of features: ", features_sparse_tsvd.shape[1])

# TruncatedSVD (Truncated Singular Value Decomposition) 


original number of features:  64
reduced number of features:  10


In [21]:
tsvd.explained_variance_ratio_[:3].sum()

np.float64(0.30039385373686134)

In [26]:
tsvd_new = TruncatedSVD(n_components = features_sparse.shape[1] - 1)
features_tsvd = tsvd.fit(features)

# List of explanied variance 
tsvd_var_ratios = tsvd.explained_variance_ratio_

def select_n_components(var_ratio, goal_var):
    total_variance = 0.0 
    n_components = 0 
    for explained_variance in var_ratio:
        total_variance += explained_variance 

        n_components += 1
        if total_variance >= goal_var:
            break

    return n_components

select_n_components(tsvd_var_ratios, 0.95)
        

10

<image src="assets/svd.png">
<image src="assets/explain.png">