In [1]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_sample_images
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# 1. PCA

In [2]:
iris = load_iris()

In [3]:
X = iris.data

In [4]:
X_centered = X - X.mean(axis=0)

In [5]:
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]

In [6]:
c1

array([ 0.36158968, -0.08226889,  0.85657211,  0.35884393])

In [7]:
c2

array([-0.65653988, -0.72971237,  0.1757674 ,  0.07470647])

In [8]:
W2 = Vt.T[:, :2]
X2D = X_centered.dot(W2)

In [9]:
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [10]:
pca.explained_variance_ratio_

array([ 0.92461621,  0.05301557])

In [11]:
# choose right dim
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
len(pca.explained_variance_ratio_)

4

In [12]:
# you can set n_components to be a float between 0.0 and 1.0, indicating the ratio of variance you wish to preserve
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

# 2. PCA for Compression

In [13]:
pca = PCA(n_components = 2)
X_reduced = pca.fit_transform(X)
X_recovered = pca.inverse_transform(X_reduced)

In [15]:
# loading mnist data, header must be None
mnist = pd.read_csv("/Users/lgrcyanny/Codecookies/machine-learning-workspace/datasets/mnist/mnist_train.csv", header=None)

In [16]:
mnist = mnist[0:100]

In [75]:
X = mnist[:, 1:]
y = mnist[:, :1]

In [76]:
from sklearn.decomposition import IncrementalPCA
n_batches = 10
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X, n_batches):
    inc_pca.partial_fit(X_batch)
X_reduced = inc_pca.transform(X)

# 3. Kernal PCA

In [20]:
from sklearn.decomposition import KernelPCA
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(mnist)

In [21]:
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris

In [22]:
clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression())
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)

In [28]:
iris = load_iris()
X = iris.data
y = iris.target

In [29]:
grid_search.fit(X, y)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=2, n_jobs=1,
     random_state=None, remove_zero_eig=False, tol=0)), ('log_reg', LogisticRegre...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kpca__kernel': ['rbf', 'sigmoid'], 'kpca__gamma': array([ 0.03   ,  0.03222,  0.03444,  0.03667,  0.03889,  0.04111,
        0.04333,  0.04556,  0.04778,  0.05   ])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
grid_search.best_params_

{'kpca__gamma': 0.050000000000000003, 'kpca__kernel': 'rbf'}

In [31]:
# reconstruction
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.0500,
                    fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [32]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(X, X_preimage)

0.28262329114144297

# LLE

In [33]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)

# Test with Mnist data

In [65]:
from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

In [46]:
mnist = pd.read_csv("/Users/lgrcyanny/Codecookies/machine-learning-workspace/datasets/mnist/mnist_train.csv", header=None)

In [49]:
mnist = np.array(mnist)

In [52]:
X = mnist[:, 1:]
y = mnist[:, :1]

In [67]:
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 9])

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
random_clf = RandomForestClassifier(max_depth=6)
random_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [70]:
predictions = random_clf.predict(X_test)

In [72]:
auc_score = roc_auc_score(y_test, predictions)
print "auc_score", auc_score

auc_score 0.742456276015


## Use PCA process data

In [74]:
from sklearn.decomposition import PCA

In [82]:
pca = PCA(n_components=0.95)

In [83]:
X_reduced = pca.fit_transform(X)

In [84]:
explained_vairance_ratio = np.cumsum(pca.explained_variance_ratio_)

In [85]:
X_reduced.shape

(60000, 154)

In [92]:
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 9])

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
random_clf_reduced = RandomForestClassifier(max_depth=6)
random_clf_reduced.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [95]:
predictions = random_clf_reduced.predict(X_test)

In [96]:
auc_score = roc_auc_score(y_test, predictions)
print "auc_score", auc_score 
# with PCA the auc_score increased

auc_score 0.751810217972
