In [1]:
# The star import stays first so the explicit imports below win over any
# same-named exports from `helper`.
from helper import *
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Folder holding the feature archives. The trailing slash matters: archive
# paths are built by plain string concatenation, not os.path.join.
DATA_DIR = 'data/'
# Size tag appended to each archive's base name,
# e.g. 'lbp_subset' + '20k' + '.npz'.
SUBSET = '20k'

## LBP

In [3]:
# Load the LBP feature archive: <DATA_DIR><base name><SUBSET>.npz.
# `load_npz` comes from `helper`; it is unpacked here as two (features, labels)
# pairs — presumably train then test, going by the variable names.
DATA_FILENAME = 'lbp_subset'
(Xl_train, y_train), (Xl_test, y_test) = load_npz(
    f'{DATA_DIR}{DATA_FILENAME}{SUBSET}.npz'
)

((20000, 10), dtype('int16')) ((20000,), dtype('uint8'))
((10000, 10), dtype('int16')) ((10000,), dtype('uint8'))


In [4]:
# Standardise the LBP features. The scaler is fitted on the training split
# only and then applied unchanged to both splits, so no test-set statistics
# leak into the transform.
scaler = StandardScaler().fit(Xl_train)
X_train = scaler.transform(Xl_train)
X_test = scaler.transform(Xl_test)

In [5]:
# For each explained-variance target, fit a PCA on the scaled training data
# and report how many components it kept (all results on one output line).
for variance in (0.90, 0.95, 0.97, 0.99):
    pca = PCA(n_components=variance).fit(X_train)
    print(f'{variance:.2f}: {pca.n_components_}', end=' ')

0.90: 3 0.95: 5 0.97: 5 0.99: 7 

In [6]:
# Fit the final PCA (99% explained-variance target) on the training split,
# project both splits with it, and display the number of components kept.
pca = PCA(n_components=0.99).fit(X_train)
PCA_train, PCA_test = pca.transform(X_train), pca.transform(X_test)
pca.n_components_

7

In [7]:
# Train a class-balanced logistic regression on the PCA-projected training
# features, then display how many solver iterations were used (cap: 3000).
clf = LogisticRegression(max_iter=3000, class_weight='balanced').fit(
    PCA_train, y_train
)
clf.n_iter_

array([20], dtype=int32)

In [8]:
# Accuracy of the fitted classifier on the PCA-projected training split
# (shown as the cell output; compare with the test accuracy computed next).
clf.score(PCA_train, y_train)

0.7365

In [9]:
# Evaluate on the held-out split. `accuracy_score` is imported explicitly at
# the top of the notebook instead of being reached through a `metrics` name
# that is only in scope via `from helper import *`.
y_hat = clf.predict(PCA_test)
accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

0.7397


#### 74.0% test accuracy, compared to 74.2% without PCA dimensionality reduction (10 features down to 7)

## Landmark-Dist

In [10]:
# Load the landmark-distance feature archive: <DATA_DIR><base name><SUBSET>.npz.
# `load_npz` comes from `helper`; it is unpacked here as two (features, labels)
# pairs. Note that y_train / y_test are re-bound by this cell.
DATA_FILENAME = 'ldist_subset'
(Xd_train, y_train), (Xd_test, y_test) = load_npz(
    f'{DATA_DIR}{DATA_FILENAME}{SUBSET}.npz'
)

((20000, 64), dtype('float32')) ((20000,), dtype('uint8'))
((10000, 64), dtype('float32')) ((10000,), dtype('uint8'))


In [11]:
# Standardise the landmark-distance features. The scaler is fitted on the
# training split only and then applied unchanged to both splits.
# X_train / X_test are re-bound here and now refer to this feature set.
scaler = StandardScaler().fit(Xd_train)
X_train = scaler.transform(Xd_train)
X_test = scaler.transform(Xd_test)

In [12]:
# For each explained-variance target, fit a PCA on the scaled training data
# and report how many components it kept (all results on one output line).
for variance in (0.90, 0.95, 0.97, 0.99):
    pca = PCA(n_components=variance).fit(X_train)
    print(f'{variance:.2f}: {pca.n_components_}', end=' ')

0.90: 3 0.95: 3 0.97: 3 0.99: 3 

In [13]:
# Fit the final PCA (99% explained-variance target) on the training split,
# project both splits with it, and display the number of components kept.
pca = PCA(n_components=0.99).fit(X_train)
PCA_train, PCA_test = pca.transform(X_train), pca.transform(X_test)
pca.n_components_

3

In [14]:
# Train a class-balanced logistic regression on the PCA-projected training
# features, then display how many solver iterations were used (cap: 3000).
clf = LogisticRegression(max_iter=3000, class_weight='balanced').fit(
    PCA_train, y_train
)
clf.n_iter_

array([13], dtype=int32)

In [15]:
# Accuracy of the fitted classifier on the PCA-projected training split
# (shown as the cell output; compare with the test accuracy computed next).
clf.score(PCA_train, y_train)

0.6762

In [16]:
# Evaluate on the held-out split. `accuracy_score` is imported explicitly at
# the top of the notebook instead of being reached through a `metrics` name
# that is only in scope via `from helper import *`.
y_hat = clf.predict(PCA_test)
accuracy = accuracy_score(y_test, y_hat)
print(accuracy)

0.6844


#### 68.4% test accuracy, compared to 75.4% without PCA dimensionality reduction (64 features down to 3)