In [1]:
import numpy as np

# 5x5 magic square: every row and every column sums to 65.
# Stored as floats so the decompositions below work in floating point.
a = np.array(
    [
        [17, 24, 1, 8, 15],
        [23, 5, 7, 14, 16],
        [4, 6, 13, 20, 22],
        [10, 12, 19, 21, 3],
        [11, 18, 25, 2, 9],
    ],
    dtype=np.float64,
)

## Exercise 1

In [3]:
# Full SVD of the raw (uncentered) matrix; `s` holds the singular values.
x, s, y = np.linalg.svd(a)

# Each component's share of the total squared singular values.
sq = s ** 2
explained_var = sq / np.sum(sq)
print(f'Explained variance = {explained_var}')

Explained variance = [0.76470588 0.09201289 0.08513021 0.03251685 0.02563417]


According to the [NumPy documentation](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html), `np.linalg.svd` returns the Hermitian transpose of $V$, which is the transpose of $V$ for real matrices. We can see that treating `y` as $V^T$ gives the correct reconstruction, and treating `y` as $V$ does not.

In [6]:
# Place the singular values on the diagonal of a matrix shaped like `a`.
sigma = np.zeros_like(a)
np.fill_diagonal(sigma, s)

# The left factor is shared by both candidate reconstructions.
left_scaled = x @ sigma

print('Correct (y = V^T)')
print(left_scaled @ y)
print()

print('Incorrect (y = V)')
print(left_scaled @ y.T)

Correct (y = V^T)
[[17. 24.  1.  8. 15.]
 [23.  5.  7. 14. 16.]
 [ 4.  6. 13. 20. 22.]
 [10. 12. 19. 21.  3.]
 [11. 18. 25.  2.  9.]]

Incorrect (y = V)
[[ 14.76140681  23.07906839 -15.73687265  12.03822041  -3.4481241 ]
 [ 19.59371802   5.75040259 -10.94388017  22.75873899  -0.53886058]
 [ 15.34302925   0.46199784 -12.78540028  17.1745351  -20.27182516]
 [ 16.33151316  10.89095607   5.69931752  19.1526371  -16.44271953]
 [ -1.02966724  18.61178131  -2.0699959   25.19939361 -12.97100847]]


In [9]:
# Rebuild the matrix from its factors and measure how far it is from `a`.
b = (x @ sigma) @ y
is_close = np.allclose(b, a)
frob_err = np.linalg.norm(a - b)
print(f'All components close? {is_close}')
print(f'Frobenius norm of difference = {frob_err}')

All components close? True
Frobenius norm of difference = 8.523187293941492e-14


Project $A$ onto the first two principal directions (the first two right singular vectors of the uncentered matrix).

In [25]:
# The first two rows of `y` (i.e. of V^T) become the columns of the projection matrix.
w = y[:2].T
ar = a @ w
print(ar)

[[-2.90688837e+01 -1.23024779e+01]
 [-2.90688837e+01 -1.01407417e+01]
 [-2.90688837e+01 -5.31241717e-13]
 [-2.90688837e+01  1.01407417e+01]
 [-2.90688837e+01  1.23024779e+01]]


## Exercise 2

In [26]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on `a`, then project the same data with it.
pca = PCA(n_components=2).fit(a)
ar2 = pca.transform(a)
print(ar2)

[[ 1.23024779e+01 -1.10967458e+01]
 [ 1.01407417e+01  4.23857973e+00]
 [-6.25055563e-14  1.37163321e+01]
 [-1.01407417e+01  4.23857973e+00]
 [-1.23024779e+01 -1.10967458e+01]]


Variance explained by each component:

In [29]:
# Per-component variance ratios reported by the fitted two-component model.
ratios = pca.explained_variance_ratio_
print(ratios)

[0.39105478 0.3618034 ]


The reduced matrix is not the same as the SVD projection from Exercise 1 -- `PCA` *centers* the data (subtracts each column's mean) before performing the SVD.

In [33]:
# Keep all five components so the cumulative ratio can be read off directly.
pca_full = PCA(n_components=5)
pca_full.fit(a)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
print(cumulative)

[0.39105478 0.75285817 0.89105478 1.         1.        ]


Looking at the cumulative sum of the explained variance ratios, we see that 3 components are necessary to explain at least 80% of the variance.

## Exercise 3

Center the data.

In [35]:
# PCA centers each feature (column), so subtract the per-column mean (axis=0).
# axis=1 would instead remove each sample's own mean across features, which is
# a different operation for general data.
a_center = a - np.mean(a, axis=0, keepdims=True)

Now do the SVD analysis.

In [37]:
# SVD of the centered matrix; the rows of `y` (V^T) are the principal directions.
x, s, y = np.linalg.svd(a_center)
sigma = np.zeros_like(a_center)
np.fill_diagonal(sigma, s)

# PCA scores are projections of the *centered* data, so project `a_center`
# rather than the raw `a`. For this particular matrix the two coincide, but in
# general they differ.
w = y.T[:, :2]
ar3 = a_center @ w
print(ar3)

[[-1.23024779e+01  1.10967458e+01]
 [-1.01407417e+01 -4.23857973e+00]
 [ 5.97299987e-14 -1.37163321e+01]
 [ 1.01407417e+01 -4.23857973e+00]
 [ 1.23024779e+01  1.10967458e+01]]


The reduced matrix is the same as that returned by Scikit-Learn (up to sign, which is OK because unit-norm singular vectors are only determined up to a factor of $\pm 1$).

In [38]:
# Compare the manual projection with scikit-learn's, allowing for a global sign flip.
same_sign = np.allclose(ar3, ar2)
flipped_sign = np.allclose(ar3, -ar2)
print(f'All components close with same sign? {same_sign}')
print(f'All components close with opposite sign? {flipped_sign}')

All components close with same sign? False
All components close with opposite sign? True


## Exercise 4

In [53]:
import pandas
# Load the raw diagnosis table and preview its first five rows.
df = pandas.read_csv('diagnosis.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


Since we are going to use the reduced data for classification, we should exclude the **diagnosis** feature from the PCA dimension reduction. We also need to exclude the erroneously loaded **Unnamed: 32** feature, and the useless **id** feature.

In [54]:
# Encode the label as floats: 1.0 where diagnosis is 'M', 0.0 otherwise.
diagnosis = (df['diagnosis'] == 'M').to_numpy().astype(float)

# Remove the label, the empty trailing column and the identifier in one pass,
# leaving only the numeric features.
df = df.drop(columns=['diagnosis', 'Unnamed: 32', 'id'])
df.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [55]:
# Reduce the feature table to its first two principal components.
# NOTE(review): the PCA is fitted on every row before the train/test split and
# on unscaled features -- confirm this is intended for the exercise.
diagnosis_pca = PCA(n_components=2)
diagnosis_pca.fit(df)
dfr = diagnosis_pca.transform(df)

In [58]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dfr,
    diagnosis,
    test_size=0.25,
    random_state=20,
)

In [59]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier on the two PCA features.
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

for x, y in [(x_train, y)]