In [4]:
import pandas as pd
import numpy as np
import json
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.decomposition import PCA, FastICA

np.set_printoptions(suppress=True, linewidth=np.inf)

In [79]:
'''
Attribute Information:
1. CRIM      per capita crime rate by town
2. ZN        proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS     proportion of non-retail business acres per town
4. CHAS      Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. NOX       nitric oxides concentration (parts per 10 million)
6. RM        average number of rooms per dwelling
7. AGE       proportion of owner-occupied units built prior to 1940
8. DIS       weighted distances to five Boston employment centres
9. RAD       index of accessibility to radial highways
10. TAX      full-value property-tax rate per $10,000
11. PTRATIO  pupil-teacher ratio by town
12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. LSTAT    % lower status of the population
14. MEDV     Median value of owner-occupied homes in $1000's
'''

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'TGT']
boston = pd.read_csv(url, sep=' ', skipinitialspace=True, header=None, names=cols[0:13], index_col=False)
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48


In [65]:
boston_centered = pd.concat([pd.DataFrame(boston[cols[i]] - boston[cols[i]].mean()) for i in range(13)], axis=1)
boston_normalized = pd.concat([pd.DataFrame(boston[cols[i]] / boston[cols[i]].std()) for i in range(13)], axis=1)
boston_centered.mean(), boston_normalized.var()

(CRIM      -3.082295e-15
 ZN         2.076161e-14
 INDUS      5.960976e-15
 CHAS      -1.189760e-16
 NOX        6.582350e-18
 RM        -2.632940e-17
 AGE       -5.420346e-15
 DIS       -2.913787e-16
 RAD        4.683123e-15
 TAX        3.370163e-13
 PTRATIO   -2.632940e-15
 B         -5.504600e-14
 LSTAT     -8.969549e-16
 dtype: float64, CRIM       1.0
 ZN         1.0
 INDUS      1.0
 CHAS       1.0
 NOX        1.0
 RM         1.0
 AGE        1.0
 DIS        1.0
 RAD        1.0
 TAX        1.0
 PTRATIO    1.0
 B          1.0
 LSTAT      1.0
 dtype: float64)

In [154]:
n, d = boston_normalized.shape
U, D, VT = np.linalg.svd(boston_normalized, full_matrices=False)
Hauptkomponenten_qi = VT.T
Projektionen_ai = (U @ np.diag(D))
Hauptkomponenten_qi.std(axis=1), (D / (n - 1))

(array([0.27205825, 0.23998015, 0.26964294, 0.26666026, 0.27598248, 0.2641832 , 0.24182258, 0.25665465, 0.27571495, 0.26835463, 0.27728218, 0.27580876, 0.27632962]),
 array([0.65193608, 0.10741177, 0.05327283, 0.04922365, 0.04120506, 0.03668657, 0.03435383, 0.03241302, 0.02411841, 0.02255619, 0.02030152, 0.01919223, 0.01141862]))