## 3. Generate a data matrix, X with 200 samples and 10 features. (You can assume any distribution).

In [1]:
# importing necessary packages
import numpy as np
import matplotlib as plt

In [37]:
# seed to get constant values
np.random.seed(42)

Let us assume normal distribution

In [38]:
# generate a normal dataset
X = np.random.normal(0, 2, size = (200,10))

### (a) Perform PCA on the matrix and get the first 2 principal components for all the samples

In [39]:
# SVD
U,S,Vt = np.linalg.svd(X)

In [40]:
print(U.shape,S.shape,Vt.shape)

(200, 200) (10,) (10, 10)


In [41]:
# Function to get the principal components 
def get_principal_comps (X, Vt,n):
    # X: data matrix, Vt: right singular matrix
    PC = X@(Vt.T)
    return PC[:,:n]

In [42]:
# perform PCA and get the first 2 principal components for all samples
PC = get_principal_comps(X,Vt,2)

In [43]:
print(PC[0:10])

[[ 2.49709594 -0.83997717]
 [-1.57825482  2.69370692]
 [-1.12753302  2.63318681]
 [-4.33505452  0.7743873 ]
 [-1.35639119  0.69920028]
 [ 0.99089568 -0.3587793 ]
 [-0.87945984 -2.62018755]
 [-0.59341777  0.03773606]
 [ 0.18302839  1.04688458]
 [-2.22860789 -1.14654354]]


In [44]:
# function to obtain variance explained
def get_var(S,n):
    # S: vector of single values
    eig = S**2
    return eig[n]/np.sum(eig)


In [45]:
first_comp_var_exp = get_var(S,1)
print(first_comp_var_exp)

0.12332788940034294


In [46]:
second_comp_var_exp = get_var(S,2)
print(second_comp_var_exp)

0.11835069469664507


### (b) Calculate the sum of the distance between all the samples and the plane formed by the obtained 2 principal components.

In [47]:
# Calculate the mean of the data
mean = np.mean(X, axis=0)

# Calculate the sum of distances between samples and the plane formed by the first 2 principal components
sum_of_distances = 0
for i in range(X.shape[0]):
    distance = np.linalg.norm(X[i] - mean - PC[i, 0] * Vt[0, :] - PC[i, 1] * Vt[1, :])
    sum_of_distances += distance

print("Direction of the plane formed by the first 2 principal components:", Vt[:2, :])
print("Sum of distances for the plane formed by the first 2 principal components:", sum_of_distances)


Direction of the plane formed by the first 2 principal components: [[ 0.04732586 -0.75694464  0.05041621  0.3287462  -0.25543734  0.24298967
   0.31315648 -0.0457064   0.07084292  0.29105366]
 [ 0.26978587 -0.22294344  0.62154962 -0.06075614 -0.12742787 -0.1366051
  -0.52379048 -0.29620584 -0.29200073 -0.07238067]]
Sum of distances for the plane formed by the first 2 principal components: 1030.4159959400874


### (c) Generate 50 random planes and calculate the sum of distances between the samples and each one of the planes. Verify that the sum distances are the least for the plane obtained from principal components

In [48]:
# Generate 50 random planes and calculate the sum of distances for each plane
num_planes = 50
min_sum_of_distances = float('inf')
best_plane = None

for _ in range(num_planes):
    random_normal = np.random.normal(size=10)
    random_normal /= np.linalg.norm(random_normal)
    
    sum_of_dist = 0
    for i in range(X.shape[0]):
        distance = np.linalg.norm(X[i] - mean - np.dot(X[i] - mean, random_normal) * random_normal)
        sum_of_dist += distance
        
    if sum_of_dist < min_sum_of_distances:
        min_sum_of_distances = sum_of_dist
        best_plane = random_normal

print("Sum of distances for the best plane obtained from random planes:", min_sum_of_distances)
print("Distance from the plane formed by the first 2 principal components is:", sum_of_distances)

Sum of distances for the best plane obtained from random planes: 1132.264611741975
Distance from the plane formed by the first 2 principal components is: 1030.4159959400874


### Hence it is verified that the sum distances are the least for the plane obtained from principal components