In [None]:
import numpy as np

# Read the first four comma-separated columns, skipping the header row.
data = np.genfromtxt(
    'dataset.txt',
    delimiter=',',
    skip_header=1,
    usecols=(0,1,2,3),
)
print(f"shape of data: {data.shape}")

# Samples are in rows and features in columns, so tell np.cov that the
# variables run along the columns instead of transposing the array first.
cov_mat = np.cov(data, rowvar=False)
print("\ncovariance matrix:")
print(cov_mat)

Dataset shape: (150, 4)
Number of samples: 150
Number of features: 4

Covariance Matrix:
[[ 0.68569351 -0.03926846  1.27368233  0.5169038 ]
 [-0.03926846  0.18800403 -0.32171275 -0.11798121]
 [ 1.27368233 -0.32171275  3.11317942  1.29638747]
 [ 0.5169038  -0.11798121  1.29638747  0.58241432]]


## Part (b): Feature Correlations
Finding the most and least correlated feature pairs, ranked by the absolute value of their correlation coefficient.

In [None]:
# Correlation matrix; np.corrcoef treats rows as variables, hence the transpose.
corr = np.corrcoef(data.T)
features = ['f1', 'f2', 'f3', 'f4']

# Only label pairs when there is exactly one name per data column.
if data.shape[1] == len(features):
    # Signed coefficients of the strongest / weakest pair. The infinities are
    # "nothing found yet" placeholders and are overwritten by the first valid pair.
    max_c = float('-inf')
    min_c = float('inf')
    max_p = (features[0], features[1])
    min_p = (features[0], features[1])

    # Magnitudes are tracked separately from the signed values. Comparing
    # against abs(max_c) would compare against abs(-inf) == inf, which no
    # coefficient can exceed, so the maximum would never be updated.
    max_abs = -1.0          # strictly below any possible |r|
    min_abs = float('inf')  # strictly above any possible |r|

    print("correlation matrix shape:", corr.shape)

    # Walk the strict upper triangle: each unordered pair once, no diagonal.
    for i in range(len(corr)):
        for j in range(i + 1, len(corr)):
            c = corr[i, j]
            # np.corrcoef can yield NaN (e.g. for a zero-variance column); skip those.
            if np.isnan(c):
                continue
            if abs(c) > max_abs:
                max_abs = abs(c)
                max_c = c
                max_p = (features[i], features[j])
            if abs(c) < min_abs:
                min_abs = abs(c)
                min_c = c
                min_p = (features[i], features[j])

    # max_abs only leaves its initial -1.0 once a valid coefficient was seen.
    if max_abs >= 0:
        print(f"most correlated: {max_p[0]} and {max_p[1]} (r={max_c:.3f})")
        print(f"least correlated: {min_p[0]} and {min_p[1]} (r={min_c:.3f})")
    else:
        print("no valid correlations found")

    print("\ncorrelation matrix:")
    print(corr)
else:
    # Make the mismatch visible instead of silently skipping the analysis.
    print(f"expected {len(features)} feature columns but data has {data.shape[1]}; skipping correlation analysis")

Correlation matrix shape: (4, 4)
No valid correlations found in the data

Correlation Matrix:
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]


## Part (c): Sample Statistics
Computing sample mean and variance for each feature

In [None]:
# Per-feature sample statistics, one value per column of `data`.
means = np.mean(data, axis=0)
# ddof=1 gives the unbiased *sample* variance (divide by N-1). np.var defaults
# to ddof=0 (population variance), which would not match the diagonal of the
# covariance matrix produced by np.cov with its default normalisation.
# NOTE: `vars` shadows the Python builtin; the name is kept in case other cells use it.
vars = np.var(data, axis=0, ddof=1)

print("stats for each feature:")
for name, mean, variance in zip(features, means, vars):
    print(f"\n{name}:")
    print(f"mean: {mean:.3f}")
    print(f"variance: {variance:.3f}")

Sample Statistics for each feature:

Feature 1:
Mean: 5.843
Variance: 0.681

Feature 2:
Mean: 3.054
Variance: 0.187

Feature 3:
Mean: 3.759
Variance: 3.092

Feature 4:
Mean: 1.199
Variance: 0.579


## Part (d): Eigenvalue Analysis
Computing eigenvalues of the covariance matrix and comparing with feature variances

In [None]:
# Eigenvalues of the covariance matrix.
# A covariance matrix is symmetric, so use eigvalsh: it is the solver intended
# for symmetric/Hermitian input and always returns real eigenvalues in
# ascending order, whereas the general eigvals solver may return a complex dtype.
eig_vals = np.linalg.eigvalsh(cov_mat)[::-1]  # reverse -> descending

# Per-feature variances are the diagonal of the covariance matrix.
feat_vars = np.diag(cov_mat)

print("eigenvalues:")
for i, ev in enumerate(eig_vals):
    print(f"λ{i+1}: {ev:.3f}")

print("\ncomparing variances and eigenvalues:")
print("variances:", feat_vars)
print("eigenvalues:", eig_vals)

# Both sums equal the trace of the covariance matrix, so they should agree
# up to floating-point rounding.
print(f"\nsum of variances: {np.sum(feat_vars):.3f}")
print(f"sum of eigenvalues: {np.sum(eig_vals):.3f}")

Eigenvalues of the covariance matrix:
λ1: 4.225
λ2: 0.242
λ3: 0.079
λ4: 0.024

Comparison of feature variances and eigenvalues:

Feature Variances (diagonal of covariance matrix):
[0.68569351 0.18800403 3.11317942 0.58241432]

Eigenvalues:
[4.22484077 0.24224357 0.07852391 0.02368303]

Sum of feature variances: 4.569291275167785
Sum of eigenvalues: 4.569291275167785
