# PCA and SVD [Pre Final Exam]
**Name: Sherly R. Jao**
**Course and Year: BSCS-3**
**Subj: CS-3101N**

In [4]:
# Parsing the file
def parse_arff(file_path):
    attributes = []
    data = []

    with open(file_path, 'r') as file:
        in_data_section = False

        for line in file:
            line = line.strip()

            if not line:
                continue

            if line.startswith('%'):
                continue

            if in_data_section:
                data.append(line.split(','))
            else:
                if line.lower().startswith('@attribute'):
                    attribute_name = line.split()[1]
                    attributes.append(attribute_name)
                elif line.lower().startswith('@data'):
                    in_data_section = True

    return attributes, data


def dot_product(v1, v2):
    return sum(x * y for x, y in zip(v1, v2))

def vector_norm(v):
    return sum(x**2 for x in v)**0.5

def matrix_multiplication(matrix1, matrix2):
    result = []
    for i in range(len(matrix1)):
        row = []
        for j in range(len(matrix2[0])):
            element = sum(matrix1[i][k] * matrix2[k][j] for k in range(len(matrix2)))
            row.append(element)
        result.append(row)
    return result

def qr_algorithm(matrix, num_iterations=100):
    n = len(matrix)
    eigenvalues = [matrix[i][i] for i in range(n)]
    eigenvectors = [[1.0 if i == j else 0.0 for j in range(n)] for i in range(n)]

    for _ in range(num_iterations):
        # QR decomposition
        q, r = qr_decomposition(matrix)

        # Update the matrix
        matrix = matrix_multiplication(r, q)

        # Update eigenvectors
        eigenvectors = matrix_multiplication(eigenvectors, q)

    return eigenvalues, eigenvectors

def qr_decomposition(matrix):
    n = len(matrix)
    q = [[0.0] * n for _ in range(n)]
    r = [[0.0] * n for _ in range(n)]

    for j in range(n):
        v = [matrix[i][j] for i in range(j, n)]
        norm_v = vector_norm(v)
        q[j][j] = 1.0
        for i in range(j + 1, n):
            q[i][j] = v[i - j] / norm_v

        for i in range(j, n):
            for k in range(j, n):
                r[i][k] = matrix[i][k] - 2 * q[i][j] * dot_product(q[j], matrix[i])

    return q, r

arff_file_path = '2020.arff'
attributes, data = parse_arff(arff_file_path)

selected_eigenvectors = []
num_selected_eigenvectors = 2

# Limit to a small dataset for testing
attributes = attributes[:6]
data = [row[:6] for row in data[:5]]

header = " | ".join(attributes)
max_widths = [len(attribute) for attribute in attributes]
for row in data:
    for i, value in enumerate(row):
        max_widths[i] = max(max_widths[i], len(value))

numerical_data = [
    [float(value) if value.lower() != 'm' else None for value in row[2:]]
    for row in data
]

numerical_data = [row for row in numerical_data if all(value is not None for value in row)]

mean_values = [
    sum(filter(None, col)) / len(list(filter(None, col))) if numerical_data else None
    for col in zip(*numerical_data)
]

centered_data = [
    [(col - mean) if col is not None else None for col, mean in zip(row, mean_values)]
    for row in numerical_data
]

covariance_matrix = [[0] * len(mean_values) for _ in range(len(mean_values))]

for i in range(len(mean_values)):
    for j in range(len(mean_values)):
        for k in range(len(centered_data)):
            covariance_matrix[i][j] += centered_data[k][i] * centered_data[k][j]

covariance_matrix = [[elem / (len(centered_data) - 1) for elem in row] for row in covariance_matrix]

eigenvalues, eigenvectors = qr_algorithm(covariance_matrix)

sorted_indices = list(range(len(eigenvalues)))
sorted_indices = sorted(sorted_indices, key=lambda i: eigenvalues[i], reverse=True)
sorted_eigenvalues = [eigenvalues[i] for i in sorted_indices]
sorted_eigenvectors = [eigenvectors[i][:] for i in sorted_indices]

for i in range(num_selected_eigenvectors):
    selected_eigenvectors.append(sorted_eigenvectors[i])

result_matrix = matrix_multiplication(centered_data, selected_eigenvectors)

# Print the result
print("\nTable:")
header_str = " | ".join(f"{attr:>{width}}" for attr, width in zip(attributes, max_widths))
print(f"{header_str}")

for row in data:
    row_str = " | ".join(f"{value:>{width}}" for value, width in zip(row, max_widths))
    print(row_str)

print("\nCentered Data:")
for row in centered_data:
    print(row)

print("\nMean Values:")
for mean in mean_values:
    print(mean)

print("\nCovariance Matrix:")
for matrix in covariance_matrix:
    print(matrix)

print("\nEigenvalues:", eigenvalues)
print("Eigenvectors:", eigenvectors)

print("\nSorted Eigenvalues:")
print(sorted_eigenvalues)

print("\nSorted Eigenvectors:")
for row in sorted_eigenvectors:
    print(row)

print("\nSelected Eigenvectors Matrix:")
for row in selected_eigenvectors:
    print(row)

print("\nResult Matrix:")
for row in result_matrix:
    print(row)


Table:
Num | Country |    X1 |   X2 |    X3 |   X4
 10 | Hungary |     m |    m |     m |    m
 22 |  Poland | -0.03 | 0.58 | -0.03 | 0.85
 27 | Hungary |     m |    m |     m |    m
 73 |  Poland |  0.01 | 0.71 |  0.08 | 1.13
 74 |  Poland | -0.13 |  1.1 | -0.43 | 0.27

Centered Data:
[0.019999999999999997, -0.21666666666666679, 0.09666666666666668, 0.09999999999999998]
[0.06, -0.08666666666666678, 0.20666666666666667, 0.3799999999999999]
[-0.08000000000000002, 0.30333333333333334, -0.30333333333333334, -0.48]

Mean Values:
-0.049999999999999996
0.7966666666666667
-0.12666666666666668
0.75

Covariance Matrix:
[0.0052000000000000015, -0.016900000000000005, 0.019300000000000005, 0.031599999999999996]
[-0.016900000000000005, 0.07323333333333337, -0.06543333333333336, -0.10010000000000002]
[0.019300000000000005, -0.06543333333333336, 0.07203333333333334, 0.1169]
[0.031599999999999996, -0.10010000000000002, 0.1169, 0.19239999999999996]

Eigenvalues: [0.0052000000000000015, 0.0732333333333

**What is similar or dissimilar between with or without the use of libraries?**

The disparity in results between implementing the calculations with and without libraries is substantial. Without utilizing libraries, the process becomes notably more challenging and time-consuming, as each step must be individually computed. This results in a considerably longer runtime, particularly when dealing with large datasets. The absence of optimization through libraries not only makes the calculations cumbersome but also significantly burdens the computational resources, causing considerable lag on my laptop. To mitigate this, I've restricted the dataset for testing to a smaller subset, preventing potential overload and ensuring a more manageable execution.

Variations in results can occur due to the precision of decimal points in computations. The inherent differences in how numerical calculations are handled by different libraries or methods may lead to slight discrepancies in the final output. These variations can be exacerbated when performing manual calculations without the optimization provided by specialized libraries. It's common to encounter such nuances in numerical precision, and it's often a trade-off between computational efficiency and exactness.