### Queeny Mae Escabarte 21103388

In [1]:
def read_arff(file_path):
    """Parse a simple ARFF file into attribute names and raw data rows.

    Blank lines and ``%`` comment lines are skipped. Every ``@attribute``
    line contributes its second whitespace-separated token as the attribute
    name. Every line after ``@data`` is split on commas into one row.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(attributes, data)``. ``attributes`` is a list of attribute
        names; ``data`` is a list of rows, each a list of strings in which
        missing values are replaced by ``None``.
    """
    # 'm' is the marker this data set uses for missing values; '?' is the
    # marker defined by the ARFF format itself.
    missing_markers = {'m', '?'}
    attributes = []
    data = []
    in_data_section = False
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('%'):
                continue
            if in_data_section:
                # Strip each field so "1, m" and "1,m" are parsed the same way.
                fields = [field.strip() for field in line.split(',')]
                data.append([
                    None if field.lower() in missing_markers else field
                    for field in fields
                ])
                continue
            keyword = line.lower()
            if keyword.startswith('@attribute'):
                attributes.append(line.split()[1])
            elif keyword.startswith('@data'):
                in_data_section = True

    return attributes, data

def center_data(data):
    """Subtract the per-column mean from the numeric part of ``data``.

    The numeric part is whatever ``get_num_data`` returns for ``data``.
    Missing values (``None``) are ignored when computing a column mean and
    are treated as equal to that mean, so they become ``0.0`` after
    centering instead of raising ``TypeError``.

    Args:
        data: Raw rows; they are converted with ``get_num_data``.

    Returns:
        A tuple ``(centered_rows, mean_vals)``. ``mean_vals[i]`` is the mean
        of column ``i`` over its present values, or ``None`` when the column
        has no present value at all.
    """
    numeric_data = get_num_data(data)

    mean_vals = []
    for col in zip(*numeric_data):
        present = [value for value in col if value is not None]
        mean_vals.append(sum(present) / len(present) if present else None)

    centered = [
        [
            # A missing entry is imputed with the column mean, i.e. 0 once centered.
            0.0 if value is None else value - mean_vals[index]
            for index, value in enumerate(row)
        ]
        for row in numeric_data
    ]
    return centered, mean_vals

def get_num_data(data):
    """Return the numeric columns of ``data`` as floats.

    The first two fields of every row are skipped. Rows whose remaining
    fields are all ``None`` (or that have no remaining fields) are dropped;
    in the rows that are kept, ``None`` entries stay ``None``.
    """
    numeric_rows = []
    for row in data:
        fields = row[2:]
        if all(value is None for value in fields):
            continue
        numeric_rows.append(
            [None if value is None else float(value) for value in fields]
        )
    return numeric_rows
    
def mult_matrices(m, n):
    """Return the matrix product ``m @ n`` for matrices stored as lists of rows.

    Args:
        m: Left matrix as a list of rows.
        n: Right matrix as a list of rows.

    Returns:
        A list of rows; entry ``[i][j]`` is the dot product of row ``i`` of
        ``m`` with column ``j`` of ``n``.
    """
    if not m:
        return []
    # Transpose once instead of once per row of m.
    columns = trans(n)
    return [[dot_product(row, col) for col in columns] for row in m]
    
def trans(m):
    """Return the transpose of ``m`` (a non-empty list of rows).

    The number of columns is taken from the first row.
    """
    column_count = len(m[0])
    transposed = []
    for index in range(column_count):
        column = []
        for row in m:
            column.append(row[index])
        transposed.append(column)
    return transposed

def dot_product(a, b):
    """Return the sum of element-wise products of ``a`` and ``b``.

    Pairing stops at the shorter of the two sequences.
    """
    products = [x * y for x, y in zip(a, b)]
    return sum(products)

def subtract_vectors(a, b):
    """Return the element-wise difference ``a - b`` as a new list.

    Pairing stops at the shorter of the two sequences.
    """
    difference = []
    for left, right in zip(a, b):
        difference.append(left - right)
    return difference

def scalar_multiply(scalar, vector):
    """Return a new list with every element of ``vector`` scaled by ``scalar``."""
    scaled = []
    for component in vector:
        scaled.append(scalar * component)
    return scaled

def normalize(vector):
    """Scale ``vector`` to unit Euclidean length.

    A vector of magnitude zero is returned as a list of zeros of the same
    length.
    """
    length = magnitude(vector)
    if length == 0:
        return [0 for _ in vector]
    return [component / length for component in vector]

def magnitude(vector):
    """Return the Euclidean (L2) norm of ``vector``."""
    squares = [x ** 2 for x in vector]
    return sum(squares) ** 0.5

def pca(data, num_components):
    """Project the centered numeric part of ``data`` onto its leading directions.

    The directions are the rows of the ``V^T`` matrix returned by ``svd``;
    the first ``num_components`` of them are used as the projection basis.

    Args:
        data: Raw rows; ``center_data`` extracts and centers the numeric part.
        num_components: Number of leading directions to keep.

    Returns:
        A tuple ``(projected_data, cov)``. ``projected_data`` has one row per
        centered data row. ``cov`` is ``cd^T * cd`` of the centered data
        (it is not divided by the number of samples).
    """
    cd, _ = center_data(data)
    cov = mult_matrices(trans(cd), cd)

    # svd() centers its input and builds cd^T * cd itself, so it must be
    # given the raw rows. Passing `cov` made it re-center the covariance
    # matrix and drop its first two columns, which gave a basis whose
    # dimension did not match the centered data.
    _, _, v_t = svd(data)

    principal_comp = [row[:num_components] for row in trans(v_t)]
    projected_data = mult_matrices(cd, principal_comp)

    return projected_data, cov

def _remove_components(vector, basis):
    """Subtract from ``vector`` its projections onto the unit vectors in ``basis``."""
    for unit in basis:
        vector = subtract_vectors(
            vector, scalar_multiply(dot_product(vector, unit), unit)
        )
    return vector


def _dominant_eigenvector(matrix, found, iterations):
    """Power-iterate ``matrix`` in the subspace orthogonal to ``found``.

    Returns a unit vector, or ``None`` when ``matrix`` is numerically zero on
    that subspace (no further non-zero eigenvalue).
    """
    size = len(matrix)
    scale = max(abs(value) for row in matrix for value in row)
    if scale == 0:
        return None
    # Images smaller than this are treated as zero (rank deficiency).
    negligible = 1e-12 * scale

    # Try the all-ones vector first, then each unit basis vector, so that a
    # start lying in the null space or in span(found) does not end the search.
    starts = [[1.0] * size]
    starts.extend(
        [1.0 if i == j else 0.0 for j in range(size)] for i in range(size)
    )
    for start in starts:
        vec = _remove_components(start, found)
        if magnitude(vec) < 1e-8:
            continue
        vec = normalize(vec)
        for _ in range(iterations):
            # matrix is symmetric, so vec * matrix equals matrix * vec.
            image = _remove_components(mult_matrices([vec], matrix)[0], found)
            norm = magnitude(image)
            if norm <= negligible:
                vec = None
                break
            vec = scalar_multiply(1.0 / norm, image)
        if vec is not None:
            return vec
    return None


def svd(data, iterations=500):
    """Compute a thin SVD of the centered numeric part of ``data``.

    The right singular vectors are found as eigenvectors of ``cd^T * cd`` by
    power iteration with deflation (a fixed number of iterations per vector).
    The singular values are the square roots of the matching eigenvalues,
    and the left singular vectors are ``cd * v / sigma``.

    Args:
        data: Raw rows; ``center_data`` extracts and centers the numeric part.
        iterations: Number of power-iteration steps per singular vector.

    Returns:
        A tuple ``(u, s, v_t)`` with ``r`` retained components (those with a
        non-negligible singular value), ordered by decreasing singular value:
        ``u`` has one row per data row and ``r`` columns, ``s`` is a list of
        ``r`` singular values, and ``v_t`` has ``r`` rows, each a unit right
        singular vector. The sign of each vector pair is arbitrary.
    """
    cd, _ = center_data(data)

    # Eigenvectors of cd^T * cd are the right singular vectors of cd, and
    # its eigenvalues are the squared singular values.
    cov = mult_matrices(trans(cd), cd)

    right_vectors = []
    singular_values = []
    for _ in range(len(cov)):
        vec = _dominant_eigenvector(cov, right_vectors, iterations)
        if vec is None:
            break
        eigenvalue = dot_product(vec, mult_matrices([vec], cov)[0])
        if eigenvalue <= 0:
            # Guard the square root against round-off producing a non-positive value.
            break
        right_vectors.append(vec)
        singular_values.append(eigenvalue ** 0.5)

    # Power iteration normally finds components in decreasing order; sort to
    # guarantee it even if an unlucky start vector missed the dominant one.
    pairs = sorted(
        zip(singular_values, right_vectors),
        key=lambda pair: pair[0],
        reverse=True,
    )
    s = [sigma for sigma, _ in pairs]
    v_t = [vec for _, vec in pairs]

    u = [
        [dot_product(row, vec) / sigma for sigma, vec in pairs]
        for row in cd
    ]

    return u, s, v_t


def multiply_matrices(mat1, mat2):
    """Return the matrix product ``mat1 @ mat2``.

    Same computation as ``mult_matrices``, kept under this longer name.
    """
    return mult_matrices(mat1, mat2)

def normalize_columns(matrix):
    """Normalize a vector, or each column of a matrix, to unit length.

    When the first element of ``matrix`` is a number, ``matrix`` is treated
    as a single vector. Otherwise the result is a list of normalized
    columns, i.e. it is transposed relative to the input.
    """
    first = matrix[0]
    if not isinstance(first, (float, int)):
        normalized = []
        for column in trans(matrix):
            normalized.append(normalize(column))
        return normalized
    return normalize(matrix)

def row_form(row):
    """Format a row of ten values as one line of the fixed-width text table.

    ``None`` values are shown as ``N/A``. The second column is 16 characters
    wide; all other columns are 5 characters wide.
    """
    cells = []
    for value in row:
        cells.append("N/A" if value is None else "{:<5}".format(value))
    template = "| {:<5} | {:<16} | {:<5} | {:<5} | {:<5} | {:<5} |{:<5} |{:<5} |{:<5} |{:<5} |"
    return template.format(*cells)

def print_table(rows, col):
    """Print ``rows`` as a text table under a header built from ``col``.

    The header and every row are formatted with ``row_form``; a line of
    dashes as long as the header separates them.
    """
    header_line = row_form(col)
    divider = "-" * len(header_line)
    print(header_line)
    print(divider)
    for line in map(row_form, rows):
        print(line)
        
attrib, data = read_arff('2017.arff')
# Work on the first 10 attributes and the first 16 records only.
attrib = attrib[:10]
data = [row[:10] for row in data[:16]]
proj, cov = pca(data, 3)

print("\nOriginal Data:")
print_table(data, attrib)

print("\nCentered Data:")
centered_data, mean_values = center_data(data)
for row in centered_data:
    print(row)

print("\nCovariance Matrix:")
for row in cov:
    print(row)

# pca() returns (projected_data, cov), so `proj` is the list of projected
# rows. It is not a (U, S) pair: proj[0] and proj[1] are just the first two
# projected rows. The singular values are printed from svd() below.
print("\nProjected Data (first 3 principal components):")
for row in proj:
    print(row)


u_svd, s_svd, v_t_svd = svd(data)

print("\nMatrix U (SVD):")
for row in u_svd:
    print(row)

print("\nMatrix S (SVD):")
print(s_svd)

print("\nMatrix V^T (SVD):")
for row in v_t_svd:
    print(row)



Original Data:
| Num   | Country          | X1    | X2    | X3    | X4    |X5    |X6    |X7    |X8    |
----------------------------------------------------------------------------------------
| 10    | Hungary          | 0.14  | 0.53  | 0.19  | 1.41  |0.33  |0.14  |0.89  |1.08  |
| 22    | Poland           | 0.01  | 0.5   | 0.07  | 1.4   |0.06  |0.03  |1.01  |0.65  |
| 27    | Hungary          | 0.03  | 0.74  | 0.01  | 1.02  |0     |0.03  |0.35  |0.93  |
| 73    | Poland           | 0     | 0.58  | 0.15  | 1.29  |0.22  |0.01  |0.72  |0.85  |
| 74    | Poland           | 0     | 0     | 0     | 0     |0     |0     |0     |0     |
| 100   | Poland           | 0     | 0     | 0     | 0     |0     |0     |0     |0     |
| 139   | Poland           | 0.01  | 0.61  | 0.14  | 1.24  |0     |0.06  |0.63  |3.56  |
| 142   | 'Czech Republic' | 0.13  | 0.14  | 0.13  | 3.76  |0.41  |0.13  |6.02  |0.16  |
| 175   | Poland           | 0.02  | 0.53  | 0.05  | 1.14  |0.02  |0.04  |0.89  |1.41  |
| 217

When I compared my custom PCA and SVD implementation with scikit-learn's library, the results turned out a bit different. One possible reason for this is how the data is preprocessed, in particular standardization and categorical data. Scikit-learn's PCA centers the data but does not standardize it on its own; features only play an equally weighted role in building the principal components if they are scaled first (for example with `StandardScaler`). In my custom implementation, I only centered the data without standardizing it, so if the scikit-learn run used standardized data, the components will differ. Scikit-learn's PCA also does not handle categorical data by itself: such columns have to be dropped or encoded beforehand, whereas my code simply skips the first two (non-numeric) columns. Another factor is the convergence criteria. My SVD uses a fixed number of power iterations, while scikit-learn relies on library solvers that run until they converge. So, aligning my preprocessing with what was used on the scikit-learn side, especially in terms of standardization and handling categorical data, and iterating until convergence, might make our results closer.