In [3]:
def parse_arff(file_path):
    """Parse an ARFF file into attribute descriptors and raw data rows.

    Blank lines and lines starting with ``%`` are skipped. Before the
    ``@data`` marker, ``@relation`` lines are ignored and each
    ``@attribute`` line yields one descriptor. After ``@data`` every
    remaining line is treated as a data row and split on commas.
    Keywords are matched case-insensitively.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(attrs, data)``:

        * ``attrs`` -- list of ``(name, 'nominal', values)`` tuples for
          attributes declared with a ``{...}`` value list, and
          ``(name, 'numeric', 0)`` tuples for every other attribute.
        * ``data`` -- list of rows, each a list of cell strings.

    Note:
        Rows and value lists are split on every comma and the attribute
        name is the second whitespace-separated token, so quoted values
        containing commas and attribute names containing spaces are not
        supported.
    """
    attrs = []
    data = []
    data_started = False

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()

            if not line or line.startswith('%'):
                continue

            # Once the data section has begun, every line is a data row.
            if data_started:
                # Strip each cell so "a, b" and "a,b" parse identically.
                data.append([cell.strip() for cell in line.split(',')])
                continue

            keyword = line.lower()

            if keyword.startswith('@relation'):
                continue

            if keyword.startswith('@attribute'):
                attr_name = line.split()[1]
                if '{' in line:
                    value_list = line[line.index('{') + 1:line.index('}')]
                    # Strip so declared values match the stripped data cells.
                    values = [value.strip() for value in value_list.split(',')]
                    attrs.append((attr_name, 'nominal', values))
                else:
                    attrs.append((attr_name, 'numeric', 0))
                continue

            if keyword.startswith('@data'):
                data_started = True

    return attrs, data

def standardize_data(features):
    """Standardize every feature vector to zero mean and unit variance.

    The population standard deviation (denominator ``len(vector)``) is
    used. The input is left untouched; a new list of lists is returned.

    Args:
        features: Sequence of feature vectors, where ``features[k]`` holds
            all observations of feature ``k``. Each vector must be
            non-empty.

    Returns:
        A tuple ``(std_features, mean_X, std_X)``: the standardized
        vectors, the per-feature means and the per-feature standard
        deviations.

    Note:
        A constant feature has a standard deviation of 0. Its values are
        returned as zeros (deviation from the mean divided by 1) instead of
        dividing by zero; ``std_X`` still reports 0.0 for it.
    """
    mean_X = []
    std_X = []
    std_features = []

    for values in features:
        count = len(values)
        mean = sum(values) / count
        std = (sum((x - mean) ** 2 for x in values) / count) ** 0.5

        mean_X.append(mean)
        std_X.append(std)

        # Avoid ZeroDivisionError for constant features.
        scale = std if std > 0 else 1.0
        std_features.append([(x - mean) / scale for x in values])

    return std_features, mean_X, std_X

def calculate_covariance_matrix(features):
    """Compute the sample covariance matrix of a set of feature vectors.

    Args:
        features: Sequence of feature vectors, where ``features[k]`` holds
            all observations of feature ``k``. All vectors must have the
            same length.

    Returns:
        A square list of lists with one row and column per feature. Entry
        ``[x][y]`` is the sample covariance (denominator ``n - 1``) between
        features ``x`` and ``y``. An empty input yields ``[]``.

    Raises:
        ValueError: If the vectors differ in length or hold fewer than two
            observations (the ``n - 1`` denominator would be zero).
    """
    if not features:
        return []

    num_observations = len(features[0])
    if any(len(feature) != num_observations for feature in features):
        raise ValueError("all feature vectors must have the same length")
    if num_observations < 2:
        raise ValueError("at least two observations per feature are required")

    # Centre each feature once; the mean is taken over the observations,
    # not over the number of features.
    centered = []
    for feature in features:
        mean = sum(feature) / num_observations
        centered.append([value - mean for value in feature])

    num_features = len(features)
    cov_matrix = [[0.0] * num_features for _ in range(num_features)]

    # The matrix is symmetric, so compute the upper triangle and mirror it.
    for x in range(num_features):
        for y in range(x, num_features):
            covariance = sum(
                a * b for a, b in zip(centered[x], centered[y])
            ) / (num_observations - 1)
            cov_matrix[x][y] = covariance
            cov_matrix[y][x] = covariance

    return cov_matrix

def dot_prod(vec1, vec2):
    """Return the dot product of two vectors.

    Elements are paired positionally; if the vectors differ in length the
    surplus elements of the longer one are ignored.
    """
    products = [a * b for a, b in zip(vec1, vec2)]
    return sum(products)

def scalar_mult(scalar, vec):
    """Return a new list holding every element of ``vec`` times ``scalar``."""
    scaled = []
    for component in vec:
        scaled.append(scalar * component)
    return scaled

def subtract_vecs(vec1, vec2):
    """Return the element-wise difference ``vec1 - vec2`` as a new list.

    Elements are paired positionally; surplus elements of the longer
    vector are ignored.
    """
    difference = []
    for left, right in zip(vec1, vec2):
        difference.append(left - right)
    return difference

def transpose(matrix):
    """Return the transpose of a non-empty list-of-lists matrix.

    The number of output rows is taken from the length of the first input
    row.
    """
    width = len(matrix[0])
    flipped = []
    for col_index in range(width):
        flipped.append([row[col_index] for row in matrix])
    return flipped

def mat_mult(mat1, mat2):
    """Return the matrix product ``mat1 @ mat2`` as a list of lists.

    Args:
        mat1: Left operand, a list of rows.
        mat2: Right operand, a list of rows.

    Returns:
        A list with one row per row of ``mat1``; each entry is the dot
        product of that row with a column of ``mat2``. An empty ``mat1``
        yields ``[]``.
    """
    if not mat1:
        return []

    # Transpose once instead of once per row of mat1.
    columns = transpose(mat2)
    return [[dot_prod(row, col) for col in columns] for row in mat1]

def svd(data):
    """Compute the leading singular triple of the column-centred data.

    The column means are subtracted from ``data``. Power iteration (50
    rounds) on the Gram matrix ``X^T X`` of the centred matrix ``X`` then
    estimates the dominant right singular vector ``v``. The singular value
    is ``||X v||`` and the left singular vector is ``X v / sigma``.

    Args:
        data: Non-empty list of equally long rows of numbers.

    Returns:
        A tuple ``(u_mat, s_mat, v_col)``:

        * ``u_mat`` -- a one-row matrix holding the left singular vector
          (one entry per row of ``data``); all zeros when the singular
          value is 0.
        * ``s_mat`` -- the 1x1 matrix ``[[sigma]]``.
        * ``v_col`` -- the right singular vector laid out as a column, i.e.
          one single-element row per column of ``data``.

    Note:
        Only the dominant triple is computed, and the sign of the vectors
        follows from the all-ones start vector.
    """
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    gram_matrix = mat_mult(transpose(centered_data), centered_data)

    singular_vec = [1.0] * len(gram_matrix[0])
    for _ in range(50):
        candidate = mat_mult([singular_vec], gram_matrix)[0]
        magnitude = sum(x ** 2 for x in candidate) ** 0.5
        if magnitude == 0.0:
            # The Gram matrix maps the current vector to zero (e.g. constant
            # data). Keep a unit-length vector instead of dividing by zero.
            norm = dot_prod(singular_vec, singular_vec) ** 0.5
            singular_vec = scalar_mult(1.0 / norm, singular_vec)
            break
        singular_vec = scalar_mult(1.0 / magnitude, candidate)

    # sigma = ||X v||, using every row rather than only the first one.
    projected = [dot_prod(row, singular_vec) for row in centered_data]
    singular_val = dot_prod(projected, projected) ** 0.5

    # u = X v / sigma; undefined when sigma is 0, so report zeros there.
    if singular_val > 0.0:
        left_vec = scalar_mult(1.0 / singular_val, projected)
    else:
        left_vec = [0.0] * len(projected)

    u_mat = [left_vec]
    v_mat = [singular_vec]
    s_mat = [[singular_val]]

    return u_mat, s_mat, transpose(v_mat)

def _leading_eigenvectors(sym_matrix, count, iterations=50):
    """Return up to ``count`` unit eigenvectors of a symmetric PSD matrix.

    Vectors are found one at a time by power iteration. After each one its
    contribution is subtracted from a working copy of the matrix
    (deflation), and later iterates are kept orthogonal to the vectors
    already found. Fewer than ``count`` vectors are returned when the
    matrix has fewer rows than ``count`` or the deflated matrix becomes
    exactly zero.
    """
    dim = len(sym_matrix)
    work = [list(row) for row in sym_matrix]
    found = []

    def _image(vec):
        """Multiply ``vec`` by ``work``; drop components along found axes."""
        image = mat_mult([vec], work)[0]
        for axis in found:
            image = subtract_vecs(
                image, scalar_mult(dot_prod(image, axis), axis)
            )
        return image, dot_prod(image, image) ** 0.5

    for _ in range(min(count, dim)):
        # Try a ramp vector first, then each basis vector. If the matrix
        # maps all of them to zero there is nothing left to extract.
        starts = [[float(i + 1) for i in range(dim)]]
        starts.extend(
            [float(i == j) for i in range(dim)] for j in range(dim)
        )
        vec = next(
            (start for start in starts if _image(start)[1] > 0.0), None
        )
        if vec is None:
            break

        for _round in range(iterations):
            image, magnitude = _image(vec)
            if magnitude == 0.0:
                break
            vec = scalar_mult(1.0 / magnitude, image)

        # Rayleigh quotient of the unit vector, then deflate the matrix.
        eigenvalue = dot_prod(mat_mult([vec], work)[0], vec)
        for i in range(dim):
            for j in range(dim):
                work[i][j] -= eigenvalue * vec[i] * vec[j]
        found.append(vec)

    return found


def perform_pca(data, num_components):
    """Project ``data`` onto its leading principal components.

    The column means are subtracted from ``data``. The principal axes are
    estimated as eigenvectors of the scatter matrix ``X^T X`` of the
    centred matrix ``X``, using power iteration with deflation.

    Args:
        data: Non-empty list of equally long rows of numbers (one row per
            observation, one column per feature).
        num_components: Number of principal components to keep.

    Returns:
        A list with one row per observation. Each row holds the coordinates
        of that centred observation along the principal axes that were
        found: at most ``num_components`` values, and never more than the
        number of features.

    Note:
        The sign of each axis, and therefore of each projected column, is
        arbitrary.
    """
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    scatter_matrix = mat_mult(transpose(centered_data), centered_data)
    principal_axes = _leading_eigenvectors(scatter_matrix, num_components)

    projected_data = [
        [dot_prod(row, axis) for axis in principal_axes]
        for row in centered_data
    ]

    return projected_data

def _rows_to_floats(attrs, data, missing=-1.0):
    """Convert parsed ARFF rows into lists of floats.

    Nominal cells become the index of their value in the attribute's value
    list. Numeric cells are parsed with ``float``. A cell that cannot be
    converted (unknown nominal value, non-numeric text) becomes ``missing``.

    Raises:
        ValueError: If a row does not hold exactly one cell per attribute.
    """
    # Build each nominal lookup table once instead of once per row. Keys are
    # stripped so that "a, b" style declarations still match the cells.
    encoders = []
    for _name, attr_type, attr_values in attrs:
        if attr_type == 'nominal':
            encoders.append(
                {value.strip(): index
                 for index, value in enumerate(attr_values)}
            )
        else:
            encoders.append(None)

    numeric_rows = []
    for row_number, row in enumerate(data, start=1):
        if len(row) != len(encoders):
            raise ValueError(
                "data row {} has {} cells, expected {}".format(
                    row_number, len(row), len(encoders)
                )
            )

        converted = []
        for cell, encoder in zip(row, encoders):
            if encoder is not None:
                # Unknown values previously became None and crashed float().
                converted.append(float(encoder.get(cell.strip(), missing)))
                continue
            try:
                converted.append(float(cell))
            except (TypeError, ValueError):
                converted.append(missing)
        numeric_rows.append(converted)

    return numeric_rows


def main():
    """Run PCA and the rank-1 SVD on every dataset file and print results.

    Each ARFF file is parsed, its rows are converted to floats, and the PCA
    projection (3 components) and SVD matrices are printed under a header
    naming the dataset.
    """
    dataset_files = ['.//DATASET//2017.arff','.//DATASET//2018.arff','.//DATASET/2019.arff','.//DATASET//2020.arff','./DATASET/2021 Q1.arff']
    num_comp_pca = 3

    for file_path in dataset_files:
        dataset_label = file_path.split('/')[-1].split('.')[0]

        attrs, data = parse_arff(file_path)
        data_as_list = _rows_to_floats(attrs, data)

        # Analyse every dataset, not just the last one that was parsed.
        print("\nDataset: {}".format(dataset_label))
        if not data_as_list:
            print("No data rows found; skipping.")
            continue

        projected_data_pca = perform_pca(data_as_list, num_comp_pca)

        print("\nProjected Data (PCA):")
        for row in projected_data_pca:
            print(row)

        u_svd, s_svd, v_t_svd = svd(data_as_list)

        print("\nMatrix U (SVD):")
        for row in u_svd:
            print(row)

        print("\nMatrix S (SVD):")
        for row in s_svd:
            print(row)

        print("\nMatrix V^T (SVD):")
        for row in v_t_svd:
            print(row)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: './/DATASET//2017.arff'

In [64]:
def read_arff(file_path):
    """Read attribute names and raw data rows from an ARFF file.

    Blank lines and lines starting with ``%`` are ignored. Before the
    ``@data`` marker, the second whitespace-separated token of every
    ``@attribute`` line is collected as an attribute name. After it, every
    line is split on commas into one row. Keywords are matched
    case-insensitively.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(attributes, data)``: the list of attribute names and the
        list of rows, each a list of cell strings.
    """
    attributes, data = [], []
    in_data_section = False

    with open(file_path, 'r') as file:
        for raw in file:
            text = raw.strip()

            if text == '' or text.startswith('%'):
                continue

            if in_data_section:
                data.append(text.split(','))
                continue

            directive = text.lower()
            if directive.startswith('@attribute'):
                attributes.append(text.split()[1])
            elif directive.startswith('@data'):
                in_data_section = True

    return attributes, data
def truncate_string(input_str, max_length):
    """Shorten ``input_str`` to ``max_length`` characters plus ``'...'``.

    Strings that already fit within ``max_length`` are returned unchanged.
    """
    fits = len(input_str) <= max_length
    return input_str if fits else input_str[:max_length] + '...'
# Print the first seven attribute names as the table header.
# NOTE(review): `attrib` and `data` are not defined in this cell; presumably
# they hold the result of an earlier `read_arff(...)` call -- confirm.
print("| {:<5} | {:<16} | {:<5} | {:<5} | {:<5} | {:<5} | {:<5} |".format(*attrib[:7]))

# Print the first 20 rows. The template has seven fields, so pass exactly
# seven cells per row, matching the header above.
for row in data[:20]:
    print("| {:<5} | {:<16} | {:<5} | {:<5} | {:<5} | {:<5} | {:<5} |".format(*row[:7]))

| Num   | Country          | X1    | X2    | X3    | X4    | X5    |
| 10    | Hungary          | m     | m     | m     | m     | m     |
| 22    | Poland           | -0.03 | 0.58  | -0.03 | 0.85  | 0.02  |
| 27    | Hungary          | m     | m     | m     | m     | m     |
| 73    | Poland           | 0.01  | 0.71  | 0.08  | 1.13  | 0.11  |
| 74    | Poland           | -0.13 | 1.1   | -0.43 | 0.27  | -0.05 |
| 100   | Poland           | 0     | 0     | 0     | 0     | 0     |
| 139   | Poland           | 0.07  | 0.63  | 0.18  | 1.32  | 0.08  |
| 142   | 'Czech Republic' | 0     | 0.19  | 0.13  | 2.88  | 0.44  |
| 175   | Poland           | 0.06  | 0.52  | 0.05  | 1.14  | 0.05  |
| 217   | Poland           | 0     | 0     | 0     | 0     | 0     |
| 257   | Hungary          | m     | m     | m     | m     | m     |
| 270   | Poland           | 0.02  | 0.65  | 0.03  | 1.06  | 0.02  |
| 276   | Poland           | 0.23  | 0.52  | 0.32  | 1.79  | 0.23  |
| 290   | Poland           | 0.22 