# CS 3101 Discrete Structures 3 Pre-Final Exam
Joss Chary Borj M. Ecleo <br>
BS Computer Science 3

### Principal Component Analysis
#### FROM SCRATCH

In [11]:
import os
import numpy as np

def read_arff(file_path):
    data_started = False
    attributes = []
    data = []
    nominal_mappings = []
    
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attributes.append((attr_name, 'nominal', values))
                    attribute_info = ((attr_name, 'nominal', values))
                    attr_name, attr_type, attr_values = attribute_info
                    nominal_mappings.append({value: index for index, value in enumerate(attr_values)})

                else:
                    attributes.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data.append(line.split(','))

    return attributes, data

def standardize_matrix(matrix):
    means = np.mean(matrix, axis=0)
    std_devs = np.std(matrix, axis=0)
    return (matrix - means) / std_devs

def calculate_covariance_matrix(matrix):
    return np.cov(matrix, rowvar=False)

def normalize_vector(vector):
    norm = np.linalg.norm(vector)
    return vector / norm

def matrix_multiply(matrix, vector):
    return np.dot(matrix, vector)

def compute_eigenvalues_and_eigenvectors(matrix, num_simulations=1000):
    eigenvalues, eigenvectors = np.linalg.eig(matrix)

    # Sorting eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Taking only the top 'num_simulations' eigenvalues and eigenvectors
    eigenvalues = eigenvalues[:num_simulations]
    eigenvectors = eigenvectors[:, :num_simulations]

    return eigenvalues, eigenvectors

def apply_transformation(matrix, eigenvectors, k):
    return np.dot(matrix, eigenvectors[:, :k])

def perform_pca(matrix, k):
    standardized_matrix = standardize_matrix(matrix)
    cov_matrix = calculate_covariance_matrix(standardized_matrix)
    eigenvalues, eigenvectors = compute_eigenvalues_and_eigenvectors(cov_matrix)
    transformed_matrix = apply_transformation(standardized_matrix, eigenvectors, k)
    return transformed_matrix

if __name__ == "__main__":
    base_path = '/Users/jossecleo/Documents/V4 data'
    file_names = [
        '2017.arff',
        '2018.arff',
        '2019.arff',
        '2020.arff',
        '2021 Q1.arff'
    ]

    file_paths = [os.path.join(base_path, file_name) for file_name in file_names]

    all_data = []
    for file in file_paths:
        attributes, raw_data = read_arff(file)
        for row in raw_data:
            for i in range(len(attributes)):
                attr_name, attr_type, attr_values = attributes[i]
                if attr_type == 'nominal':
                    nominal_mapping = {value: index for index, value in enumerate(attr_values)}
                    row[i] = nominal_mapping.get(row[i])
                elif attr_type == 'numeric':
                    try:
                        row[i] = float(row[i])
                    except Exception as e:
                        row[i] = 0
        for d in raw_data:
            all_data.append(d)

    num_components = 2
    transformed_data = perform_pca(all_data, num_components)

    # print("Original Data:")
    # for sample in all_data:
    #    print(sample)
    print(f"\nTransformed Data Size: {len(transformed_data)}")
    print("\nTransformed Data (after PCA):")
    for sample in transformed_data:
        print(sample)


Transformed Data Size: 2250

Transformed Data (after PCA):
[-3.75929947  0.22842648]
[-3.87323055  0.23078719]
[-3.6960769   0.22066712]
[-3.93003109  0.23062655]
[-4.01207289  0.23472451]
[-4.01047934  0.23325511]
[-3.74557269  0.20604003]
[-3.83693241  0.2354584 ]
[-3.88063479  0.21687224]
[-2.19770484  0.1489538 ]
[-3.90929411  0.22389944]
[-3.90624437  0.21333341]
[-3.99969227  0.22330841]
[-3.99883421  0.22251719]
[-3.87254962  0.21135634]
[-3.87917951  0.23682682]
[-3.90039055  0.20840494]
[-3.72210427  0.19587485]
[-3.84036347  0.2264502 ]
[-3.75715121  0.19880585]
[-3.80768388  0.13175845]
[-3.8751796   0.22156423]
[-3.78182587  0.20998166]
[-3.78794696  0.21356335]
[-3.83226439  0.21030733]
[-3.77305381  0.21101963]
[-3.80917084  0.21370197]
[-3.79788743  0.20991073]
[-3.95279406  0.22216069]
[-3.7118527   0.19935874]
[-3.79077616  0.21010874]
[-3.85380961  0.21383757]
[-3.97277834  0.22577546]
[-3.74923698  0.2081106 ]
[-3.95113923  0.22063478]
[-3.76706249  0.20250964]
[-3.

### Principal Component Analysis
#### USING LIBRARIES

In [10]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def parse_arff(file_path):
    data_started = False
    attrs = []
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()
                attr_type = parts[2].strip()

                if attr_type.startswith('{'):
                    values = [v.strip('\'') for v in parts[3:]]
                    attrs.append((attr_name, 'nominal', values))
                else:
                    attrs.append((attr_name, 'numeric'))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')
                data.append(data_line)

    return attrs, data

def preprocess_data(attrs, data):
    label_encoders = []
    for i in range(len(attrs)):
        attr_info = attrs[i]
        
        if len(attr_info) == 3 and attr_info[1] == 'nominal':
            attr_name, attr_type, attr_values = attr_info
            label_encoder = LabelEncoder()
            data[:, i] = label_encoder.fit_transform(data[:, i])
            label_encoders.append((i, label_encoder))
        elif len(attr_info) == 2 and attr_info[1] == 'numeric':
            attr_name, attr_type = attr_info
            data[:, i] = np.where(np.char.isnumeric(data[:, i]), data[:, i], np.nan)
            data[:, i] = data[:, i].astype(float)

    return data, label_encoders
    
def process_dataset(file_path):
    dataset_label = file_path.split('/')[-1].split('.')[0]
    print(f"\nProcessing dataset: {dataset_label}")

    attrs, data = parse_arff(file_path)

    data = np.array(data)
    data, label_encoders = preprocess_data(attrs, data)

    df = pd.DataFrame(data, columns=[attr[0] for attr in attrs])

    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    num_components = 3
    pca = PCA(n_components=num_components)
    projected_data = pca.fit_transform(df_imputed.drop(columns=['S']))

    print("\nProjected Data in scikit-learn:")
    print(pd.DataFrame(projected_data, columns=[f'PC{i+1}' for i in range(num_components)]))

    svd = TruncatedSVD(n_components=num_components)
    svd_result = svd.fit_transform(df_imputed.drop(columns=['S']))

    print("\nSVD Result in scikit-learn:")
    print(pd.DataFrame(svd_result, columns=[f'Component{i+1}' for i in range(num_components)]))

def main():
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    file_names = [
        '2017.arff',
        '2018.arff',
        '2019.arff',
        '2020.arff',
        '2021 Q1.arff'
    ]
    # file_paths = ['./v4-data/2017.arff']
    for file_path in file_names:
        process_dataset(file_path)

if __name__ == "__main__":
    main()


Processing dataset: 2017

Projected Data in scikit-learn:
              PC1            PC2            PC3
0   -3.961987e+07  306196.159897   -5530.566238
1    2.230281e-05       0.148551      -0.109245
2    2.098723e-05       0.148199      -0.065342
3    1.943163e-05       0.093844      -0.141812
4   -3.961987e+07  -21648.460224   -4644.361342
5   -3.961987e+07  -21648.478090   -4644.345845
6   -1.123634e-01  -19756.683976      53.400055
7    2.863033e-02    5032.215207     -13.592332
8   -1.172538e-01  -20616.710246      55.673561
9    3.939242e-06       0.013237       0.002930
10  -3.961987e+07     210.177458   -4703.236608
11  -1.226013e-01  -21554.776938      58.315914
12  -3.961987e+07  -21648.599025   -4644.240946
13  -3.961987e+07  -21648.608645   -4644.232601
14  -1.217361e-01  -21401.845360      57.899802
15  -1.241380e-01  -21822.855383      59.172711
16  -2.013782e-05      -0.143660       0.091772
17   9.013685e-01  158479.606029    -428.579911
18   2.325719e-05       0.145

#### Conclusion: 
The results obtained from my custom program and those produced using the sklearn library exhibit discrepancies. This variance may stem from differences in the eigendecomposition process, particularly the sorting of eigenvalues and eigenvectors in a descending order. Another potential source of disparity could be the covariance calculation; inaccuracies in this step may lead to deviations in the eigendecomposition results.

Moreover, the differences between my custom implementation and sklearn are multifaceted, involving variations in algorithms, functions, optimization techniques, and data preprocessing steps. Notably, my approach does not incorporate data centering, while sklearn's SVD computes a reduced-rank approximation of the original matrix. Additionally, my implementation computes the complete SVD, in contrast to skle

### Singular Value Decomposition
#### FROM SCRATCH

In [15]:
import os
import numpy as np

def read_arff(file_path):
    data_started = False
    attributes = []
    data = []
    nominal_mappings = []
    
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attributes.append((attr_name, 'nominal', values))
                    attribute_info = ((attr_name, 'nominal', values))
                    attr_name, attr_type, attr_values = attribute_info
                    nominal_mappings.append({value: index for index, value in enumerate(attr_values)})

                else:
                    attributes.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data.append(line.split(','))

    return attributes, data

def standardize_matrix(matrix):
    means = np.mean(matrix, axis=0)
    std_devs = np.std(matrix, axis=0)
    return (matrix - means) / std_devs

def calculate_covariance_matrix(matrix):
    return np.cov(matrix, rowvar=False)

def normalize_vector(vector):
    norm = np.linalg.norm(vector)
    return vector / norm

def svd(matrix):
    # Step 1: Compute the covariance matrix
    covariance_matrix = np.dot(matrix.T, matrix)

    # Step 2: Compute the eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

    # Sorting eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Step 3: Compute the singular values and singular vectors
    singular_values = np.sqrt(eigenvalues)
    singular_vectors = np.dot(matrix, eigenvectors) / singular_values

    return singular_values, singular_vectors

def apply_svd(matrix, k):
    # Standardize the matrix
    standardized_matrix = standardize_matrix(matrix)

    # Perform SVD
    singular_values, singular_vectors = svd(standardized_matrix)

    # Select the top 'k' singular values and vectors
    singular_values = singular_values[:k]
    singular_vectors = singular_vectors[:, :k]

    # Normalize the singular vectors
    normalized_singular_vectors = np.apply_along_axis(normalize_vector, 0, singular_vectors)

    # Project the original matrix onto the singular vectors
    transformed_matrix = np.dot(standardized_matrix, normalized_singular_vectors)

    return transformed_matrix

if __name__ == "__main__":
    base_path = '/Users/jossecleo/Documents/V4 data'
    file_names = [
        '2017.arff',
        '2018.arff',
        '2019.arff',
        '2020.arff',
        '2021 Q1.arff'
    ]

    file_paths = [os.path.join(base_path, file_name) for file_name in file_names]

    all_data = []
    for file in file_paths:
        attributes, raw_data = read_arff(file)
        for row in raw_data:
            for i in range(len(attributes)):
                attr_name, attr_type, attr_values = attributes[i]
                if attr_type == 'nominal':
                    nominal_mapping = {value: index for index, value in enumerate(attr_values)}
                    row[i] = nominal_mapping.get(row[i])
                elif attr_type == 'numeric':
                    try:
                        row[i] = float(row[i])
                    except Exception as e:
                        row[i] = 0
        for d in raw_data:
            all_data.append(d)

    num_components = 2
    transformed_data = apply_svd(all_data, num_components)

    print(f"\nTransformed Data Size: {len(transformed_data)}")
    print("\nTransformed Data (after SVD):")
    for sample in transformed_data:
        print(sample)

ValueError: shapes (2250,85) and (2250,2) not aligned: 85 (dim 1) != 2250 (dim 0)

### Singular Value Decomposition
#### USING LIBRARIES