   Janciel Fidel M. Pedrano 21103748

In [2]:
import math
import matplotlib.pyplot as plt
from scipy.io import arff

# Custom function to handle ValueError and remove problematic rows
def load_arff_and_remove_invalid(file_path):
    with open(file_path, 'rt') as f:
        try:
            data, meta = arff.loadarff(f)
            df = pd.DataFrame(data)
        except ValueError as e:
            print(f"ValueError: {e}")
            invalid_values = set(str(e).split('value not in ')[1].replace("(", "").replace(")", "").replace("'", "").split(', '))
            
            # Print out the rows causing the error
            with open(file_path, 'rt') as f:
                lines = f.readlines()
                for i, line in enumerate(lines):
                    if any(val in line for val in invalid_values):
                        print(f"Row {i + 1}: {line.strip()}")

            df = pd.DataFrame()  # Create an empty DataFrame

    return df

# Load ARFF file and handle ValueError
file_path = r'C:\Users\PC\Desktop\Discrete Pre-Final\2021 Q1.arff'
df = load_arff_and_remove_invalid(file_path)

# Drop rows with NaN values
df = df.dropna()

# Check if DataFrame is empty
if df.empty:
    print("Empty DataFrame. Unable to load data.")
else:
    print(df.head())  # Display the first few rows of the DataFrame

    # Proceed with the analysis only if the DataFrame is not empty
    # Extract the numerical data from the DataFrame
    numeric_data = df.select_dtypes(include=[np.number]).values

    # Functions for PCA and SVD from scratch
    def mean(data):
        return sum(data) / len(data)

    def dot_product(vector1, vector2):
        return sum(x * y for x, y in zip(vector1, vector2))

    def matrix_transpose(matrix):
        return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]

    def matrix_multiply(matrix1, matrix2):
        return [[dot_product(row, col) for col in matrix_transpose(matrix2)] for row in matrix1]

    def pca_from_scratch(data, num_components):
        mean_vector = [mean(col) for col in zip(*data)]
        centered_data = [[x - mean_vector[i] for i, x in enumerate(row)] for row in data]

        cov_matrix = matrix_multiply(matrix_transpose(centered_data), centered_data)
        cov_matrix = [[x / (len(centered_data) - 1) for x in row] for row in cov_matrix]

        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
        eigenvectors = matrix_transpose([eigenvectors[i] for i in sorted_indices])

        principal_components = eigenvectors[:num_components]

        pca_result = matrix_multiply(centered_data, matrix_transpose(principal_components))

        return pca_result

    def svd_from_scratch(data):
        cov_matrix = matrix_multiply(matrix_transpose(data), data)
        cov_matrix = [[x / (len(data) - 1) for x in row] for row in cov_matrix]

        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
        eigenvectors = matrix_transpose([eigenvectors[i] for i in sorted_indices])

        singular_values = [math.sqrt(x) for x in eigenvalues]

        U = [[eigenvectors[i][j] / singular_values[j] for j in range(len(singular_values))] for i in range(len(data))]
        S = [[0] * len(data[0]) for _ in range(len(data))]
        for i in range(len(singular_values)):
            S[i][i] = singular_values[i]
        Vt = matrix_transpose(eigenvectors)

        return U, S, Vt

    # Apply PCA from scratch
    num_components = 2  # Adjust the number of components as needed
    pca_result_scratch = pca_from_scratch(numeric_data.tolist(), num_components)

    # Apply PCA using sklearn for comparison
    pca_sklearn = PCA(n_components=num_components)
    pca_result_sklearn = pca_sklearn.fit_transform(numeric_data)

    # Apply SVD from scratch
    U, S, Vt = svd_from_scratch(numeric_data.tolist())

    # Visualization (minimal plotting)
    plt.scatter(pca_result_scratch[:, 0], pca_result_scratch[:, 1], label='PCA from scratch')
    plt.scatter(pca_result_sklearn[:, 0], pca_result_sklearn[:, 1], label='PCA using sklearn')
    plt.title('PCA Results Comparison')
    plt.legend()
    plt.show()

ValueError: 'Czech Republic' value not in ('Hungary', 'Poland', 'Czech Republic', 'Slovakia')
Row 4: @attribute Country {Hungary,Poland,'Czech Republic',Slovakia}
Row 90: 10,Hungary,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,0,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,1
Row 91: 22,Poland,-0.01,0.59,-0.05,0.74,0.01,-0.01,0.71,0.14,0.41,-0.01,-0.04,0.12,-0.06,0.03,1.71,-0.02,-0.06,-0.01,-0.07,0.4,0.03,-4.55,-0.06,6.27,4.05,-0.02,0.82,0.26,0.14,0.14,0.28,0.81,1,0.16,-0.07,-0.42,0.61,-0.03,-0.25,0.23,0.19,0.48,0.94,-92197,-0.02,0.97,5.9,1.7,0.76,0.16,-0.58,-0.49,1.21,0.02,0.14,0.03,0.05,0.11,-0.79,0.15,-2.03,0,0.01,0,0,261.83,-13856.09,-3115.35,-10811.65,108.32,125.48,0.91,0.95,0.93,0.82,1.73,0.18,1.01,0.99,0.95,3824933,3.03,1
Row 92: 27,Hungary,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,0,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,m,1
Row 9

NameError: name 'pd' is not defined

In my code, I load an ARFF file and handle potential errors by addressing problematic rows and removing instances with NaN values. I proceed to manually implement Principal Component Analysis (PCA) and Singular Value Decomposition (SVD) without relying on external libraries like NumPy or scikit-learn. The PCA and SVD processes involve basic matrix operations, such as mean calculation, dot product, and matrix multiplication. To visualize the results, I use Matplotlib, allowing for a direct comparison between my manually implemented PCA and scikit-learn's implementation.

On the other hand, scikit-learn offers a more streamlined approach. I utilize the PCA class to perform PCA directly on the numerical data, with the framework handling NaN values internally during the fitting process. This implementation is more concise, abstracting away low-level details and providing a higher-level interface for PCA. While my manual implementation sacrifices some simplicity and performance, scikit-learn provides a more optimized and user-friendly solution for typical use cases. The choice between the two approaches depends on the specific requirements and constraints of the project, with scikit-learn being a more convenient and efficient option in most scenarios.

Firstly, the handling of NaN values might lead to differences. In my manual implementation, I explicitly remove rows with NaN values from the dataset using df = df.dropna(). On the other hand, scikit-learn's PCA class is designed to handle missing values internally during the fitting process. If there are NaN values in the data before the manual removal in my code, this could be a source of dissimilarity. Conversely, if the dataset is already clean, this step may not significantly impact the results.

Another aspect to consider is the method of covariance matrix calculation. In my manual PCA implementation, I calculate the covariance matrix using basic matrix operations, while scikit-learn might use a different algorithm or optimization techniques. Any differences in the calculation of the covariance matrix can contribute to variations in the final results.

Additionally, the sorting and selection of eigenvectors can cause differences. In my manual implementation, I sort eigenvectors based on eigenvalues, and then select the top ones. Scikit-learn may have its own method for selecting principal components, and the default behavior may differ.

It's also worth mentioning that numerical precision and floating-point arithmetic can introduce small variations in the results. If there are differences in how the two implementations handle numerical precision, it might lead to slight deviations.

To pinpoint the specific part of the code causing any observed deviation, it would be beneficial to conduct a detailed analysis, including printing intermediate results and comparing them step by step. This can help identify the specific points where the manual implementation diverges from scikit-learn's implementation. Additionally, exploring the impact of each step and parameter choice on the final results could provide insights into the observed similarities or dissimilarities.