# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Task 1: Scikit-learn PCA ---

# Step 1: Import modules and the dataset
# Please ensure your file is named 'data.csv' or change the filename below.
try:
    data = pd.read_csv("data.csv", names=["xi1", "xi2", "xi3", "xi4", "xi5"])
    print("--- Task 1: Scikit-learn PCA ---")
    print("Step 1: Importing modules and dataset")
    print("|| Data: " + str(data.shape) + " ||")

    # Step 2: Examine the dataset
    print("\nStep 2: Examining the dataset")
    print("Head of the data:")
    print(data.head())
    print("\nDescription of the data:")
    print(data.describe())

    # Step 3: Fitting PCA model with 5 components
    print("\nStep 3: Fitting PCA model with 5 components")
    pca = PCA(n_components=5)
    pca.fit(data)

    # Plotting Explained Variance Ratio
    print("\nPlotting Explained Variance Ratio...")
    plt.figure(figsize=(8, 6))
    plt.bar(range(1, 6), pca.explained_variance_ratio_, color='teal')
    plt.title("PCA Visualised")
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.xticks([1, 2, 3, 4, 5])
    plt.show()

    # Step 4: Inspecting explained_variance_ratio_
    print("\nStep 4: Inspecting explained_variance_ratio_")
    print(pca.explained_variance_ratio_)

    # Step 5: Inspecting components_
    print("\nStep 5: Inspecting components_")
    print(pca.components_)

    # Step 6: Projecting data to 2D space
    print("\nStep 6: Projecting data to 2D space")
    pca_2d = PCA(n_components=2)
    new_data = pca_2d.fit_transform(data)
    print("Shape of the new data is: " + str(new_data.shape))
    # We'll just print the first few rows for brevity
    print("Transformed data (first 5 rows):\n", new_data[:5])

    # Step 7: Plotting a scatter graph of the 2D data
    print("\nStep 7: Plotting a scatter graph of the 2D data")
    plt.figure(figsize=(8, 6))
    plt.scatter(new_data[:, 0], new_data[:, 1], color="blue", marker="x")
    plt.title("Dataset Projected to 2D")
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

    # --- Task 2: Custom PCA from scratch ---

    print("\n--- Task 2: Custom PCA from scratch ---")

    def implement_pca_from_scratch(data_df, n_components: int):
        """
        Implements PCA from scratch using NumPy.

        The data is mean-centred, its sample covariance matrix is
        eigendecomposed, and the centred data is projected onto the
        eigenvectors with the largest eigenvalues.

        :param data_df: pandas DataFrame (or any 2-D numeric array-like) with
            one row per sample and one column per feature.
        :param n_components: Number of leading components to project onto.
            Must satisfy ``1 <= n_components <= number of features``.
        :return: Tuple ``(projected_data, normalized_eigenvalues,
            sorted_eigenvectors)``. ``projected_data`` has shape
            ``(n_samples, n_components)``. ``normalized_eigenvalues`` holds the
            eigenvalues of all features divided by their sum, in descending
            order (NaN if the data has zero total variance).
            ``sorted_eigenvectors`` holds one eigenvector per column, in the
            same order.
        :raises ValueError: If the input is not 2-D, has fewer than two
            samples, contains NaN/inf, or ``n_components`` is out of range.
        """
        # Convert to a float array so DataFrames and plain array-likes both work.
        data_np = np.asarray(data_df, dtype=float)
        if data_np.ndim != 2:
            raise ValueError("data_df must be two-dimensional (samples x features)")

        n_samples, n_features = data_np.shape
        if n_samples < 2:
            raise ValueError("at least two samples are required to estimate the covariance")
        if not np.isfinite(data_np).all():
            raise ValueError("data_df must not contain NaN or infinite values")
        if not 1 <= n_components <= n_features:
            raise ValueError(
                f"n_components must be between 1 and {n_features}, got {n_components!r}"
            )

        # 1. Center the data
        centered_data = data_np - data_np.mean(axis=0)

        # 2. Calculate the covariance matrix. np.cov collapses the
        #    single-feature case to a 0-d array, so restore the 2-D shape.
        covariance_matrix = np.atleast_2d(np.cov(centered_data, rowvar=False))

        # 3. Calculate eigenvalues and eigenvectors. The covariance matrix is
        #    symmetric, so use eigh: it always returns real eigenvalues and
        #    orthonormal eigenvectors, whereas the general-purpose eig can
        #    produce complex output from round-off.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

        # 4. eigh returns eigenvalues in ascending order; reverse to descending.
        #    Round-off can make a zero eigenvalue slightly negative, so clip.
        sorted_eigenvalues = np.clip(eigenvalues[::-1], 0.0, None)
        sorted_eigenvectors = eigenvectors[:, ::-1]

        # The sign of an eigenvector is arbitrary. Flip each column so that its
        # largest-magnitude entry is positive, which makes the result
        # reproducible from run to run.
        pivot_rows = np.argmax(np.abs(sorted_eigenvectors), axis=0)
        pivot_signs = np.sign(sorted_eigenvectors[pivot_rows, np.arange(n_features)])
        sorted_eigenvectors = sorted_eigenvectors * pivot_signs

        # 5. Normalize the eigenvalues to get explained variance ratio
        normalized_eigenvalues = sorted_eigenvalues / np.sum(sorted_eigenvalues)

        # 6. Select the top n_components eigenvectors
        projection_matrix = sorted_eigenvectors[:, :n_components]

        # 7. Project the centered data
        projected_data = centered_data @ projection_matrix

        return projected_data, normalized_eigenvalues, sorted_eigenvectors

    # Run the custom PCA function for Task 2
    print("\nStep 8: Creating our own PCA using NumPy.")
    projected_data_custom, normalized_eigenvalues_custom, sorted_eigenvectors_custom = implement_pca_from_scratch(data, n_components=5)

    print("\nCustom PCA - Explained Variance Ratio:")
    print(normalized_eigenvalues_custom)

    print("\nCustom PCA - Eigenvectors (components):")
    # NumPy's eig function may return eigenvectors with an arbitrary sign,
    # so we'll compare them carefully.
    print(sorted_eigenvectors_custom)

except FileNotFoundError:
    print("Error: The file 'data.csv' was not found. Please ensure the correct file is in the working directory.")