## HuReTEx PCA 0.01 (2025.08.03)

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def perform_pca(data_matrix, info_percentage):
    """
    Standardizes the numeric columns of the given matrix and reduces them with PCA,
    keeping the smallest number of principal components whose cumulative
    explained-variance ratio reaches the requested share of information.

    Arguments:
    - data_matrix (array-like or DataFrame): data matrix; non-numeric columns are ignored
    - info_percentage (float): value from 0 to 1 indicating how much information to preserve (e.g., 0.95)

    Returns:
    - X_pca: data matrix after PCA (reduced)
    - components: matrix of principal component vectors

    Raises:
    - ValueError: if info_percentage is not within [0, 1] (NaN included),
      or if the input contains no numeric data
    """
    # Validate up front: an out-of-range threshold would otherwise silently
    # yield a single component. The negated chained comparison also rejects NaN.
    if not (0 <= info_percentage <= 1):
        raise ValueError(
            f"info_percentage must be between 0 and 1, got {info_percentage!r}."
        )

    # Accept any array-like (ndarray, nested lists, ...), not only ndarray/DataFrame.
    # No defensive copy is needed because nothing below mutates the frame.
    if isinstance(data_matrix, pd.DataFrame):
        df = data_matrix
    else:
        df = pd.DataFrame(data_matrix)

    # Remove non-numeric columns
    df_numeric = df.select_dtypes(include=[np.number])
    if df_numeric.empty:
        raise ValueError("No numeric data found.")

    # Standardization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_numeric)

    # PCA without dimensionality restriction - used to analyze variance
    pca_full = PCA()
    pca_full.fit(X_scaled)

    # Calculate how many components are needed
    cumulative = np.cumsum(pca_full.explained_variance_ratio_)
    reached = cumulative >= info_percentage
    if reached.any():
        # argmax on a boolean array gives the index of the first True
        n_components = int(np.argmax(reached)) + 1
    else:
        # Threshold never reached (e.g. 1.0 requested but the cumulative sum
        # rounds to 0.9999...): argmax would return 0 here, so keep everything.
        n_components = len(cumulative)

    # Perform dimensionality reduction
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    return X_pca, pca.components_