In [2]:
import numpy as np
import pandas as pd

# Load the dataset from the Excel file
data_path = "../data/health.xls"
health = pd.read_excel(data_path, index_col=0)

# Step 1: Compute the probability matrix (row profiles)
health_sum = health.values.sum()
health_P = health / health_sum  # Relative frequency matrix

# Step 2: Compute row and column marginal sums
health_r = health_P.sum(axis=1)  # Row sums
health_c = health_P.sum(axis=0)  # Column sums

# Step 3: Compute diagonal matrices
health_Dr = np.diag(health_r)  # Diagonal matrix of row sums
health_Dc = np.diag(health_c)  # Diagonal matrix of column sums
health_Drmh = np.diag(1 / np.sqrt(health_r))  # Inverse sqrt of row sums
health_Dcmh = np.diag(1 / np.sqrt(health_c))  # Inverse sqrt of column sums

# Step 4: Compute the centered matrix (S matrix)
health_P_matrix = health_P.values  # Convert to NumPy array
health_S = np.dot(np.dot(health_Drmh, health_P_matrix - np.outer(health_r, health_c)), health_Dcmh)

# Step 5: Singular Value Decomposition (SVD)
U, D, Vt = np.linalg.svd(health_S, full_matrices=False)

# Step 6: Extract row and column standard coordinates (first dimension)
health_rsc = np.dot(health_Drmh, U[:, 0])  # Row standard coordinates
health_csc = np.dot(health_Dcmh, Vt.T[:, 0])  # Column standard coordinates

# Step 7: Compute canonical correlation
health_cor = np.dot(health_rsc.T, np.dot(health_P_matrix, health_csc))

# Step 8: Compute squared correlation
health_cor_squared = health_cor ** 2

# Step 9: Compute inertia of the first dimension
health_inertia_dim1 = np.dot(health_rsc.T, np.dot(health_Dr, health_rsc))

# Print results
print("Canonical Correlation:", health_cor)
print("Squared Correlation (represents variance explained by first dimension):", health_cor_squared)
print("Inertia of the First Dimension:", health_inertia_dim1)

Canonical Correlation: 0.36959852502491086
Squared Correlation (represents variance explained by first dimension): 0.13660306970058966
Inertia of the First Dimension: 0.9999999999999997
