# Chapter 4: Chi-Square Distance and Inertia

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

In [2]:
# Load the data
data_path = "../data/readers.xls"
df = pd.read_excel(data_path, index_col=0)

# Step 1: Calculate Chi2 statistic and inertia
row_sum = df.sum(axis=1)
col_sum = df.sum(axis=0)
total_sum = df.values.sum()

# Expected frequencies table
expected_table = np.outer(row_sum, col_sum) / total_sum

# Chi2 statistic
chi2_stat = np.sum((df.values - expected_table) ** 2 / expected_table)
inertia = chi2_stat / total_sum

print(f"Chi2 Statistic: {chi2_stat}")
print(f"Inertia (Chi2 / total sum): {inertia}")

# Step 2: Calculate row profiles
table_pro = df.div(df.sum(axis=1), axis=0)

# Step 3: Calculate column masses
table_col_mass = df.sum(axis=0) / df.values.sum()

# Step 4: Calculate Chi2 distance for a specific row (example: E5)
row_label = 'E5'
chi2_distance_E5 = np.sum((table_pro.loc[row_label] - table_col_mass) ** 2 / table_col_mass)
print(f"Chi2 Distance for E5: {chi2_distance_E5}")

# Step 5: Calculate Chi2 distances for all rows (profiles)
chi2_distances = np.sum((table_pro - table_col_mass) ** 2 / table_col_mass, axis=1)
print("Chi2 Distances for all profiles:")
print(chi2_distances)

# Step 6: Calculate pairwise Chi2 distances between all rows, including the average
average_profile = pd.Series(table_col_mass, name='Average')
table_pro_with_avg = pd.concat([table_pro, average_profile.to_frame().T])

# Scale each value by the sqrt of the column masses
table_pro_scaled = table_pro_with_avg.div(np.sqrt(table_col_mass), axis=1)

# Calculate pairwise distances using the Euclidean metric
dist_matrix = pdist(table_pro_scaled, metric='euclidean')
dist_matrix = squareform(dist_matrix)

# Convert to a DataFrame for better readability
distance_df = pd.DataFrame(dist_matrix, index=table_pro_with_avg.index, columns=table_pro_with_avg.index)

print("Pairwise Chi2 Distance Matrix:")
print(distance_df)

Chi2 Statistic: 25.97724142102321
Inertia (Chi2 / total sum): 0.08326038916994619
Chi2 Distance for E5: 0.1859164906900525
Chi2 Distances for all profiles:
E1    0.353360
E2    0.117023
E3    0.027392
E4    0.039438
E5    0.185916
dtype: float64
Pairwise Chi2 Distance Matrix:
               E1        E2        E3        E4        E5   Average
E1       0.000000  0.373700  0.635251  0.791942  1.000805  0.594441
E2       0.373700  0.000000  0.469615  0.506557  0.770364  0.342087
E3       0.635251  0.469615  0.000000  0.259140  0.370357  0.165506
E4       0.791942  0.506557  0.259140  0.000000  0.284528  0.198591
E5       1.000805  0.770364  0.370357  0.284528  0.000000  0.431180
Average  0.594441  0.342087  0.165506  0.198591  0.431180  0.000000
