# Chapter 4: Chi-Square Distance and Inertia

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

In [2]:
# Function to calculate chi2 statistic and inertia
def calculate_chi2(table):
    """
    Calculate the Chi-square statistic and inertia for the table.
    
    Parameters:
    table (pd.DataFrame): The contingency table with observed frequencies.

    Returns:
    tuple: Chi2 statistic and inertia (chi2 / total sum of the table).
    """
    row_sum = table.sum(axis=1)
    col_sum = table.sum(axis=0)
    total_sum = table.values.sum()

    # Expected frequencies table
    expected_table = np.outer(row_sum, col_sum) / total_sum

    # Chi2 statistic
    chi2_stat = np.sum((table.values - expected_table) ** 2 / expected_table)
    inertia = chi2_stat / total_sum

    return chi2_stat, inertia

In [3]:
# Function to calculate chi2 distance for a specific profile row
def calculate_chi2_distance(table_pro, table_col_mass, row_label):
    """
    Calculate the Chi-square distance for a specific profile row from the centroid.
    
    Parameters:
    table_pro (pd.DataFrame): Row profiles (normalized rows).
    table_col_mass (pd.Series): Column masses.
    row_label (str): Label of the row to calculate the chi2 distance for.

    Returns:
    float: Chi2 distance for the given row.
    """
    chi2_distance = np.sum((table_pro.loc[row_label] - table_col_mass) ** 2 / table_col_mass)
    return chi2_distance

In [4]:
# Function to calculate chi2 distances for all rows (profiles)
def calculate_chi2_distances(table_pro, table_col_mass):
    """
    Calculate the Chi-square distances for all rows (profiles) from the centroid.
    
    Parameters:
    table_pro (pd.DataFrame): Row profiles (normalized rows).
    table_col_mass (pd.Series): Column masses.

    Returns:
    pd.Series: Chi2 distances for all rows.
    """
    chi2_distances = np.sum((table_pro - table_col_mass) ** 2 / table_col_mass, axis=1)
    return chi2_distances

In [5]:
# Function to calculate pairwise chi2 distances between all rows including the average
def calculate_pairwise_distances(table_pro, table_col_mass):
    """
    Calculate pairwise Chi-square distances between all rows (profiles), including the average profile.
    
    Parameters:
    table_pro (pd.DataFrame): Row profiles (normalized rows).
    table_col_mass (pd.Series): Column masses.

    Returns:
    pd.DataFrame: A distance matrix of pairwise Chi-square distances.
    """
    # Add the column mass (average profile) to the profiles using pd.concat()
    average_profile = pd.Series(table_col_mass, name='Average')
    table_pro_with_avg = pd.concat([table_pro, average_profile.to_frame().T])

    # Scale each value by the sqrt of the column masses
    table_pro_scaled = table_pro_with_avg.div(np.sqrt(table_col_mass), axis=1)

    # Calculate pairwise distances using the Euclidean metric
    dist_matrix = pdist(table_pro_scaled, metric='euclidean')
    dist_matrix = squareform(dist_matrix)

    # Convert to a DataFrame for better readability
    distance_df = pd.DataFrame(dist_matrix, index=table_pro_with_avg.index, columns=table_pro_with_avg.index)
    
    return distance_df

In [6]:
# Main execution
# Define the relative path to the CSV file
data_path = "../data/lector_type.csv"

# Load the data into a DataFrame
table = pd.read_csv(data_path, index_col=0)

# Chi2 statistic and inertia
chi2_stat, inertia = calculate_chi2(table)
print(f"Chi2 Statistic: {chi2_stat}")
print(f"Inertia (Chi2 / total sum): {inertia}")

# Calculate row profiles
table_pro = table.div(table.sum(axis=1), axis=0)

# Column masses
table_col_mass = table.sum(axis=0) / table.values.sum()

# Chi2 distance for a specific row (example: E5)
chi2_distance_E5 = calculate_chi2_distance(table_pro, table_col_mass, 'E5')
print(f"Chi2 Distance for E5: {chi2_distance_E5}")

# Chi2 distances for all rows (profiles)
chi2_distances = calculate_chi2_distances(table_pro, table_col_mass)
print("Chi2 Distances for all profiles:")
print(chi2_distances)

# Pairwise Chi2 distances between all rows (profiles), including the average profile
chi2_distance_matrix = calculate_pairwise_distances(table_pro, table_col_mass)
print("Pairwise Chi2 Distance Matrix:")
print(chi2_distance_matrix)

Chi2 Statistic: 25.97724142102321
Inertia (Chi2 / total sum): 0.08326038916994619
Chi2 Distance for E5: 0.1859164906900525
Chi2 Distances for all profiles:
E1    0.353360
E2    0.117023
E3    0.027392
E4    0.039438
E5    0.185916
dtype: float64
Pairwise Chi2 Distance Matrix:
               E1        E2        E3        E4        E5   Average
E1       0.000000  0.373700  0.635251  0.791942  1.000805  0.594441
E2       0.373700  0.000000  0.469615  0.506557  0.770364  0.342087
E3       0.635251  0.469615  0.000000  0.259140  0.370357  0.165506
E4       0.791942  0.506557  0.259140  0.000000  0.284528  0.198591
E5       1.000805  0.770364  0.370357  0.284528  0.000000  0.431180
Average  0.594441  0.342087  0.165506  0.198591  0.431180  0.000000
