# Chapter 7: Optimal Scaling

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the Excel workbook holding the health data, relative to this notebook
data_path = "../data/health.xls"

# Read the workbook into a DataFrame, using its first column as the row index,
# then show the resulting table
health = pd.read_excel(io=data_path, index_col=0)
print(health)

        VG    G    R    B  VB
16-24  243  789  167   18   6
25-34  220  809  164   35   6
35-44  147  658  181   41   8
45-54   90  469  236   50  16
55-64   53  414  306  106  30
65-74   44  267  284   98  20
75+     20  136  157   66  17


In [3]:
# Steps 1-4: min-max rescale the first column of the table to the interval [0, 100].
# NOTE(review): health.iloc[:, 0] selects the first column of the raw table
# (the "VG" column in the table printed above), not first-dimension
# correspondence-analysis coordinates; those are only computed further down
# (health_csc) -- confirm this cell is intended.
first_dimension = health.iloc[:, 0].to_numpy()
first_dimension_min = np.min(first_dimension)

# Spread of the column: largest value minus smallest value
health_range = np.max(first_dimension) - first_dimension_min

# scaled_value = (value - min_value) * 100 / range,
# so the smallest entry maps to 0 and the largest to 100
health_scale = (first_dimension - first_dimension_min) * 100 / health_range

# Show the rescaled values
print("Rescaled Coordinates (adapted from Greenacre's R code):")
print(health_scale)

Rescaled Coordinates (adapted from Greenacre's R code):
[100.          89.68609865  56.95067265  31.39013453  14.79820628
  10.76233184   0.        ]


In [4]:
# Step 5: Correspondence matrix P -- every cell of the table divided by the grand total
health_sum = health.to_numpy().sum()
health_P = health.div(health_sum)

# Step 6: Marginal totals of P
health_r = health_P.sum(axis=1)  # row sums
health_c = health_P.sum(axis=0)  # column sums

# Step 7: Diagonal matrices of the marginals and of their inverse square roots
health_Dr = np.diag(health_r)
health_Dc = np.diag(health_c)
health_Drmh = np.diag(1 / np.sqrt(health_r))  # Dr^(-1/2)
health_Dcmh = np.diag(1 / np.sqrt(health_c))  # Dc^(-1/2)

# Step 8: Centered and standardized matrix
# S = Dr^(-1/2) * (P - r c^T) * Dc^(-1/2), where r c^T is the outer product of the marginals
health_P_matrix = health_P.to_numpy()
health_residuals = health_P_matrix - np.outer(health_r, health_c)
health_S = np.dot(health_Drmh, health_residuals).dot(health_Dcmh)

# Step 9: Singular value decomposition of S
# U holds the left singular vectors, D the singular values, and the rows of Vt
# the right singular vectors
U, D, Vt = np.linalg.svd(health_S, full_matrices=False)

# Step 10: Column standard coordinates on the first dimension:
# Dc^(-1/2) applied to the first right singular vector (row 0 of Vt)
health_csc = health_Dcmh.dot(Vt[0])

# Step 11: Show the first-dimension scale values of the health categories
print("Optimal scale values (coordinates of health categories in the first dimension):")
print(health_csc)

Optimal scale values (coordinates of health categories in the first dimension):
[-1.14412143 -0.53665231  1.18795829  2.0428549   2.07602064]


In [5]:
# Expected first-dimension value for each health category,
# keyed by the column labels of the table
expected_values = {
    'VG': 1.144,   # Very good
    'G': 0.537,    # Good
    'R': -1.188,   # Regular
    'B': -2.043,   # Bad
    'VB': -2.076,  # Very bad
}

# Print every calculated coordinate next to its expected value;
# the coordinates are in the same order as the table's columns
for category, calculated in zip(health.columns, health_csc):
    print(f"{category}: Calculated: {calculated:.3f}, Expected: {expected_values[category]}")

VG: Calculated: -1.144, Expected: 1.144
G: Calculated: -0.537, Expected: 0.537
R: Calculated: 1.188, Expected: -1.188
B: Calculated: 2.043, Expected: -2.043
VB: Calculated: 2.076, Expected: -2.076


In [6]:
# Step 12: Orient the first-dimension scale like the expected (book) values
# The sign of a singular vector is arbitrary and can differ between
# linear-algebra backends, so an unconditional negation is not reproducible.
# Flip the coordinates only when the first category (VG) scores below the last
# one (VB), so that "very good" always ends up at the high end of the scale.
# The name health_csc_inverted is kept for compatibility even when no flip is needed.
orientation = -1.0 if health_csc[0] < health_csc[-1] else 1.0
health_csc_inverted = orientation * health_csc

# Step 13: Label the oriented values with the health categories
# (positions follow the column order of the table: VG, G, R, B, VB)
health_optimal = pd.Series({
    'Muy buena': health_csc_inverted[0],  # Corresponding to VG (Very Good)
    'Buena': health_csc_inverted[1],      # Corresponding to G (Good)
    'Regular': health_csc_inverted[2],    # Corresponding to R (Regular)
    'Mala': health_csc_inverted[3],       # Corresponding to B (Bad)
    'Muy mala': health_csc_inverted[4]    # Corresponding to VB (Very Bad)
})

# Step 14: Calculate the range of the optimal scale
# Range = max value - min value
# NOTE: this reassigns health_range, which an earlier cell also defines for a
# different quantity; from here on it refers to the optimal scale.
health_optimal_min = health_optimal.min()
health_range = health_optimal.max() - health_optimal_min

# Step 15: Transform the optimal scale values to fit between 0 and 100
# Formula: transformed_value = ((value - min_value) * 100) / range
health_transformed = (health_optimal - health_optimal_min) * 100 / health_range

In [7]:
# Step 16: Show the 0-100 version of the optimal scale, preceded by its label
print(
    "Optimal scale values transformed between 0 and 100 (with corrected signs):",
    health_transformed,
    sep="\n",
)

Optimal scale values transformed between 0 and 100 (with corrected signs):
Muy buena    100.000000
Buena         81.135332
Regular       27.578359
Mala           1.029946
Muy mala       0.000000
dtype: float64
