# Linear Discriminant Analysis Demo

This notebook demonstrates how to build, visualize, and analyze linear discriminant models for analysis.

**What you'll learn:**
- Build linear discriminant models from data
- Visualize 2D linear discriminant lines

# Setup

In [1]:
import cuanalytics
import numpy as np
import pandas as pd

df = cuanalytics.load_iris_data()
# print(df.head())
# Extract rows pertaining to the species "Setosa" and "Versicolor"
df_subset = df[(df['species'] == 'Iris-setosa') | (df['species'] == 'Iris-versicolor')]
df_subset = df_subset[['sepal_width', 'petal_width', 'species']]
# Fit LDA (returns LDAModel object)
lda = cuanalytics.fit_lda(df, formula='species ~ .')
# lda.summary()
# lda.visualize()
# X_transformed = lda.transform(df_subset)
# y = df_subset['species'].values

# # Find the threshold that separates the classes
# # It's between the max setosa score and min versicolor score
# setosa_scores = X_transformed[y == 'Iris-setosa']
# versicolor_scores = X_transformed[y == 'Iris-versicolor']

# threshold = (setosa_scores.max() + versicolor_scores.min()) / 2

# print(f"Decision threshold in LD1 space: {threshold:.4f}")
# print(f"Max setosa LD1: {setosa_scores.max():.4f}")
# print(f"Min versicolor LD1: {versicolor_scores.min():.4f}")

# # The point closest to THIS threshold is most uncertain
# distances_to_threshold = np.abs(X_transformed.flatten() - threshold)
# most_uncertain_idx = np.argmin(distances_to_threshold)
# print(f"\nMost uncertain point: LD1 = {X_transformed[most_uncertain_idx][0]:.4f}")
# print(f"Distance from threshold: {distances_to_threshold[most_uncertain_idx]:.4f}")
flower = pd.DataFrame({
    'sepal_length': [5.8],
    'sepal_width': [2.7],
    'petal_length': [5.1],
    'petal_width': [1.9],
})
lda.summary()
lda.transform(flower)

Linear Discriminant Analysis Model
Number of classes: 3
Classes: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Number of features: 4
Number of components: None
Training accuracy: 98.00%

LINEAR DISCRIMINANT ANALYSIS MODEL SUMMARY

MODEL INFORMATION:
----------------------------------------------------------------------
Number of features: 4
Number of classes: 3
Classes: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Number of discriminant components: 2
Solver: svd
Training samples: 150

DISCRIMINANT FUNCTIONS (for classification):
----------------------------------------------------------------------

With 3 classes, LDA creates discriminant scores for each class.
Classification: Predict the class with the highest discriminant score.

Discriminant for 'Iris-setosa':
  f_Iris-setosa(x) = 6.2462×sepal_length + 12.2461×sepal_width - 16.8374×petal_length - 21.1372×petal_width - 15.3955

Discriminant for 'Iris-versicolor':
  f_Iris-versicolor(x) = -1.5167×sepal_length - 4.3679×

array([[-5.5156825 , -0.04401811]])

In [2]:
# Get coefficients for LD1 and LD2
coef_LD1 = lda.lda.coef_[0]  # First discriminant
coef_LD2 = lda.lda.coef_[1]  # Second discriminant

print("\nLD1 coefficients:")
for i, feature in enumerate(lda.feature_names):
    print(f"  {feature}: {coef_LD1[i]:.4f}")

print("\nLD2 coefficients:")
for i, feature in enumerate(lda.feature_names):
    print(f"  {coef_LD2[i]:.4f}")

# Get the overall means (used for centering)
overall_means = lda.lda.xbar_  # Overall feature means

print("\nOverall feature means (for centering):")
for i, feature in enumerate(lda.feature_names):
    print(f"  {feature}: {overall_means[i]:.4f}")

# Get class means in discriminant space
class_means_original = lda.lda.means_  # Class means in original space

print("\nClass means in original space:")
for i, class_name in enumerate(lda.classes):
    print(f"\n{class_name}:")
    for j, feature in enumerate(lda.feature_names):
        print(f"  {feature}: {class_means_original[i, j]:.4f}")



LD1 coefficients:
  sepal_length: 6.2462
  sepal_width: 12.2461
  petal_length: -16.8374
  petal_width: -21.1372

LD2 coefficients:
  -1.5167
  -4.3679
  4.6498
  3.1864

Overall feature means (for centering):
  sepal_length: 5.8433
  sepal_width: 3.0540
  petal_length: 3.7587
  petal_width: 1.1987

Class means in original space:

Iris-setosa:
  sepal_length: 5.0060
  sepal_width: 3.4180
  petal_length: 1.4640
  petal_width: 0.2440

Iris-versicolor:
  sepal_length: 5.9360
  sepal_width: 2.7700
  petal_length: 4.2600
  petal_width: 1.3260

Iris-virginica:
  sepal_length: 6.5880
  sepal_width: 2.9740
  petal_length: 5.5520
  petal_width: 2.0260


In [3]:
import cuanalytics
import pandas as pd
import numpy as np

# Load data and fit model
df = cuanalytics.load_iris_data()
lda = cuanalytics.fit_lda(df, formula='species ~ .')

# Test flower
test_flower = pd.DataFrame({
    'sepal_length': [5.8],
    'sepal_width': [2.7],
    'petal_length': [5.1],
    'petal_width': [1.9]
})

print("="*70)
print("MANUAL LDA PREDICTION - STEP BY STEP")
print("="*70)

# Step 1: Get the transformation matrix (scalings)
scalings = lda.lda.scalings_
overall_means = lda.lda.xbar_

print("\nStep 1: Get transformation parameters")
print(f"Scalings matrix shape: {scalings.shape}")
print(f"Overall means: {overall_means}")

# Step 2: Center the flower
flower_values = test_flower.values[0]
centered_flower = flower_values - overall_means

print("\nStep 2: Center the flower (subtract overall means)")
for i, feature in enumerate(lda.feature_names):
    print(f"  {feature}: {flower_values[i]:.4f} - {overall_means[i]:.4f} = {centered_flower[i]:.4f}")

# Step 3: Project onto LD axes
LD_scores = np.dot(centered_flower, scalings)
LD1_score = LD_scores[0]
LD2_score = LD_scores[1]

print("\nStep 3: Project onto discriminant axes")
print(f"  LD1 = {LD1_score:.4f}")
print(f"  LD2 = {LD2_score:.4f}")

# Verify
official_coords = lda.transform(test_flower)
print(f"\nVerification with lda.transform():")
print(f"  Manual:   ({LD1_score:.4f}, {LD2_score:.4f})")
print(f"  Official: ({official_coords[0,0]:.4f}, {official_coords[0,1]:.4f})")
print(f"  ✓ Match: {np.allclose(LD_scores, official_coords[0])}")

# Step 4: Get class centroids in discriminant space
print("\nStep 4: Calculate class centroids in discriminant space")
class_means_original = lda.lda.means_
class_means_LD = []

for i, class_name in enumerate(lda.classes):
    centered_class_mean = class_means_original[i] - overall_means
    class_LD = np.dot(centered_class_mean, scalings)
    class_means_LD.append(class_LD)
    print(f"  {class_name}: LD1={class_LD[0]:.4f}, LD2={class_LD[1]:.4f}")

# Step 5: Calculate distances to each centroid
print("\nStep 5: Calculate Euclidean distances to centroids")
flower_coords = np.array([LD1_score, LD2_score])
distances = []

for i, class_name in enumerate(lda.classes):
    centroid = np.array(class_means_LD[i])
    distance = np.linalg.norm(flower_coords - centroid)
    distances.append(distance)
    print(f"  Distance to {class_name}: {distance:.4f}")

# Step 6: Predict the closest class
closest_class_idx = np.argmin(distances)
predicted_class = lda.classes[closest_class_idx]

print("\nStep 6: Make prediction")
print(f"  Closest centroid: {predicted_class}")
print(f"  → Manual prediction: {predicted_class}")

# Verify
official_prediction = lda.predict(test_flower)[0]
print(f"\nVerification with lda.predict():")
print(f"  Manual:   {predicted_class}")
print(f"  Official: {official_prediction}")
print(f"  ✓ Match: {predicted_class == official_prediction}")

print("\n" + "="*70)

Linear Discriminant Analysis Model
Number of classes: 3
Classes: ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Number of features: 4
Number of components: None
Training accuracy: 98.00%
MANUAL LDA PREDICTION - STEP BY STEP

Step 1: Get transformation parameters
Scalings matrix shape: (4, 2)
Overall means: [5.84333333 3.054      3.75866667 1.19866667]

Step 2: Center the flower (subtract overall means)
  sepal_length: 5.8000 - 5.8433 = -0.0433
  sepal_width: 2.7000 - 3.0540 = -0.3540
  petal_length: 5.1000 - 3.7587 = 1.3413
  petal_width: 1.9000 - 1.1987 = 0.7013

Step 3: Project onto discriminant axes
  LD1 = -5.5157
  LD2 = -0.0440

Verification with lda.transform():
  Manual:   (-5.5157, -0.0440)
  Official: (-5.5157, -0.0440)
  ✓ Match: True

Step 4: Calculate class centroids in discriminant space
  Iris-setosa: LD1=7.6156, LD2=0.2126
  Iris-versicolor: LD1=-1.8225, LD2=-0.7180
  Iris-virginica: LD1=-5.7932, LD2=0.5054

Step 5: Calculate Euclidean distances to centroids
  Dis

In [4]:
import cuanalytics
import numpy as np

# 2 classes only
df = cuanalytics.load_iris_data()
df_2class = df[df['species'].isin(['Iris-setosa', 'Iris-versicolor'])]
lda = cuanalytics.fit_lda(df_2class, formula='species ~ .')

# Test flower
test_flower = pd.DataFrame({
    'sepal_length': [5.8],
    'sepal_width': [2.7],
    'petal_length': [5.1],
    'petal_width': [1.9]
})

# Method 1: Use projection (scalings)
LD1_score = lda.transform(test_flower)[0, 0]
print(f"LD1 score: {LD1_score:.4f}")

# Find the threshold (midpoint between class means in LD space)
class_means_LD1 = lda.transform(pd.DataFrame(lda.lda.means_, columns=lda.feature_names))
threshold = class_means_LD1.mean()
print(f"Threshold: {threshold:.4f}")

prediction_via_projection = "Iris-setosa" if LD1_score < threshold else "Iris-versicolor"
print(f"Prediction via projection: {prediction_via_projection}")

# Method 2: Use decision function (coef)
# This works directly in 4D space without projection
decision_score = lda.lda.decision_function(test_flower)[0]
print(f"\nDecision score: {decision_score:.4f}")

prediction_via_decision = lda.predict(test_flower)[0]
print(f"Prediction via decision function: {prediction_via_decision}")

# They should match!
print(f"\nSame prediction? {prediction_via_projection == prediction_via_decision}")

Linear Discriminant Analysis Model
Number of classes: 2
Classes: ['Iris-setosa', 'Iris-versicolor']
Number of features: 4
Number of components: None
Training accuracy: 100.00%
LD1 score: 8.8460
Threshold: -0.0000
Prediction via projection: Iris-versicolor

Decision score: 90.1729
Prediction via decision function: Iris-versicolor

Same prediction? True
