In [1]:
# Load necessary libraries
import numpy as np
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import os
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
plt.style.use('seaborn-v0_8-poster')

In [2]:
# Data path
folder = "/Users/haivanle/Documents/AMATH582/hw2data/train/"

# List of actions
actions = ['walking', 'jumping', 'running']

# Load every training recording ({action}_1.npy ... {action}_5.npy), ordered by
# action and then by recording index. The arrays are collected first and
# stacked column-wise in a single call: growing the matrix with np.hstack
# inside the loop re-copies everything loaded so far on each iteration and
# needs a hard-coded row count for the empty seed array.
train_blocks = []
for action in actions:
    for i in range(1, 6):
        fname = f"{action}_{i}.npy"
        current_X_train = np.load(os.path.join(folder, fname))
        train_blocks.append(current_X_train)

# The previous accumulator started from a float64 array, so the stacked result
# is kept as float64 (assumes the .npy files hold real-valued data).
X_train = np.hstack(train_blocks).astype(np.float64, copy=False)

In [3]:
# Sanity check on the stacked training matrix: recordings were concatenated
# column-wise, so rows are features and columns are samples
# (the recorded output of this cell is (114, 1500)).
X_train.shape

(114, 1500)

In [None]:
# Perform PCA and plot the first 5 PCA modes (principal directions).
# random_state pins the result on scikit-learn versions whose 'auto' policy
# selects the randomized SVD solver for a matrix of this size; it has no
# effect when a deterministic solver is chosen.
pca = PCA(n_components=5, random_state=0)

# Centre every feature (row) on its mean over all samples (columns).
X_train_centered = X_train - np.mean(X_train, axis=1, keepdims=True)

# sklearn expects (n_samples, n_features), hence the transpose.
pca.fit(X_train_centered.T)

# Combine all modes in one figure
fig, axs = plt.subplots(figsize=(15, 10))

# Take the x-axis length from the fitted components instead of hard-coding
# the number of features.
feature_index = np.arange(pca.components_.shape[1])
for mode, pca_mode in enumerate(pca.components_):
    axs.plot(feature_index, pca_mode, label=f'PCA Mode {mode + 1}')

axs.set_xlabel('Index')
axs.set_ylabel('Value')
axs.legend()
axs.set_title('First 5 PCA Modes')
plt.savefig('582hw2f1.pdf')
plt.show()

In [None]:
# Perform Singular Value Decomposition (SVD)
# Use code from SVD notebook
centered_data = X_train - np.mean(X_train, axis=1)[:, None]

# Thin SVD: only the singular values are used below, so there is no reason to
# build the full square right-singular-vector matrix (n_samples x n_samples).
# S is identical to the full decomposition; Vt keeps its leading rows only.
U, S, Vt = np.linalg.svd(centered_data, full_matrices=False)

# Compute the explained variance ratio from the singular values
explained_variance_ratio = (S ** 2) / np.sum(S ** 2)

# Compute cumulative explained variance ratio
cumulative_energy = np.cumsum(explained_variance_ratio)

# Plot cumulative energy against a 1-based mode count so the x-axis really is
# "number of singular values" (cumulative_energy[0] is the energy of 1 mode).
n_modes = np.arange(1, cumulative_energy.size + 1)
plt.plot(n_modes, cumulative_energy, marker='o', label='Cumulative Energy')
plt.xlabel('Number of Singular Values')
plt.ylabel('Cumulative Energy')
plt.title('Cumulative Energy vs Number of Singular Values')
plt.grid(True)

# Energy thresholds of interest, drawn as horizontal reference lines
percentiles = [0.7, 0.8, 0.9, 0.95]
colors = ['blue', 'green', 'orange', 'red']
for color, percent in zip(colors, percentiles):
    plt.axhline(y=percent, color=color, linestyle='--', label=f'{int(percent*100)}th Variance')
plt.legend()
plt.savefig('582hw2f2.pdf')
plt.show()

# Determine how many modes are needed to reach certain percentages of energy.
# argmax on the boolean array returns the first index that meets the
# threshold; +1 converts that 0-based index into a mode count.
for percent in percentiles:
    modes_needed = np.argmax(cumulative_energy >= percent) + 1
    print(f"To approximate X_train up to {percent * 100}% of its energy, we need {modes_needed} SVD modes.")

In [None]:
# Concatenate all training data and record one integer label per sample
# (the label is the position of the action in `actions`).
train_blocks = []
y_train = []  # List to store labels
for action_index, action in enumerate(actions):
    for i in range(1, 6):
        fname = f"{action}_{i}.npy"
        current_X_train = np.load(os.path.join(folder, fname))
        train_blocks.append(current_X_train)
        # Add label for each sample (column) of this recording
        y_train.extend([action_index] * current_X_train.shape[1])

# Stack once instead of growing the matrix inside the loop; kept float64 to
# match the float64 accumulator used previously (assumes real-valued data).
X_train = np.hstack(train_blocks).astype(np.float64, copy=False)

# Perform PCA with 2 and 3 components.
# random_state pins the result if scikit-learn selects its randomized solver.
pca_2d = PCA(n_components=2, random_state=0)
pca_3d = PCA(n_components=3, random_state=0)

# Centre the data in (n_samples, n_features) layout.
X_train_centered = X_train.T - np.mean(X_train.T, axis=0)

# Fit AND project the same centred matrix. Fitting on the centred data and
# then calling transform() on the uncentred X_train.T (as before) shifts every
# projected point by the projection of the data mean, because the mean stored
# by the fitted model is ~0 and is all that transform() subtracts.
X_2d = pca_2d.fit_transform(X_train_centered)
X_3d = pca_3d.fit_transform(X_train_centered)

# Plot 2D trajectories, one colour per action
y_train_arr = np.asarray(y_train)
plt.figure(figsize=(10, 5))
for action_index, action in enumerate(actions):
    action_mask = y_train_arr == action_index
    plt.scatter(X_2d[action_mask, 0], X_2d[action_mask, 1], label=action)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Trajectories in 2D PCA Space')
plt.legend()
plt.grid(True)
plt.savefig('582hw2f3.pdf')
plt.show()

In [None]:
# Plot 3D trajectories: scatter the 3-component projection, one colour per action
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection='3d')

for action_index, action in enumerate(actions):
    # Row positions of the samples that carry this action's label
    action_indices = [row for row, label in enumerate(y_train) if label == action_index]
    pc1, pc2, pc3 = (X_3d[action_indices, axis] for axis in range(3))
    ax.scatter(pc1, pc2, pc3, label=action)

ax.set_title('Trajectories in 3D PCA Space')
# Extra label padding keeps the axis names clear of the 3D tick labels
ax.set_xlabel('PC1', labelpad=20)
ax.set_ylabel('PC2', labelpad=25)
ax.set_zlabel('PC3', labelpad=15)
ax.legend()
ax.grid(True)
plt.savefig('582hw2f4.pdf')
plt.show()

In [8]:
# List of actions and their corresponding integer labels
action_labels = {action: i for i, action in enumerate(actions)}

# Load all training data with samples in rows. From this cell onward X_train is
# (n_samples, n_features), i.e. the transpose of the layout used by the
# earlier cells.
train_frames = []
y_train = []
for action in actions:
    for i in range(1, 6):
        fname = f"{action}_{i}.npy"
        current_X_train = np.load(os.path.join(folder, fname))
        train_frames.append(current_X_train.T)
        y_train.extend([action_labels[action]] * current_X_train.shape[1])

# Stack the per-recording blocks; a separate list name avoids X_train being a
# list at one point and an array at another.
X_train = np.concatenate(train_frames, axis=0)

# Perform PCA with k modes.
# random_state pins the result if scikit-learn selects its randomized solver.
k = 3  # Specify the number of modes
pca = PCA(n_components=k, random_state=0)
X_pca = pca.fit_transform(X_train)

# Compute the centroid (mean projected sample) of each movement. No
# zero-initialised placeholder is needed: every entry is assigned here.
y_train_arr = np.asarray(y_train)
centroids = {
    action: np.mean(X_pca[y_train_arr == label], axis=0)
    for action, label in action_labels.items()
}

# Print centroids
for action, centroid in centroids.items():
    print(f"Centroid for {action}: {centroid}")

Centroid for walking: [-36.88211143 253.35282541 175.91202104]
Centroid for jumping: [-23.88986635 499.36826149 -72.5000755 ]
Centroid for running: [  60.77197779 -752.7210869  -103.41194553]


In [9]:
# Function to compute distance between points
def distance(point1, point2):
    """Return the norm of ``point1 - point2`` (Euclidean distance for 1-D points).

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of matching shape); the result is ``np.linalg.norm`` of the difference
    with its default arguments.
    """
    offset = point1 - point2
    return np.linalg.norm(offset)

# Function to assign labels based on distance to centroids
def assign_labels(X_pca, centroids):
    """Label every sample with the position of its nearest centroid.

    Parameters
    ----------
    X_pca : array-like, shape (n_samples, k)
        Samples, one per row.
    centroids : dict
        Maps a name to a length-k centroid vector.

    Returns
    -------
    list
        One integer per sample: the index of the closest centroid in the
        iteration order of ``centroids.values()``. The index is positional,
        so it only equals a class label when the dict was built in label
        order. Ties go to the earliest centroid.

    Raises
    ------
    ValueError
        If there are samples to label but ``centroids`` is empty.
    """
    samples = np.asarray(X_pca, dtype=float)
    if samples.shape[0] == 0:
        return []
    if len(centroids) == 0:
        raise ValueError("centroids must contain at least one entry")

    # Stack the centroids once, then compute every sample-to-centroid distance
    # in one broadcasted operation instead of a Python loop over the samples.
    centroid_matrix = np.stack([np.asarray(c, dtype=float) for c in centroids.values()])
    offsets = samples[:, None, :] - centroid_matrix[None, :, :]
    distances = np.linalg.norm(offsets, axis=2)  # shape (n_samples, n_centroids)
    return list(np.argmin(distances, axis=1))

# Ground truth labels (converted once; reused for masking and for scoring)
ground_truth_labels = np.array(y_train)

# Specify the range of k values for PCA truncation
k_values = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30] 

# Compute trained labels and accuracy for each k value.
# Note: the classifier is scored on the same samples the centroids were
# computed from, so this is a training accuracy.
for k in k_values:
    # Perform PCA with k modes; random_state pins the result if scikit-learn
    # selects its randomized solver.
    pca = PCA(n_components=k, random_state=0)
    X_pca = pca.fit_transform(X_train)

    # Compute centroids for each movement, in label order so that the
    # positional index returned by assign_labels equals the class label.
    centroids = {
        action: np.mean(X_pca[ground_truth_labels == label], axis=0)
        for action, label in action_labels.items()
    }

    # Assign trained labels based on distance to centroids
    trained_labels = assign_labels(X_pca, centroids)

    # Compute accuracy
    accuracy = accuracy_score(ground_truth_labels, trained_labels)

    # Print results
    print(f"For k={k}:")
    print(f"Accuracy: {accuracy * 100:.2f}%\n")


For k=2:
Accuracy: 88.13%

For k=3:
Accuracy: 75.60%

For k=4:
Accuracy: 73.00%

For k=5:
Accuracy: 75.07%

For k=6:
Accuracy: 72.60%

For k=7:
Accuracy: 87.07%

For k=8:
Accuracy: 87.53%

For k=9:
Accuracy: 87.87%

For k=10:
Accuracy: 88.80%

For k=12:
Accuracy: 90.93%

For k=13:
Accuracy: 91.00%

For k=14:
Accuracy: 91.07%

For k=15:
Accuracy: 91.07%

For k=16:
Accuracy: 91.07%

For k=17:
Accuracy: 91.07%

For k=18:
Accuracy: 91.07%

For k=19:
Accuracy: 91.07%

For k=20:
Accuracy: 91.07%

For k=30:
Accuracy: 91.07%



In [10]:
# Data path
test_folder = "/Users/haivanle/Documents/AMATH582/hw2data/test/"

# Load test samples: one recording per action ({action}_1t.npy), transposed so
# that samples are in rows like X_train.
test_blocks = []
test_label_list = []
for action in actions:
    fname = f"{action}_1t.npy"
    test_sample = np.load(os.path.join(test_folder, fname)).T
    test_blocks.append(test_sample)
    test_label_list.extend([action_labels[action]] * test_sample.shape[0])

# Stack into arrays; separate list names avoid one name being a list at one
# point and an array at another.
test_samples = np.concatenate(test_blocks, axis=0)
test_labels = np.array(test_label_list)

# Training labels as an array, converted once for the boolean masks below
train_labels = np.asarray(y_train)

# Specify the range of k values for PCA truncation
k_values = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30]  

# Compute accuracy for each k value
for k in k_values:
    # Fit PCA on the training data only and project the test data with the
    # same model. random_state pins the result if scikit-learn selects its
    # randomized solver.
    pca = PCA(n_components=k, random_state=0)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(test_samples)

    # Compute centroids for each movement from the training projection, in
    # label order so that the positional index returned by assign_labels
    # equals the class label.
    centroids = {
        action: np.mean(X_train_pca[train_labels == label], axis=0)
        for action, label in action_labels.items()
    }

    # Assign predicted labels based on distance to centroids
    predicted_labels = assign_labels(X_test_pca, centroids)

    # Compute accuracy
    accuracy = accuracy_score(test_labels, predicted_labels)

    # Print results
    print(f"For k={k}:")
    print(f"Accuracy: {accuracy * 100:.2f}%\n")


For k=2:
Accuracy: 98.33%

For k=3:
Accuracy: 92.33%

For k=4:
Accuracy: 74.67%

For k=5:
Accuracy: 91.67%

For k=6:
Accuracy: 71.67%

For k=7:
Accuracy: 94.33%

For k=8:
Accuracy: 93.00%

For k=9:
Accuracy: 94.33%

For k=10:
Accuracy: 94.33%

For k=11:
Accuracy: 95.33%

For k=12:
Accuracy: 95.33%

For k=15:
Accuracy: 95.33%

For k=20:
Accuracy: 95.33%

For k=30:
Accuracy: 95.33%

