# Dataset:
You will be working with two datasets: Train and Test, both containing images of 40 individuals. Each dataset has been preprocessed for this assignment.

In [1]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA


# Create PCA function that takes two inputs: dataset, and k= number of principle component, and return the transform data

In [2]:
def my_pca(Data,k):
    "Build this function"
    # Calculate the mean of the data
    mean = np.mean(Data, axis=0)
    
    # Center the data by subtracting the mean
    centered_data = Data - mean
    
    # Compute the covariance matrix of the centered data
    covariance_matrix = np.cov(centered_data, rowvar=False)
    
    # Perform eigendecomposition of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    
    # Sort the eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Take the top k eigenvectors
    top_k_eigenvectors = eigenvectors[:, :k]
    
    # Project the centered data onto the top k eigenvectors to get the transformed data
    transformed_data = np.dot(centered_data, top_k_eigenvectors)
    
    return transformed_data
    

# Read your TrainData and Test data. Try to remove the last column of the training data and assign it to one variable, do same thing for the TestData

In [3]:
# write the code here

# Read TrainData and TestData
train_data = pd.read_csv('TrainData.csv')
test_data = pd.read_csv('TestData.csv')

# Display the original datasets
print("Original TrainData:")
print(train_data)

print("\nOriginal TestData:")
print(test_data)

# Remove the last column from TrainData and TestData and assign it to variables
last_column_train = train_data.iloc[:, -1]  # assuming iloc is used for position based indexing
train_data = train_data.iloc[:, :-1]

last_column_test = test_data.iloc[:, -1]
test_data = test_data.iloc[:, :-1]

# Display the modified datasets
print("\nModified TrainData:")
print(train_data)

print("\nLast Column of TrainData:")
print(last_column_train)

print("\nModified TestData:")
print(test_data)

print("\nLast Column of TestData:")
print(last_column_test)


Original TrainData:
     T_1  T_2  T_3  T_4  T_5  T_6  T_7  T_8  T_9  T_10  ...  T_10296  T_10297  \
0     48   51   43   44   56   50   39   46   57    50  ...       39       41   
1     59   66   64   52   44   50   59   64   70    70  ...       30       30   
2     40   40   54   47   56   35   69   43   37    37  ...       31       32   
3     62   52   34   43   31   36   27   34   35    45  ...      170      165   
4     62   74   71   51   44   57   69   69   63    52  ...       30       32   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...      ...      ...   
195  103  104  106  107  107  106  104  103  105   109  ...       68       70   
196  104  106  107  106  104  104  106  108  110   110  ...       65       65   
197   98  104  110  111  107  105  107  110  109   107  ...       74       73   
198  115  110  110  113  115  112  111  114  110   111  ...       92       96   
199  109  109  110  110  111  112  112  113  114   114  ...       91      100   

     T_

# Apply my_pca 

In [4]:
# Load TrainData.csv
train_data = np.genfromtxt('TrainData.csv', delimiter=',')

# Extract features (assuming the features are in columns 1 to second-to-last column)
features = train_data[:, :-1]

# Apply PCA with k=10 (you can adjust k as needed)
transformed_data = my_pca(features, k=10)

# Display the shape of the transformed data
print(f"Original data shape: {features.shape}")
print(f"Transformed data shape: {transformed_data.shape}")









Original data shape: (201, 10304)
Transformed data shape: (201, 10)


# Apply pca using sklearn

In [5]:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load TrainData.csv
train_data = pd.read_csv('TrainData.csv')

# Extract features (assuming the features are in columns 1 to second-to-last column)
features = train_data.iloc[:, :-1].values

# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Apply PCA with the desired number of components (k)
num_components = 10  # Set the desired number of principal components
pca = PCA(n_components=num_components)
transformed_data = pca.fit_transform(standardized_features)

# Display the shape of the transformed data
print(f"Original data shape: {features.shape}")
print(f"Transformed data shape: {transformed_data.shape}")


Original data shape: (200, 10304)
Transformed data shape: (200, 10)


# Create kernel PCA function that takes two inputs: dataset, and k= number of principle component, and return the transform data. In addition, you need to create three other function one for rbf_kernel, one for polynomial_kernel, and one for linear_kernel

In [10]:
import numpy as np

def rbf_kernel(x, y, gamma=1.0):
    """
    Radial Basis Function (RBF) kernel.
    """
    return np.exp(-gamma * np.linalg.norm(x - y) ** 2)

def poly_kernel(x, y, degree=3):
    """
    Polynomial kernel.
    """
    return (np.dot(x, y) + 1) ** degree

def linear_kernel(x, y):
    """
    Linear kernel.
    """
    return np.dot(x, y)

def my_kpca(data, n_components, kernel_type='rbf', kernel_param=1.0):
    """
    Kernel Principal Component Analysis (KPCA) function.

    Parameters:
    - data: Input data as an ndarray of shape (n_samples, n_features).
    - n_components: Number of principal components to retain.
    - kernel_type: Type of kernel ('rbf', 'poly', or 'linear').
    - kernel_param: Kernel parameter (e.g., gamma for RBF, degree for polynomial).

    Returns:
    - Transformed data in the KPCA space.
    """
    
    n_samples = data.shape[0]
    kernel_matrix = np.zeros((n_samples, n_samples))

    # Build the kernel matrix based on the specified kernel type
    for i in range(n_samples):
        for j in range(n_samples):
            if kernel_type == 'rbf':
                kernel_matrix[i, j] = rbf_kernel(data[i], data[j], gamma=kernel_param)
            elif kernel_type == 'poly':
                kernel_matrix[i, j] = poly_kernel(data[i], data[j], degree=kernel_param)
            elif kernel_type == 'linear':
                kernel_matrix[i, j] = linear_kernel(data[i], data[j])
            else:
                raise ValueError("Invalid kernel type. Supported types are 'rbf', 'poly', and 'linear'.")

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    centered_matrix = kernel_matrix - one_n.dot(kernel_matrix) - kernel_matrix.dot(one_n) + one_n.dot(kernel_matrix).dot(one_n)

    # Eigendecomposition
    eigvals, eigvecs = np.linalg.eigh(centered_matrix)
    eigvecs = eigvecs[:, ::-1]

    # Select the top n_components eigenvectors
    indices = np.argsort(eigvals)[::-1][:n_components]
    eigvals, eigvecs = eigvals[indices], eigvecs[:, indices]

    # Normalize eigenvectors by eigenvalues
    eigvecs = eigvecs / np.sqrt(eigvals)

    # Project the data onto the eigenvectors
    transformed_data = np.dot(centered_matrix, eigvecs)

    return transformed_data


# Apply my_kpca on the Train data


In [11]:
# Assuming you have already defined the my_kpca function and the kernel functions (rbf_kernel, poly_kernel, linear_kernel)

# Load TrainData.csv
train_data = np.genfromtxt('TrainData.csv', delimiter=',')

# Extract features (assuming the features are in columns 1 to second-to-last column)
features = train_data[:, :-1]

# Apply KPCA with, for example, k=10 using the RBF kernel
k = 10
transformed_data = my_kpca(features, n_components=k, kernel_type='rbf', kernel_param=1.0)

# Display the shape of the transformed data
print(f"Original data shape: {features.shape}")
print(f"Transformed data shape: {transformed_data.shape}")



Original data shape: (201, 10304)
Transformed data shape: (201, 10)


# Apply Kpca using sklearn on Training data

In [12]:
# build your code here
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Load TrainData.csv
train_data = np.genfromtxt('TrainData.csv', delimiter=',')

# Extract features (assuming the features are in columns 1 to second-to-last column)
features = train_data[:, :-1]

# Handle missing values (replace NaN with the mean value)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Flatten each image into a one-dimensional array
flattened_images = features_imputed.reshape(features_imputed.shape[0], -1)

# Standardize the features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(flattened_images)

# Apply Kernel PCA with RBF kernel
kpca = KernelPCA(n_components=5, kernel='rbf', gamma=1.0)
transformed_data = kpca.fit_transform(features_standardized)

# Display the shape of the transformed data
print(f"Original data shape: {flattened_images.shape}")
print(f"Transformed data shape: {transformed_data.shape}")



Original data shape: (201, 10304)
Transformed data shape: (201, 5)


# You can use the functions below as your classifier 


In [37]:
# Function to calculate distance between two points
def dis(x1, x2):
    return np.linalg.norm(x1 - x2)

# Function to perform classification 
def myclassifier(Train, Trainlabel, Test):
    " Train is the training data"
    " Trainlabel is the training labels"
    " Test is the testing data"
    pred = []

    for testpoint in Test:
        pred_dis = []
        for trainpoint in Train:
            pred_dis.append(dis(testpoint, trainpoint))

        pred.append(np.argmin(pred_dis))  

    return np.array(pred)


# Below function is to calculte the accuracy , you can use this function to get the accuracy of pca and kpca

In [30]:
def calculate_accuracy(true_labels, predicted_labels):
    # Ensure that the true labels and predicted labels have the same length
    if len(true_labels) != len(predicted_labels):
        raise ValueError("Length of true_labels and predicted_labels must be the same.")

    # Count the number of correct predictions
    correct_predictions = sum(1 for true, predicted in zip(true_labels, predicted_labels) if true == predicted)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(true_labels)

    return accuracy




In [39]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np

# Load TrainData.csv
train_data = np.genfromtxt('TrainData.csv', delimiter=',')

# Extract features (assuming the features are in columns 1 to second-to-last column)
features = train_data[:, :-1]

# Extract labels (assuming labels are in the last column)
labels = train_data[:, -1]

# Handle missing values (replace NaN with the mean value)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Flatten each image into a one-dimensional array
flattened_images = features_imputed.reshape(features_imputed.shape[0], -1)

# Standardize the features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(flattened_images)

# Apply Kernel PCA with RBF kernel
kpca = KernelPCA(n_components=10, kernel='rbf', gamma=1.0)
transformed_data = kpca.fit_transform(features_standardized)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(transformed_data, labels, test_size=0.2, random_state=42)

# Use your classifier (myclassifier) on the transformed data
kpca_predictions = myclassifier(X_train, y_train, X_test)

# Calculate accuracy using your provided 'calculate_accuracy' function
kpca_accuracy = calculate_accuracy(y_test, kpca_predictions)

print("KPCA Accuracy:", kpca_accuracy)
# Use your classifier (myclassifier) on the transformed data
pca_predictions = myclassifier(X_train, y_train, X_test)

# Calculate accuracy using your provided 'calculate_accuracy' function
pca_accuracy = calculate_accuracy(y_test, pca_predictions)

print("PCA Accuracy:", pca_accuracy)


KPCA Accuracy: 0.04878048780487805
PCA Accuracy: 0.04878048780487805
