# CIS 4780 - Assignment 1

Name: Kyle Lukaszek

ID: 1113798

Due: 10/5/2023

## Import Dependencies

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

## Part 1 - KNN Implementation

### Define KNN Classifier Class

In [None]:
class KNNClassifier:
    """Binary k-nearest-neighbours classifier using Euclidean distance.

    Predictions are made with a signed vote (the sign of the sum of the
    neighbours' labels), so the two classes are expected to be encoded as
    -1 and +1.
    """

    def __init__(self):
        self.X_train = None
        self.y_train = None

    @staticmethod
    def _as_float_tensor(data):
        """Return *data* as a floating-point tensor.

        ``torch.as_tensor`` avoids re-wrapping inputs that are already
        tensors, and the cast is needed because ``torch.norm`` does not
        accept integer tensors.
        """
        tensor = torch.as_tensor(data)
        if not tensor.is_floating_point():
            tensor = tensor.to(torch.get_default_dtype())
        return tensor

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like of n_samples labels.
        """
        # detach().clone() keeps the stored data independent of the caller's
        # buffers, as the original torch.tensor(...) copy did.
        self.X_train = self._as_float_tensor(X_train).detach().clone()
        self.y_train = torch.as_tensor(y_train).detach().clone()

    def predict(self, X_test, k):
        """Classify every row of *X_test* from its *k* nearest training points.

        Args:
            X_test: array-like of shape (n_queries, n_features).
            k: number of neighbours to consult (at least 1). Values larger
                than the training set simply use every training point.

        Returns:
            A list with one Python scalar label per query row.

        Raises:
            ValueError: if the classifier has not been fitted or k < 1.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("Not fitted yet. Call fit() before predict()")
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

        # Convert the whole query set once instead of once per row.
        queries = self._as_float_tensor(X_test)

        y_pred = []
        for query in queries:
            # Euclidean distance from this query to every training point
            distances = torch.norm(self.X_train - query, dim=1)

            # Indices of the k closest training points, nearest first
            nearest = torch.argsort(distances)[:k]
            neighbor_labels = self.y_train[nearest]

            # Majority vote for -1/+1 labels is the sign of their sum
            vote = torch.sign(torch.sum(neighbor_labels))

            # With an even k the vote can tie at 0, which is not a class
            # label; fall back to the label of the single nearest neighbour.
            if vote == 0 and neighbor_labels.numel() > 0:
                vote = neighbor_labels[0]

            y_pred.append(vote.item())

        return y_pred

### Define KNN K-fold Cross Validation Function

In [None]:
def knn_k_fold_cross_validation(X, y, k_values, num_folds=10, random_state=None):
    """Estimate KNN train/test accuracy for several K values with K-fold CV.

    The data is partitioned into folds exactly once and the same partition
    is reused for every K. Calling ``KFold.split`` again for each K (with
    ``shuffle=True`` and no fixed seed) would score each K on a different
    random partition, which adds noise to the comparison between K values.

    Args:
        X: feature array indexable by integer index arrays, one row per sample.
        y: label array aligned with X.
        k_values: iterable of neighbour counts to evaluate.
        num_folds: number of cross-validation folds (default 10).
        random_state: optional seed forwarded to ``KFold`` for reproducible
            shuffling.

    Returns:
        (train_accuracies, test_accuracies): two lists holding, for each K in
        ``k_values`` order, the accuracy averaged over all folds.
    """
    # Shuffle before splitting so folds are not tied to the file's row order
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    # Fitting only stores the training data and does not depend on K, so do
    # it once per fold and reuse the fitted classifier for every K.
    folds = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        knn = KNNClassifier()
        knn.fit(X_train, y_train)
        folds.append((knn, X_train, y_train, X_test, y_test))

    train_accuracies = []
    test_accuracies = []

    for k in k_values:
        fold_accuracies_train = []
        fold_accuracies_test = []

        for knn, X_train, y_train, X_test, y_test in folds:
            fold_accuracies_train.append(
                accuracy_score(y_train, knn.predict(X_train, k))
            )
            fold_accuracies_test.append(
                accuracy_score(y_test, knn.predict(X_test, k))
            )

        # Average over folds to get one score per K
        train_accuracies.append(np.mean(fold_accuracies_train))
        test_accuracies.append(np.mean(fold_accuracies_test))

    return train_accuracies, test_accuracies

### Load Data And Run KNN Code

In [None]:
# Paths of the CSV files holding the features and the class labels
input_file = 'KNNClassifierInput.csv'
output_file = 'KNNClassifierOutput.csv'

# Read the features; the first row of the file supplies the column names
input_df = pd.read_csv(input_file, header=0)

# Read the labels, then drop every column that contains a missing value
output_df = pd.read_csv(output_file).dropna(axis=1)

# Feature matrix built from the two named input columns
X = input_df[['Input 1', 'Input 2']].values

# Label array with singleton dimensions removed
y = np.squeeze(output_df.values)

# Candidate neighbour counts: K = 1, 2, ..., 30
k_values = list(range(1, 31))

# Cross-validate every candidate K
train_accuracies, test_accuracies = knn_k_fold_cross_validation(X, y, k_values)

# Report the K whose averaged testing accuracy is highest
best_k = k_values[int(np.argmax(test_accuracies))]
print(f'Best K Value: {best_k}')
print(f'Test Accuracy with Best K Value: {max(test_accuracies) * 100:.2f}%')

### Plot Results

In [None]:
# Draw training and testing accuracy against K on a single figure
plt.figure(figsize=(10, 6))
for accuracy_series, series_label in (
    (train_accuracies, 'Training Accuracy'),
    (test_accuracies, 'Testing Accuracy'),
):
    plt.plot(k_values, accuracy_series, label=series_label)
plt.title('KNN Accuracy vs. K Value')
plt.xlabel('K (Number of Neighbors)')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

### K-Value Analysis

Write analysis here.

## Part 2: Linear Regression (Ridge)

### Define Ridge Regression Class

In [None]:
class RidgeRegression(nn.Module):
    """Linear model with a single output unit.

    The module itself is a plain linear map; any L2 (ridge) penalty has to
    be added to the loss by the code that trains it.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One affine layer mapping input_dim features to a scalar prediction
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to *x* and return the result."""
        prediction = self.linear(x)
        return prediction

### Define PyTorch Z-Score Normalization Class

In [None]:
## Simple implementation based on sklearn.preprocessing.StandardScaler
class ZScoreNormalizer:
    """Column-wise z-score scaler for tensors.

    ``fit`` records the per-column mean and standard deviation, and
    ``transform`` applies ``z = (X - mean) / std`` using those statistics.
    """

    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, data):
        """Compute the mean and standard deviation of *data* along dim 0.

        Args:
            data: tensor whose first dimension indexes samples.
        """
        self.mean = torch.mean(data, dim=0)
        std = torch.std(data, dim=0)

        # A constant column has std 0 (and a single-row input yields NaN);
        # dividing by either would fill the output with NaN/inf. Use a scale
        # of 1 for such columns so they come out centred and finite.
        # `std > 0` is False for both 0 and NaN.
        self.std = torch.where(std > 0, std, torch.ones_like(std))

    def transform(self, data):
        """Return *data* standardised with the statistics stored by ``fit``.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise ValueError("Not fitted yet. Call fit() before transform()")

        # z = (X - mean) / std
        normalized_data = (data - self.mean) / self.std
        return normalized_data

    def fit_transform(self, data):
        """Fit on *data* and return its standardised version."""
        self.fit(data)
        return self.transform(data)

### Define Ridge Regression K-fold Cross Validation Function

In [None]:
def lr_k_fold_cross_validation(X, y, model, criterion, optimizer, num_epochs=100, lambda_val=0.0, k_folds=5, random_state=None):
    """Train and evaluate a ridge-regularised linear model with K-fold CV.

    For every fold the model is restored to the weights it had when this
    function was called, trained with full-batch gradient steps on the
    training split, and scored on the held-out split.

    Args:
        X: feature tensor, one row per sample.
        y: target tensor aligned with X.
        model: module exposing a ``linear`` layer whose weights are penalised.
        criterion: loss function called as ``criterion(outputs, targets)``.
        optimizer: optimizer already bound to ``model.parameters()``.
        num_epochs: number of gradient steps per fold.
        lambda_val: coefficient of the L2 penalty on ``model.linear.weight``.
        k_folds: number of cross-validation folds (default 5).
        random_state: optional seed forwarded to ``KFold`` for reproducible
            shuffling.

    Returns:
        dict with the fold-averaged ``avg_train_loss``, ``avg_test_loss``,
        ``avg_r2_score`` and ``avg_mse_score``. The losses exclude the L2
        penalty; R2 and MSE are computed on the held-out split.
    """
    # Local import so this function does not depend on the import cell.
    import copy

    train_losses = []  # Training loss of each fold
    test_losses = []   # Held-out loss of each fold
    r2_scores = []     # Held-out R2 of each fold
    mse_scores = []    # Held-out mean squared error of each fold

    # Shuffle before splitting so folds are not tied to the file's row order
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)

    # Snapshot the starting point. Without restoring it, each fold would
    # continue from weights already trained on samples that are now in its
    # test split, which inflates the held-out scores.
    initial_model_state = copy.deepcopy(model.state_dict())
    initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Start every fold from the same untrained state
        model.load_state_dict(initial_model_state)
        optimizer.load_state_dict(copy.deepcopy(initial_optimizer_state))

        model.train()

        for _ in range(num_epochs):
            # Gradients accumulate in .grad across backward() calls, so they
            # must be cleared every epoch. Clearing them only once before the
            # loop sums the gradients of all previous epochs into each step,
            # which makes the updates grow without bound.
            optimizer.zero_grad()

            # Forward pass and data loss
            outputs = model(X_train)
            loss = criterion(outputs, y_train)

            # Ridge penalty: lambda * ||w||^2 (the bias is not penalised)
            loss = loss + lambda_val * torch.sum(model.linear.weight ** 2)

            loss.backward()

            # Kept as a safeguard: caps the gradient norm at 1.0 so a large
            # lambda cannot produce a destabilising step.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        # Evaluation without gradient tracking
        model.eval()

        with torch.no_grad():
            train_outputs = model(X_train)
            test_outputs = model(X_test)

            train_loss = criterion(train_outputs, y_train)
            test_loss = criterion(test_outputs, y_test)

            # Hand scikit-learn plain NumPy arrays rather than tensors
            y_true = y_test.detach().cpu().numpy()
            y_hat = test_outputs.detach().cpu().numpy()
            r2 = r2_score(y_true, y_hat)
            mse = mean_squared_error(y_true, y_hat)

        train_losses.append(train_loss.item())
        test_losses.append(test_loss.item())
        r2_scores.append(r2)
        mse_scores.append(mse)

    # Average every metric over the folds
    return {
        'avg_train_loss': np.mean(train_losses),
        'avg_test_loss': np.mean(test_losses),
        'avg_r2_score': np.mean(r2_scores),
        'avg_mse_score': np.mean(mse_scores)
    }

### Load Data And Run Linear Regression Code

In [None]:
# Paths of the CSV files holding the features and the regression targets
input_file = 'LinearRegression.csv'
target_file = 'LinearRegressionTarget.csv'

# Read both files into DataFrames
input_data_df = pd.read_csv(input_file)
target_data_df = pd.read_csv(target_file)

# Convert the DataFrames to float32 tensors
input_tensor = torch.tensor(input_data_df.values, dtype=torch.float32)
target_tensor = torch.tensor(target_data_df.values, dtype=torch.float32)

# Standardise features and targets, each with its own ZScoreNormalizer
input_normalizer = ZScoreNormalizer()
input_tensor = input_normalizer.fit_transform(input_tensor)

target_normalizer = ZScoreNormalizer()
target_tensor = target_normalizer.fit_transform(target_tensor)

# Hyperparameters: feature count comes from the data, epochs are fixed
input_dim = input_tensor.shape[1]
num_epochs = 100

# Regularisation strengths to sweep: every integer in [0, 250]
lambda_values = list(range(251))
results = []

# Cross-validate a freshly initialised model for each lambda
for lambda_val in lambda_values:
    model = RidgeRegression(input_dim)

    # Mean squared error is the data-fit part of the objective
    criterion = nn.MSELoss()

    # Author's note: lr=0.01 is used together with the gradient clipping
    # done inside lr_k_fold_cross_validation. A smaller learning rate
    # (e.g. 0.001) without clipping was tried and gave a lower R2 score and
    # larger average losses, so the larger rate with clipping was kept.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    cv_results = lr_k_fold_cross_validation(
        input_tensor,
        target_tensor,
        model,
        criterion,
        optimizer,
        num_epochs,
        lambda_val,
    )
    results.append((lambda_val, cv_results))

# Pick the (lambda, metrics) pair with the highest average R2 score
best_lambda, best_results = max(
    results, key=lambda entry: entry[1]['avg_r2_score']
)

# Baseline: the first entry of the sweep is lambda=0 (no regularisation)
print("Results for lambda=0:")
print(results[0][1])

# Metrics of the winning lambda
print(f"\nBest lambda: {best_lambda}")
print("Results for best lambda:")
print(best_results)

### Plot Ridge Regression R2 Results and MSE

In [None]:
# Pull the fold-averaged metrics out of the (lambda, metrics) pairs
r2_scores = [metrics['avg_r2_score'] for _, metrics in results]
mse_scores = [metrics['avg_mse_score'] for _, metrics in results]

# One figure per metric, each plotted against lambda (R2 first, then MSE)
for metric_values, plot_title, y_label in (
    (r2_scores, 'R2 Score vs. Lambda', 'Average R2 Score'),
    (mse_scores, 'MSE Score vs. Lambda', 'Average MSE Score'),
):
    plt.figure(figsize=(10, 6))
    plt.plot(lambda_values, metric_values)
    plt.title(plot_title)
    plt.xlabel('Lambda')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

### Best Lambda Observation

Write analysis here.

## Part 3: Logistic Regression

## Part 4: K-Nearest Neighbor Classifier vs. Logistic Regression