In [4]:
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 29 22:41:03 2022

@author: Ayush
"""

from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import math

# Question 1
print("Question 1")

# Read the two-class steel plate faults data.
data = pd.read_csv('SteelPlateFaults-2class.csv')

# Every column except 'Class' is a feature; 'Class' is the label.
X = data.drop(columns='Class')
Y = data['Class']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Re-attach the labels to each partition and write both to disk
# (the Question 3 section reads these files back in).
train_data = pd.concat([X_train, Y_train], axis=1)
train_data.to_csv('SteelPlateFaults-train.csv', index=False)
test_data = pd.concat([X_test, Y_test], axis=1)
test_data.to_csv('SteelPlateFaults-test.csv', index=False)

# Best K and its accuracy for KNN on the raw features; updated by KNN_Q1.
highest_accuracy_Q1_K = 1
highest_accuracy_Q1_value = 0

def KNN_Q1(K):
    """Fit and score a K-nearest-neighbour classifier on the raw features.

    Trains on the module-level ``X_train``/``Y_train``, predicts the labels
    of ``X_test``, and prints the confusion matrix and the classification
    accuracy. If the accuracy beats the best seen so far, the module-level
    ``highest_accuracy_Q1_K`` and ``highest_accuracy_Q1_value`` are updated.

    Args:
        K: Number of neighbours passed to ``KNeighborsClassifier``.
    """
    global highest_accuracy_Q1_K, highest_accuracy_Q1_value

    # Fit on the training split and label the held-out rows.
    model = KNeighborsClassifier(n_neighbors=K)
    predicted = model.fit(X_train, Y_train).predict(X_test)

    # Report the confusion matrix.
    print("Confusion Matrix for K =", K)
    print(confusion_matrix(Y_test, predicted))
    print()

    # Report the classification accuracy.
    accuracy = accuracy_score(Y_test, predicted)
    print("Classification Accuracy for K =", K, " is ", accuracy)
    print()

    # Keep this K only when it strictly improves on the current best.
    if accuracy > highest_accuracy_Q1_value:
        highest_accuracy_Q1_K, highest_accuracy_Q1_value = K, accuracy

# Run the raw-feature KNN for each K under test, then report the winner.
for neighbours in (1, 3, 5):
    KNN_Q1(neighbours)

print("Highest Accuracy K is: ", highest_accuracy_Q1_K)
print("Highest Accuracy value is: ", highest_accuracy_Q1_value)
print()

# Question 2
# Print a section header so the Question 2 results are separated from the
# Question 1 summary, matching what the other questions do.
print("Question 2")

# Min-Max normalisation: the scaler is fitted on the training features only
# and the same fitted scaler is then applied to the test features.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Save the normalised features, re-attaching the original column names.
# Only the feature columns are written; the 'Class' labels are not included.
# NOTE: the two file names differ in the capitalisation of "Normalised";
# they are kept exactly as they were so existing consumers keep working.
pd.DataFrame(X_train_normalized, columns=X_train.columns).to_csv(
    'SteelPlateFaults-train-Normalised.csv', index=False
)
pd.DataFrame(X_test_normalized, columns=X_test.columns).to_csv(
    'SteelPlateFaults-test-normalised.csv', index=False
)

# Best K and its accuracy for KNN on the normalised features; updated by
# KNN_Q2.
highest_accuracy_Q2_K = 1
highest_accuracy_Q2_value = 0

def KNN_Q2(K):
    """Fit and score a K-nearest-neighbour classifier on normalised features.

    Trains on the module-level ``X_train_normalized``/``Y_train``, predicts
    the labels of ``X_test_normalized``, and prints the confusion matrix and
    the classification accuracy. If the accuracy beats the best seen so far,
    the module-level ``highest_accuracy_Q2_K`` and
    ``highest_accuracy_Q2_value`` are updated.

    Args:
        K: Number of neighbours passed to ``KNeighborsClassifier``.
    """
    global highest_accuracy_Q2_K, highest_accuracy_Q2_value

    # Fit on the normalised training split and label the held-out rows.
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_train_normalized, Y_train)
    predicted_labels = knn.predict(X_test_normalized)

    # Report the confusion matrix.
    print("Confusion Matrix for K =", K)
    print(confusion_matrix(Y_test, predicted_labels))
    print()

    # Report the classification accuracy.
    score = accuracy_score(Y_test, predicted_labels)
    print("Classification Accuracy for K =", K, " is ", score)
    print()

    # Keep this K only when it strictly improves on the current best.
    if score > highest_accuracy_Q2_value:
        highest_accuracy_Q2_value = score
        highest_accuracy_Q2_K = K

# Run the normalised-feature KNN for each K under test, then report the
# winner.
for neighbours in (1, 3, 5):
    KNN_Q2(neighbours)

print("Highest Accuracy K is: ", highest_accuracy_Q2_K)
print("Highest Accuracy value is: ", highest_accuracy_Q2_value)
print()

# Question 3
print("Question 3")

# Read back the train/test partitions written in Question 1.
train = pd.read_csv('SteelPlateFaults-train.csv')
test = pd.read_csv('SteelPlateFaults-test.csv')

# Split each partition into its feature columns and its 'Class' labels.
X_train, Y_train = train.drop(columns='Class'), train['Class']
X_test, Y_test = test.drop(columns='Class'), test['Class']

# Training rows belonging to each class.
class0_train = X_train[Y_train == 0]
class1_train = X_train[Y_train == 1]

# Per-class mean vectors.
mean0 = class0_train.mean()
mean1 = class1_train.mean()

# Per-class covariance matrices with a small value added to the diagonal
# (presumably to keep them invertible).
diagonal_offset = 1e-6 * np.eye(X_train.shape[1])
cov0 = class0_train.cov() + diagonal_offset
cov1 = class1_train.cov() + diagonal_offset

# Class priors: the fraction of training rows in each class.
prior0 = len(class0_train) / len(X_train)
prior1 = len(class1_train) / len(X_train)

def likelihood(x, mean, cov):
    """Return the multivariate Gaussian density of ``x`` under N(mean, cov).

    The density is evaluated in log space and exponentiated once at the end.
    This avoids the intermediate overflow/underflow of computing the
    determinant directly (which becomes exactly 0 or inf for
    high-dimensional or badly scaled covariances) and avoids forming an
    explicit matrix inverse.

    Args:
        x: 1-D sample (array-like or pandas Series) of length d.
        mean: 1-D mean vector of length d, compatible with ``x - mean``.
        cov: d x d covariance matrix (array-like or pandas DataFrame).

    Returns:
        The density value as a Python float. It is ``0.0`` when the true
        value underflows and ``inf`` when it exceeds the float range.

    Raises:
        ValueError: If the determinant of ``cov`` is not strictly positive,
            since the Gaussian density is undefined in that case.
    """
    size = len(x)
    # Subtract first so pandas inputs keep their label alignment, then work
    # on plain float arrays.
    x_mu = np.asarray(x - mean, dtype=float)
    cov_matrix = np.asarray(cov, dtype=float)

    # slogdet returns the sign and log|det| separately, so a tiny or huge
    # determinant does not collapse to 0.0 or inf.
    sign, log_det = np.linalg.slogdet(cov_matrix)
    if sign <= 0:
        raise ValueError(
            "covariance matrix must have a strictly positive determinant"
        )

    # Quadratic form x_mu^T cov^-1 x_mu via a linear solve instead of an
    # explicit inverse.
    quad_form = float(np.dot(x_mu, np.linalg.solve(cov_matrix, x_mu)))

    log_density = -0.5 * (size * math.log(2.0 * math.pi) + log_det + quad_form)
    try:
        return math.exp(log_density)
    except OverflowError:
        # The density is larger than the largest representable float.
        return math.inf

# Bayes decision rule: for every test row compare prior * likelihood for the
# two classes and predict class 0 only when its score is strictly larger.
predictions = []
for row_number in range(len(X_test)):
    sample = X_test.iloc[row_number]
    score0 = prior0 * likelihood(sample, mean0, cov0)
    score1 = prior1 * likelihood(sample, mean1, cov1)
    predictions.append(0 if score0 > score1 else 1)

# Score the predictions against the true labels and report the results.
bayes_confusion = confusion_matrix(Y_test, predictions)
bayes_accuracy = accuracy_score(Y_test, predictions)
print("Confusion Matrix for Bayes Classifier:")
print(bayes_confusion)
print("Accuracy Score for Bayes Classifier: ", bayes_accuracy)
print()

# Question 4
print("Question 4")

# Best accuracy obtained by each classifier, in the order they are reported.
accuracy_by_model = [
    ("KNN", highest_accuracy_Q1_value),
    ("KNN Normalised", highest_accuracy_Q2_value),
    ("Bayes Classifier", bayes_accuracy),
]
for model_name, model_accuracy in accuracy_by_model:
    print(model_name)
    print(model_accuracy)
    print()

# Pick the winner from the measured accuracies rather than hard-coding it,
# so the conclusion stays correct if the data or the split changes.
# On a tie the classifier listed first wins.
best_model, best_accuracy = max(accuracy_by_model, key=lambda entry: entry[1])
print('Highest accuracy is achieved in ' + best_model + ': ', best_accuracy)
print()


Question 1
Confusion Matrix for K = 1
[[ 81  27]
 [ 27 201]]

Classification Accuracy for K = 1  is  0.8392857142857143

Confusion Matrix for K = 3
[[ 83  25]
 [ 12 216]]

Classification Accuracy for K = 3  is  0.8898809523809523

Confusion Matrix for K = 5
[[ 82  26]
 [  9 219]]

Classification Accuracy for K = 5  is  0.8958333333333334

Highest Accuracy K is:  5
Highest Accuracy value is:  0.8958333333333334

Confusion Matrix for K = 1
[[104   4]
 [  9 219]]

Classification Accuracy for K = 1  is  0.9613095238095238

Confusion Matrix for K = 3
[[105   3]
 [  7 221]]

Classification Accuracy for K = 3  is  0.9702380952380952

Confusion Matrix for K = 5
[[104   4]
 [  7 221]]

Classification Accuracy for K = 5  is  0.9672619047619048

Highest Accuracy K is:  3
Highest Accuracy value is:  0.9702380952380952

Question 3
Confusion Matrix for Bayes Classifier:
[[ 97  11]
 [  4 224]]
Accuracy Score for Bayes Classifier:  0.9553571428571429

Question 4
KNN
0.8958333333333334

KNN Normalised
