<a href="https://colab.research.google.com/github/kartheekkotha/IntroToML/blob/main/BDR_LDA_QDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import sklearn
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis

from sklearn.naive_bayes import GaussianNB


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data-loading
# cells read their input files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def readData(filePath):
    """Read a whitespace-delimited numeric text file into a list of rows.

    Args:
        filePath: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        the list of floats parsed from that line's whitespace-separated fields.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    dataSet = []
    with open(filePath, 'r') as file:
        for line in file:
            fields = line.split()
            # Skip blank / whitespace-only lines (e.g. a trailing empty line):
            # an empty row would make the later np.array(...) conversion ragged.
            if not fields:
                continue
            dataSet.append([float(x) for x in fields])
    return dataSet

In [None]:
# Load the raw rows of the training file and the test file from the mounted Drive.
trainDataset, testDataset = (
    readData('/content/drive/MyDrive/shuttle.trn'),
    readData('/content/drive/MyDrive/shuttle.tst'),
)

In [None]:
# Show the first ten raw rows of each file as a quick sanity check.
for caption, rows in (("Example Train Data", trainDataset),
                      ("Example Test Data", testDataset)):
    print(caption)
    print(rows[:10])

Example Train Data
[[50.0, 21.0, 77.0, 0.0, 28.0, 0.0, 27.0, 48.0, 22.0, 2.0], [55.0, 0.0, 92.0, 0.0, 0.0, 26.0, 36.0, 92.0, 56.0, 4.0], [53.0, 0.0, 82.0, 0.0, 52.0, -5.0, 29.0, 30.0, 2.0, 1.0], [37.0, 0.0, 76.0, 0.0, 28.0, 18.0, 40.0, 48.0, 8.0, 1.0], [37.0, 0.0, 79.0, 0.0, 34.0, -26.0, 43.0, 46.0, 2.0, 1.0], [85.0, 0.0, 88.0, -4.0, 6.0, 1.0, 3.0, 83.0, 80.0, 5.0], [56.0, 0.0, 81.0, 0.0, -4.0, 11.0, 25.0, 86.0, 62.0, 4.0], [55.0, -1.0, 95.0, -3.0, 54.0, -4.0, 40.0, 41.0, 2.0, 1.0], [53.0, 8.0, 77.0, 0.0, 28.0, 0.0, 23.0, 48.0, 24.0, 4.0], [37.0, 0.0, 101.0, -7.0, 28.0, 0.0, 64.0, 73.0, 8.0, 1.0]]
Example Test Data
[[55.0, 0.0, 81.0, 0.0, -6.0, 11.0, 25.0, 88.0, 64.0, 4.0], [56.0, 0.0, 96.0, 0.0, 52.0, -4.0, 40.0, 44.0, 4.0, 4.0], [50.0, -1.0, 89.0, -7.0, 50.0, 0.0, 39.0, 40.0, 2.0, 1.0], [53.0, 9.0, 79.0, 0.0, 42.0, -2.0, 25.0, 37.0, 12.0, 4.0], [55.0, 2.0, 82.0, 0.0, 54.0, -6.0, 26.0, 28.0, 2.0, 1.0], [41.0, 0.0, 84.0, 3.0, 38.0, -4.0, 43.0, 45.0, 2.0, 1.0], [37.0, 0.0, 100.0, 0.0, 3

In [None]:
# Convert the row lists to 2-D arrays.
trainDataset, testDataset = np.array(trainDataset), np.array(testDataset)

# Every column except the last is a feature; the last column is the class label.
X_train, Y_train = np.array(trainDataset[:, :-1]), np.array(trainDataset[:, -1])
X_test, Y_test = np.array(testDataset[:, :-1]), np.array(testDataset[:, -1])

# Preview the first ten entries of each array.
for caption, values in (("Example X-Train Data", X_train),
                        ("Example Y-Train Data", Y_train),
                        ("Example X-Test Data", X_test),
                        ("Example Y-Test Data", Y_test)):
    print(caption)
    print(values[:10])

Example X-Train Data
[[ 50.  21.  77.   0.  28.   0.  27.  48.  22.]
 [ 55.   0.  92.   0.   0.  26.  36.  92.  56.]
 [ 53.   0.  82.   0.  52.  -5.  29.  30.   2.]
 [ 37.   0.  76.   0.  28.  18.  40.  48.   8.]
 [ 37.   0.  79.   0.  34. -26.  43.  46.   2.]
 [ 85.   0.  88.  -4.   6.   1.   3.  83.  80.]
 [ 56.   0.  81.   0.  -4.  11.  25.  86.  62.]
 [ 55.  -1.  95.  -3.  54.  -4.  40.  41.   2.]
 [ 53.   8.  77.   0.  28.   0.  23.  48.  24.]
 [ 37.   0. 101.  -7.  28.   0.  64.  73.   8.]]
Example Y-Train Data
[2. 4. 1. 1. 1. 5. 4. 1. 4. 1.]
Example X-Test Data
[[ 55.   0.  81.   0.  -6.  11.  25.  88.  64.]
 [ 56.   0.  96.   0.  52.  -4.  40.  44.   4.]
 [ 50.  -1.  89.  -7.  50.   0.  39.  40.   2.]
 [ 53.   9.  79.   0.  42.  -2.  25.  37.  12.]
 [ 55.   2.  82.   0.  54.  -6.  26.  28.   2.]
 [ 41.   0.  84.   3.  38.  -4.  43.  45.   2.]
 [ 37.   0. 100.   0.  36.  -8.  63.  64.   2.]
 [ 46.   0.  83.   0.  46.   0.  37.  36.   0.]
 [ 44.   0.  79.   0.  42. -17.  35.  37.

In [None]:
# Report the number of samples and the number of features in each split.
print(f"The size of the training set {len(X_train)} with features {len(X_train[0])}")
# Fixed: this line describes the test split, not the training split.
print(f"The size of the test set {len(X_test)} with features {len(X_test[0])}")

The size of the training set 43500 with features 9
The size of the training set 14500 with features 9


In [None]:
# Distinct class labels found in the training labels (sorted by np.unique).
classLabels = np.unique(Y_train)
# Number of distinct classes.
classes = classLabels.shape[0]
print("No of classes present in the data are",classes)

No of classes present in the data are 7


In [None]:
# Estimate the class-conditional density p(x | C_k) of every class with a
# Gaussian kernel density estimate fitted on that class's training rows.
# The fitted estimators are stored in a dict keyed by class label.
# NOTE(review): the recorded output shows every class selecting bandwidth 1.0,
# the upper edge of the search grid - consider widening the range.
pdf = {}
for label in classLabels:
    classData = X_train[Y_train == label]  # training rows belonging to this class
    # 5-fold cross-validated search over the kernel bandwidth.
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': np.linspace(0.1, 1.0, 10)},
                        cv=5)
    grid.fit(classData)
    bandwidth = grid.best_params_['bandwidth']
    print(bandwidth)
    # GridSearchCV refits the best configuration on all of classData by default
    # (refit=True), so reuse that estimator instead of fitting a second one.
    kde = grid.best_estimator_
    pdf[label] = kde

print(pdf)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
{1.0: KernelDensity(), 2.0: KernelDensity(), 3.0: KernelDensity(), 4.0: KernelDensity(), 5.0: KernelDensity(), 6.0: KernelDensity(), 7.0: KernelDensity()}


In [None]:
# Build an axis-aligned grid over the feature space.
# NOTE(review): grid_points has grid_resolution ** n_features rows (5 ** 9 for
# the 9 features reported above), and is not read by any cell visible in this
# notebook - confirm it is still needed.
grid_resolution = 5  # number of grid values per feature axis

# Per-feature (min - 1, max + 1) range taken from the training data.
feature_ranges = []
for i in range(X_train.shape[1]):
    column = X_train[:, i]
    feature_ranges.append((column.min() - 1, column.max() + 1))

# Evenly spaced values along each feature axis.
grids = [np.linspace(low, high, grid_resolution) for low, high in feature_ranges]

# Cartesian product of the per-axis values, one grid point per row.
meshgrid = np.meshgrid(*grids)
grid_points = np.vstack([axis_values.ravel() for axis_values in meshgrid]).T

In [None]:
# Empirical class priors P(C_k): the fraction of training samples in each class,
# listed in the sorted label order returned by np.unique.
class_counts = []
prior_probabilities = []
for label in np.unique(Y_train):
    n_in_class = np.sum(Y_train == label)
    class_counts.append(n_in_class)
    prior_probabilities.append(n_in_class / len(Y_train))
print(class_counts)
print(prior_probabilities)

[34108, 37, 132, 6748, 2458, 6, 11]
[0.7840919540229885, 0.0008505747126436781, 0.0030344827586206895, 0.1551264367816092, 0.05650574712643678, 0.00013793103448275863, 0.0002528735632183908]


In [None]:
# Step 5: Classify test vectors using the Bayes decision rule (BDR)
#
# For every class k the score of a test vector x is
#     log p(x | C_k) + log P(C_k)
# and x is assigned the class with the largest score.
#
# Changes from the previous per-point loop:
#   * the per-feature grid index lookup was removed - its result was never used;
#   * np.unique(Y_train) is computed once instead of once per test vector;
#   * each class density scores all test vectors in a single call;
#   * priors are paired with labels by position (the order in which
#     prior_probabilities was built) instead of assuming labels are 1..K.
labels = np.unique(Y_train)

# Shape (n_test, n_classes): one column of scores per class.
log_posteriors = np.column_stack([
    pdf[label].score_samples(X_test) + np.log(prior)
    for label, prior in zip(labels, prior_probabilities)
])

# argmax returns the first maximal column, which matches the tie-breaking of
# max() over classes visited in label order.
test_predictions = list(labels[np.argmax(log_posteriors, axis=1)])

In [None]:
# Score the BDR predictions against the true test labels.
accuracy_bdr = accuracy_score(Y_test, test_predictions)
# Error rate is the complement of accuracy.
error_rate_bdr = 1 - accuracy_bdr

# Report both figures as percentages.
print(f"Accuracy: {accuracy_bdr * 100:.2f}%")
print(f"Error Rate: {error_rate_bdr * 100:.2f}%")

Accuracy: 99.88%
Error Rate: 0.12%


In [None]:
# Linear Discriminant Analysis: fit on the training split, predict the test split.
ldaClassifier = LinearDiscriminantAnalysis().fit(X_train, Y_train)
Y_pred = ldaClassifier.predict(X_test)

# Accuracy on the test split and its complement, the error rate.
accuracy_lda = accuracy_score(Y_test, Y_pred)
error_rate_lda = 1 - accuracy_lda

# Report both figures as percentages.
print(f"Accuracy: {accuracy_lda * 100:.2f}%")
print(f"Error Rate: {error_rate_lda * 100:.2f}%")

Accuracy: 94.62%
Error Rate: 5.38%


In [None]:
# Quadratic Discriminant Analysis with reg_param=0.2: fit on the training
# split, predict the test split.
qdaClassifier = QuadraticDiscriminantAnalysis(reg_param=0.2).fit(X_train, Y_train)
Y_pred = qdaClassifier.predict(X_test)

# Accuracy on the test split and its complement, the error rate.
accuracy_qda = accuracy_score(Y_test, Y_pred)
error_rate_qda = 1 - accuracy_qda

# Report both figures as percentages.
print(f"Accuracy: {accuracy_qda * 100:.2f}%")
print(f"Error Rate: {error_rate_qda * 100:.2f}%")

Accuracy: 96.46%
Error Rate: 3.54%




In [None]:
# Side-by-side summary of the three classifiers on the original train/test files.
# Both columns are percentages, so both carry the '%' unit (the error-rate
# column previously omitted it).
print(" Model | Accuracy | Error Rate")
for model_name, accuracy, error_rate in (("BDR", accuracy_bdr, error_rate_bdr),
                                         ("LDA", accuracy_lda, error_rate_lda),
                                         ("QDA", accuracy_qda, error_rate_qda)):
    print(f"  {model_name}  | {accuracy * 100:.2f}%   | {error_rate * 100:.2f}%")

 Model | Accuracy | Error Rate
  BDR  | 99.88%   | 0.12
  LDA  | 94.62%   | 5.38
  QDA  | 96.46%   | 3.54


Checking the accuracies of the LDA and QDA models after merging the two files and shuffling the data

In [None]:
# Re-read both files as plain Python lists of rows.
trainDataset = readData('/content/drive/MyDrive/shuttle.trn')
testDataset =  readData('/content/drive/MyDrive/shuttle.tst')
print(f'length of the train dataset from shuttle.trn file is {len(trainDataset)}')
# Fixed: the test rows come from shuttle.tst, not shuttle.trn.
print(f'length of the test dataset from shuttle.tst file is {len(testDataset)}')

length of the train dataset from shuttle.trn file is 43500
length of the test dataset from shuttle.trn file is 14500


In [None]:
# Append all test rows to the training rows so the combined data can be re-split.
# This extends trainDataset in place, so running this cell again on the same
# list appends the test rows a second time.
trainDataset.extend(testDataset)
# Fixed: separator added between the message text and the number.
print(f'length of the train dataset after merging both files is {len(trainDataset)}')

length of the train dataset after merging both files58000


In [None]:
# Convert the merged rows to a 2-D array, then separate features from labels:
# every column except the last is a feature, the last column is the class label.
trainDataset = np.array(trainDataset)
X, Y = np.array(trainDataset[:, :-1]), np.array(trainDataset[:, -1])

In [None]:
# Shuffle the merged data and hold out 20% as the test split; the fixed
# random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Report the number of samples and the number of features in each split.
print(f"The size of the training set {len(X_train)} with features {len(X_train[0])}")
# Fixed: this line describes the test split, not the training split.
print(f"The size of the test set {len(X_test)} with features {len(X_test[0])}")

The size of the training set 46400 with features 9
The size of the training set 11600 with features 9


In [None]:
# Linear Discriminant Analysis on the shuffled split: fit on the training
# part, predict the held-out part.
ldaClassifier = LinearDiscriminantAnalysis().fit(X_train, Y_train)
Y_pred = ldaClassifier.predict(X_test)

# Accuracy on the held-out part and its complement, the error rate.
accuracy_lda = accuracy_score(Y_test, Y_pred)
error_rate_lda = 1 - accuracy_lda

# Report both figures as percentages.
print(f"Accuracy: {accuracy_lda * 100:.2f}%")
print(f"Error Rate: {error_rate_lda * 100:.2f}%")

Accuracy: 94.24%
Error Rate: 5.76%


In [None]:
# Quadratic Discriminant Analysis (reg_param=0.2) on the shuffled split: fit on
# the training part, predict the held-out part.
qdaClassifier = QuadraticDiscriminantAnalysis(reg_param=0.2).fit(X_train, Y_train)
Y_pred = qdaClassifier.predict(X_test)

# Accuracy on the held-out part and its complement, the error rate.
accuracy_qda = accuracy_score(Y_test, Y_pred)
error_rate_qda = 1 - accuracy_qda

# Report both figures as percentages.
print(f"Accuracy: {accuracy_qda * 100:.2f}%")
print(f"Error Rate: {error_rate_qda * 100:.2f}%")

Accuracy: 95.50%
Error Rate: 4.50%




In [None]:
# Side-by-side summary of LDA and QDA on the shuffled split.
# Both columns are percentages, so both carry the '%' unit (the error-rate
# column previously omitted it).
print(" Model | Accuracy | Error Rate")
for model_name, accuracy, error_rate in (("LDA", accuracy_lda, error_rate_lda),
                                         ("QDA", accuracy_qda, error_rate_qda)):
    print(f"  {model_name}  | {accuracy * 100:.2f}%   | {error_rate * 100:.2f}%")

 Model | Accuracy | Error Rate
  LDA  | 94.24%   | 5.76
  QDA  | 95.50%   | 4.50
