<a href="https://colab.research.google.com/github/linssen2/ECE539-Solubility-Identifier-Project/blob/main/11_16_2023_KNN_Classifier_gen2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from google.colab import drive
import csv
import torch
from sklearn.model_selection import train_test_split
np.set_printoptions(precision=3,suppress=True)

#Read in data
drive.mount('/content/drive/')
with open('/content/drive/MyDrive/curated-solubility-dataset.csv', 'r') as f:
    l = csv.reader(f)
    data = np.array([list(filter(None,i)) for i in l])
    X = data[1:,9:-1].astype(np.double)
    y = data[1:,5].astype(np.double)

#Convert to labels to value (0 = Not Soluable, 1 = Slightly Soluable, 2 = Soluable)
y = [0 if val < -3.65 else 1 if -1.69 < val < 1 else 2 for val in y]
y = np.asarray(y)

#Partition
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = .7, shuffle=True )
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size = .5, shuffle=True)

X_train = np.asfarray(X_train)
X_val = np.asfarray(X_val)
X_test = np.asfarray(X_test)

#Dataset and label features
print('num_samples, num_features', X.shape)
print('num_labels', y.shape)
print('labels', np.unique(y))

#Paritioned data features
print("Training shape: ", str(X_train.shape))
print("Validation shape: ", str(X_val.shape))
print("Testing shape: ", str(X_test.shape))

Mounted at /content/drive/
num_samples, num_features (9982, 19)
num_labels (9982,)
labels [0 1 2]
Training shape:  (6987, 19)
Validation shape:  (1497, 19)
Testing shape:  (1498, 19)


In [2]:
#Test different number of neighbors each knn model for each
from sklearn.neighbors import KNeighborsClassifier

models = {}
for i in [1,3,5,10,15,25,50,75,100,250,500,1000]:
  print("# of neighbors: ", i)
  knn = KNeighborsClassifier(n_neighbors=i)
  models[i] = knn.fit(X_train, y_train)

# of neighbors:  1
# of neighbors:  3
# of neighbors:  5
# of neighbors:  10
# of neighbors:  15
# of neighbors:  25
# of neighbors:  50
# of neighbors:  75
# of neighbors:  100
# of neighbors:  250
# of neighbors:  500
# of neighbors:  1000


In [3]:
# Accuracy on each model with varying nneigs
import sklearn.metrics as metrics

for i in models:
  y_predict = models[i].predict(X_val)
  acc = metrics.accuracy_score(y_val, y_predict)
  print(f'Classification Rate using {i} neighbors: {acc*100:.2f}%')

Classification Rate using 1 neighbors: 68.47%
Classification Rate using 3 neighbors: 68.27%
Classification Rate using 5 neighbors: 68.00%
Classification Rate using 10 neighbors: 68.20%
Classification Rate using 15 neighbors: 68.60%
Classification Rate using 25 neighbors: 67.74%
Classification Rate using 50 neighbors: 65.93%
Classification Rate using 75 neighbors: 64.46%
Classification Rate using 100 neighbors: 63.39%
Classification Rate using 250 neighbors: 59.79%
Classification Rate using 500 neighbors: 58.32%
Classification Rate using 1000 neighbors: 55.85%


In [4]:
# Train and evaluate best model on X_train and X_val
nneigs = 3

X_trainval = np.concatenate((X_train, X_val), 0)
y_trainval = np.concatenate((y_train, y_val), 0)

knn = KNeighborsClassifier(n_neighbors=nneigs)
model = knn.fit(X_trainval, y_trainval)
y_pred = model.predict(X_test)

acc = metrics.accuracy_score(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)
print(f'Classification Rate of {nneigs} neighbors: {acc*100:.2f}%\n')
print(f'Confusion Matrix of {nneigs} neighbors:')
print(cm)

Classification Rate of 3 neighbors: 69.03%

Confusion Matrix of 3 neighbors:
[[429  15 102]
 [ 49 314  98]
 [130  70 291]]
