In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from google.colab import drive
from sklearn import preprocessing
import os

COLAB = True

In [40]:
RANDOM_STATE = 30255
NUM_EPOCHS = 3

if COLAB:
  drive.mount('/content/gdrive')
  PATH = "gdrive/Shareddrives/Adv ML Project/Data/"
  df = pd.read_csv(os.path.join(PATH + "preprocessed_data.csv"))

else:
  df = pd.read_csv('../data/preprocessed_data.csv')
  df = df.sample(n=100, random_state=RANDOM_STATE).reset_index()


le = preprocessing.LabelEncoder()
le.fit(df['CLASS'])
df['LABEL'] = le.transform(df['CLASS'])

df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS,BERT_TOKENIZED,SPACY_PREPROCESSED,LABEL
0,The United States Department of Energy Vehicle...,"['33 Advanced Propulsion Systems', '36 Materia...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2142, 2...",united states department energy vehicle techno...,0
1,Solar reflective “cool pavements” have been pr...,"['32 Energy Conservation, Consumption, And Uti...","32 Energy Conservation, Consumption, And Utili...","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 5943, 21346, 1...",solar reflective cool pavement propose potenti...,0
2,Inconel 718 alloy is used extensively in aerog...,"['36 Materials Science', '33 Advanced Propulsi...",33 Advanced Propulsion Systems,"Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 4297, 5643, 2...",inconel alloy extensively aerogas turbine allo...,0
3,The Production Tax Credit (PTC) and the Invest...,"['29 Energy Planning, Policy, And Economy', 'P...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc investment tax credi...,0
4,The production tax credit (PTC) promotes wind ...,"['29 Energy Planning, Policy, And Economy', '1...","29 Energy Planning, Policy, And Economy","Energy Storage, Conversion, and Utilization","{'input_ids': tensor([[ 101, 1996, 2537, 4...",production tax credit ptc promote wind energy ...,0


In [41]:
display(df[['CLASS', 'LABEL']].drop_duplicates())

Unnamed: 0,CLASS,LABEL
0,"Energy Storage, Conversion, and Utilization",0
1223,Environmental Sciences,1
2446,Fission and Nuclear Technologies,2
3669,Fossil Fuels,3
4892,Renewable Energy Sources,4


In [42]:
X = df['SPACY_PREPROCESSED']
y = df['LABEL']

In [43]:
# Create a TF-IDF vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [44]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      train_size=0.7, test_size=0.15, 
                                                      random_state=RANDOM_STATE,
                                                      shuffle=True)

In [45]:
# Create and train the KNN model
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')  # Adjust the number of neighbors (K) as desired
knn.fit(X_train, y_train)


In [46]:
# Make predictions on the test set
y_pred = knn.predict(X_valid)

# Evaluate the model
print(classification_report(y_valid, y_pred))


              precision    recall  f1-score   support

           0       0.63      0.83      0.72       185
           1       0.75      0.73      0.74       182
           2       0.84      0.81      0.82       191
           3       0.73      0.74      0.74       174
           4       0.78      0.56      0.66       186

    accuracy                           0.74       918
   macro avg       0.75      0.74      0.73       918
weighted avg       0.75      0.74      0.73       918



In [55]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 7, 10, 15, 20, 25],  # Example values for n_neighbors
    'weights': ['uniform', 'distance'],
}

# Create the grid search object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred = best_model.predict(X_valid)


In [59]:
print(best_model)

KNeighborsClassifier(n_neighbors=15, weights='distance')


In [56]:
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.84      0.78       185
           1       0.80      0.71      0.76       182
           2       0.84      0.83      0.84       191
           3       0.70      0.86      0.77       174
           4       0.80      0.63      0.70       186

    accuracy                           0.77       918
   macro avg       0.78      0.77      0.77       918
weighted avg       0.78      0.77      0.77       918

