## Process data and generate embedding vectors

In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the sentence-transformers package
!pip install sentence-transformers

In [3]:
# Make the project's helper and model modules importable by putting their
# Drive directories on sys.path (helper first, then models).
import sys

sys.path.extend((
    '/content/drive/MyDrive/Project/helper/',
    '/content/drive/MyDrive/Project/models/',
))

In [4]:
# Importing the required modules
import dataProcessor
import generateEmbedding
import torch
from sklearn import metrics

In [5]:
# Build a DataProcessor for the roft.csv file on Drive and run its processing step.
# Later cells index the result as data_dict[split]['X'] / data_dict[split]['Y'].
ROFT_CSV_PATH = '/content/drive/MyDrive/Project/data/roft.csv'
data_processor = dataProcessor.DataProcessor(ROFT_CSV_PATH)
data_dict = data_processor.process_data()

In [6]:
# Embed the inputs of every split with one SentenceEmbedder instance.
# Splits are processed in train, val, test order, one progress bar per split.
sentence_embedder = generateEmbedding.SentenceEmbedder()
train_X, val_X, test_X = (
    sentence_embedder.generate_embeddings(data_dict[split_name]['X'])
    for split_name in ('train', 'val', 'test')
)

100%|██████████| 7571/7571 [09:55<00:00, 12.71it/s]
100%|██████████| 946/946 [01:13<00:00, 12.96it/s]
100%|██████████| 947/947 [01:12<00:00, 13.11it/s]


In [7]:
# Turn the labels of the three splits into one-hot targets (train, val, test order)
split_labels = [data_dict[split_name]['Y'] for split_name in ('train', 'val', 'test')]
train_Y, val_Y, test_Y = sentence_embedder.onehot_output(*split_labels)

## Single Layer Attention Model

In [8]:
import SingleLayerAttention

# Seed torch's RNG before the model is built so that torch-driven randomness
# (e.g. parameter initialisation) is repeatable across "Restart & Run All".
# NOTE(review): if Trainer shuffles with numpy/random, seed those as well — confirm.
SEED = 42
torch.manual_seed(SEED)

# Creating an instance of the SingleLayerAttention class for classification
# (input_size presumably matches the embedding width produced above — TODO confirm)
AttentionModel = SingleLayerAttention.AttentionClassifier(input_size=768, hidden_size=128, num_classes=10)

# Defining the optimizer for training the model (Adam with a small L2 weight decay)
optimizer = torch.optim.Adam(AttentionModel.parameters(), lr=0.001, weight_decay=1e-5)

# Creating an instance of the Trainer class and training the model.
# NOTE(review): only training metrics are logged here; the later evaluation shows
# a large train/test gap, so monitoring validation data during training is advisable.
trainer = SingleLayerAttention.Trainer(AttentionModel, optimizer)
trainer.train(train_X, train_Y, num_epochs=100, batch_size=10)

Epoch 1/100: train_loss=2.2557 train_acc=0.1651
Epoch 2/100: train_loss=2.1582 train_acc=0.2091
Epoch 3/100: train_loss=2.1106 train_acc=0.2318
Epoch 4/100: train_loss=2.0649 train_acc=0.2528
Epoch 5/100: train_loss=2.0164 train_acc=0.2754
Epoch 6/100: train_loss=1.9621 train_acc=0.2964
Epoch 7/100: train_loss=1.9015 train_acc=0.3153
Epoch 8/100: train_loss=1.8344 train_acc=0.3416
Epoch 9/100: train_loss=1.7611 train_acc=0.3698
Epoch 10/100: train_loss=1.6833 train_acc=0.3964
Epoch 11/100: train_loss=1.6032 train_acc=0.4257
Epoch 12/100: train_loss=1.5262 train_acc=0.4550
Epoch 13/100: train_loss=1.4616 train_acc=0.4805
Epoch 14/100: train_loss=1.4000 train_acc=0.5071
Epoch 15/100: train_loss=1.3599 train_acc=0.5236
Epoch 16/100: train_loss=1.3075 train_acc=0.5467
Epoch 17/100: train_loss=1.2495 train_acc=0.5644
Epoch 18/100: train_loss=1.2063 train_acc=0.5813
Epoch 19/100: train_loss=1.1749 train_acc=0.5942
Epoch 20/100: train_loss=1.1401 train_acc=0.6056
Epoch 21/100: train_loss=1.09

In [9]:
# Predict on the train, test and validation sets and report the accuracy of each.
# eval() puts layers such as dropout/batch-norm (if the model has any) into
# inference behaviour, and no_grad() avoids building autograd graphs that
# prediction does not need.
AttentionModel.eval()
with torch.no_grad():
    for split_name, features, labels in (
        ('Train', train_X, train_Y),
        ('Test', test_X, test_Y),
        ('Validation', val_X, val_Y),
    ):
        outputs = AttentionModel(torch.Tensor(features).to(trainer.device))
        prediction = outputs.argmax(dim=1).cpu().numpy()
        # accuracy_score takes (y_true, y_pred); the targets are one-hot, so
        # argmax over dim 1 recovers the class index.
        print(f'{split_name} Accuracy = {metrics.accuracy_score(labels.argmax(dim=1), prediction):.4f}')

Train Accuracy = 0.8423
Test Accuracy = 0.1658
Validation Accuracy = 0.2030


## Tree based models on cosine similarity values for consecutive embeddings

In [6]:
# Build a SentenceEmbedder and compute the cosine-similarity features of
# consecutive embeddings for each split (train, val, test order).
sentence_embedder = generateEmbedding.SentenceEmbedder()
train_X_cosine, val_X_cosine, test_X_cosine = (
    sentence_embedder.generate_running_embedding(data_dict[split_name]['X'])
    for split_name in ('train', 'val', 'test')
)

100%|██████████| 7571/7571 [1:00:39<00:00,  2.08it/s]
100%|██████████| 946/946 [07:47<00:00,  2.02it/s]
100%|██████████| 947/947 [07:38<00:00,  2.06it/s]


In [7]:
# Labels for the cosine-similarity models, taken unchanged from the processed splits
train_Y_cosine, val_Y_cosine, test_Y_cosine = (
    data_dict[split_name]['Y'] for split_name in ('train', 'val', 'test')
)

In [69]:
# Apply eliminate_zero_boundry to each split; per the original note this filters
# out data points where y is less than or equal to 0.
(
    (train_X_filtered, train_Y_filtered),
    (test_X_filtered, test_Y_filtered),
    (val_X_filtered, val_Y_filtered),
) = (
    sentence_embedder.eliminate_zero_boundry(split_X, split_Y)
    for split_X, split_Y in (
        (train_X_cosine, train_Y_cosine),
        (test_X_cosine, test_Y_cosine),
        (val_X_cosine, val_Y_cosine),
    )
)

In [70]:
import TreeBasedModels

# Fit the random forest on the filtered training data, then report accuracy
# on the train, test and validation splits in that order.
rf = TreeBasedModels.RandomForest(n_estimators=500, max_depth=7)
rf.fit(train_X_filtered, train_Y_filtered)

for split_label, split_X, split_Y in (
    ('Train', train_X_filtered, train_Y_filtered),
    ('Test', test_X_filtered, test_Y_filtered),
    ('Validation', val_X_filtered, val_Y_filtered),
):
    prediction = rf.predict(split_X)
    print(f'{split_label} Accuracy = {metrics.accuracy_score(prediction, split_Y):.4f}')

Train Accuracy = 0.6578
Test Accuracy = 0.1819
Validation Accuracy = 0.1800


In [72]:
# Fit the XGBoost wrapper on the filtered training data, then report accuracy
# on the train, test and validation splits in that order.
xgb = TreeBasedModels.XGBoost(n_estimators=500, max_depth=7,learning_rate=0.01)
xgb.fit(train_X_filtered, train_Y_filtered)

evaluation_splits = (
    ('Train', train_X_filtered, train_Y_filtered),
    ('Test', test_X_filtered, test_Y_filtered),
    ('Validation', val_X_filtered, val_Y_filtered),
)
for split_label, split_X, split_Y in evaluation_splits:
    prediction = xgb.predict(split_X)
    split_accuracy = metrics.accuracy_score(prediction, split_Y)
    print(f'{split_label} Accuracy = {split_accuracy:.4f}')

Train Accuracy = 0.6092
Test Accuracy = 0.1981
Validation Accuracy = 0.1916
