<a href="https://colab.research.google.com/github/kaledai069/Answer-Validity-Checker-with-Word-Vectorizer-Neural-Nets/blob/master/Alternative_Solution_Ranker_Training_Neural_Net_Head.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
!pip install -q h5py

In [2]:
import pandas as pd
import numpy as np
import time
import os
import re
import torch
import ast
import h5py

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [3]:
DATASET_PATH = "/content/gdrive/MyDrive/answer_dataset.h5"

with h5py.File(DATASET_PATH, 'r') as hdf:
    features_loaded = hdf['Embedding'][:]
    target_loaded = hdf['Label'][:]

In [4]:
features_loaded.shape, target_loaded.shape

((2971683, 128), (2971683,))

In [5]:
# seperating +ve and -ve features
positive_samples = features_loaded[target_loaded == 1]
negative_samples = features_loaded[target_loaded == 0]

'''
A testing section hereby commences
'''
# negative_samples = negative_samples[:len(positive_samples)]

# train, validation and test set split for the negative samples
# splitting negative samples in the ratio 70-15-15
train_neg, temp_neg = train_test_split(negative_samples, test_size = 0.1, random_state = 69)
valid_neg, test_neg = train_test_split(temp_neg, test_size = 0.5, random_state = 69)

# train, validation and test set split for the positive samples
# splitting positive samples in the ratio of 90-5-5
train_posi, temp_posi = train_test_split(positive_samples, test_size = 0.1, random_state = 69)
valid_posi, test_posi = train_test_split(temp_posi, test_size = 0.5, random_state = 69)

# preparing features vertical stack with positive and negative ratios
train_features = np.vstack([train_neg, train_posi])
valid_features = np.vstack([valid_neg, valid_posi])
test_features = np.vstack([test_neg, test_posi])

# target labels for negative and positive samples correspondingly
train_labels = np.concatenate([np.zeros(len(train_neg)), np.ones(len(train_posi))])
valid_labels = np.concatenate([np.zeros(len(valid_neg)), np.ones(len(valid_posi))])
test_labels = np.concatenate([np.zeros(len(test_neg)), np.ones(len(test_posi))])

# obvious shuffle
shuffle_train = np.random.permutation(len(train_features))
shuffle_valid = np.random.permutation(len(valid_features))
shuffle_test = np.random.permutation(len(test_features))

# Actual Train-Validation-Test Split
X_train = torch.Tensor(train_features[shuffle_train])
Y_train = torch.FloatTensor(train_labels[shuffle_train])

X_valid = torch.Tensor(valid_features[shuffle_valid])
Y_valid = torch.FloatTensor(valid_labels[shuffle_valid])

X_test = torch.Tensor(test_features[shuffle_test])
Y_test = torch.FloatTensor(test_labels[shuffle_test])

In [6]:
BATCH_SIZE = 64

# GPU intensive training support
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data loaded to the active device (CPU or GPU)
X_train_tensor, y_train_tensor = X_train.to(device), Y_train.to(device)
X_valid_tensor, y_valid_tensor = X_valid.to(device), Y_valid.to(device)

# tensor dataset loader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

# dataLoader Module
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
valid_loader = DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle = False)

In [7]:
# handling negative sample skewness with BCELogitLoss (pos_weight)
num_positive_samples = len(y_train_tensor[y_train_tensor == 1])
num_negative_samples = len(y_train_tensor[y_train_tensor == 0])

total_num_samples = num_positive_samples + num_negative_samples

weight_for_positive = total_num_samples / (2 * num_positive_samples)
weight_for_negative = total_num_samples / (2 * num_negative_samples)

class_weights = torch.tensor([weight_for_negative], dtype = torch.float32)
print(class_weights)

tensor([3.1997])


In [10]:
# custom neural net with
class AnswerValidator(nn.Module):
    def __init__(self):
        super(AnswerValidator, self).__init__()
        self.fc1 = nn.Linear(128, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 16)
        self.relu3 = nn.ReLU()
        self.output = nn.Linear(16, 1)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.output(x)
        return x

model = AnswerValidator().to(device)

# Binary cross entropy (BCE) as the loss function with optmizer used as Adam
criterion = nn.BCEWithLogitsLoss(pos_weight = class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr = 0.001)

print("Total no. of parameters in the model: ", sum(p.numel() for p in model.parameters()))

Total no. of parameters in the model:  10881


In [11]:
# Main train_loop
num_epochs = 10
train_eval_interval = 1000
valid_eval_interval = 50000

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_train = 0
    train_data_count = 0

    for i, (inputs, labels) in enumerate(train_loader, 1):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        # Calculate outputs and apply sigmoid
        outputs = torch.sigmoid(model(inputs)).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # print("Output: ", outputs, "\nLabels: ", labels)
        # break

        predictions = outputs > 0.5
        correct_train += torch.sum(predictions == labels.byte()).item()
        train_data_count += len(inputs)

        # Compute training accuracy every train_eval_interval iterations
        if i % train_eval_interval == 0:
            avg_train_loss = total_loss / train_eval_interval

            print(f'Iteration {i}, '
                  f'\t Training Loss: {avg_train_loss:.6f}, \t Training Accuracy: {correct_train / train_data_count:.6f}')

            total_loss = 0.0
            # correct_train = 0

        # Validation every valid_eval_interval iterations
        if i % valid_eval_interval == 0:
            model.eval()
            with torch.no_grad():
                valid_loss = 0.0
                correct_valid = 0
                for inputs, labels in valid_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = torch.sigmoid(model(inputs)).squeeze(1)
                    valid_loss += criterion(outputs, labels).item()

                    # Compute validation accuracy
                    predictions = outputs > 0.5
                    correct_valid += torch.sum(predictions == labels.byte()).item()

                avg_valid_loss = valid_loss / len(valid_loader)
                accuracy_valid = correct_valid / len(valid_dataset)

                print(f'\n VALIDATION: Iteration {i}, '
                      f'Validation Loss: {avg_valid_loss:.6f}, \t Validation Accuracy: {accuracy_valid:.6f}\n')

            model.train()

    # Final epoch summary
    print(f'Final Epoch {num_epochs}, '
          f'Training Loss: {avg_train_loss:.4f}, Training Accuracy: {correct_train / (i * len(inputs)):.4f}')


Iteration 1000, 	 Training Loss: 0.920072, 	 Training Accuracy: 0.847437
Iteration 2000, 	 Training Loss: 0.904891, 	 Training Accuracy: 0.848219
Iteration 3000, 	 Training Loss: 0.902406, 	 Training Accuracy: 0.849292
Iteration 4000, 	 Training Loss: 0.905682, 	 Training Accuracy: 0.849164
Iteration 5000, 	 Training Loss: 0.902933, 	 Training Accuracy: 0.849550
Iteration 6000, 	 Training Loss: 0.898696, 	 Training Accuracy: 0.850193
Iteration 7000, 	 Training Loss: 0.904459, 	 Training Accuracy: 0.849672
Iteration 8000, 	 Training Loss: 0.897455, 	 Training Accuracy: 0.850209
Iteration 9000, 	 Training Loss: 0.903156, 	 Training Accuracy: 0.850175
Iteration 10000, 	 Training Loss: 0.901691, 	 Training Accuracy: 0.850181
Iteration 11000, 	 Training Loss: 0.900223, 	 Training Accuracy: 0.850190
Iteration 12000, 	 Training Loss: 0.899658, 	 Training Accuracy: 0.850273
Iteration 13000, 	 Training Loss: 0.896487, 	 Training Accuracy: 0.850623
Iteration 14000, 	 Training Loss: 0.893549, 	 T

In [13]:
model.eval()
with torch.no_grad():
    valid_loss = 0.0
    correct_valid = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = torch.sigmoid(model(inputs)).squeeze(1)
        valid_loss += criterion(outputs, labels).item()

        # Compute validation accuracy
        predictions = outputs > 0.5
        correct_valid += torch.sum(predictions == labels.byte()).item()

    avg_valid_loss = valid_loss / len(train_loader)
    accuracy_valid = correct_valid / len(train_dataset)

    print(f'\n VALIDATION: Iteration {i}, '
          f'Validation Loss: {avg_valid_loss:.6f}, \t Validation Accuracy: {accuracy_valid:.6f}\n')


 VALIDATION: Iteration 41790, Validation Loss: 0.881030, 	 Validation Accuracy: 0.863608



In [14]:
!pip install -q sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import sentencepiece as spm
from gensim.models import Word2Vec

sp_model = spm.SentencePieceProcessor()
sp_model.Load("/content/sp_model.model")
word_vec_model = Word2Vec.load("/content/word2vec_model")

In [16]:
def generate_answer_embedding(answer, sp_model, word_vec_model):
  test_sub_words = sp_model.EncodeAsPieces(answer)
  # just a part of preprocessing step
  if len(test_sub_words[0]) > 1:
    test_sub_words[0] = test_sub_words[0][1:]
  elif len(test_sub_words[0]) == 1:
    test_sub_words = test_sub_words[1:]

  embeddings = [word_vec_model.wv[word] for word in test_sub_words if word in word_vec_model.wv]
  if embeddings:
    final_word_vector = np.mean(embeddings, axis=0)
  else:
    final_word_vector = np.zeros(100)

  return final_word_vector

In [26]:
testing_words = ["pew", 'adios', 'twist', 'ere', 'mob', 'onset', 'thee', 'tree', 'otis', 'dst', 'quackremedy']

for word in testing_words:
  input_feature = torch.Tensor(generate_answer_embedding(word, sp_model, word_vec_model))

  model.eval().to("cpu")
  with torch.no_grad():
    output = model(input_feature)
    output_probabilities = torch.sigmoid(output)
    predictions = (output_probabilities > 0.5).float()
    print(predictions)

tensor([0.])
tensor([0.])
tensor([1.])
tensor([0.])
tensor([1.])
tensor([1.])
tensor([0.])
tensor([1.])
tensor([0.])
tensor([0.])
tensor([0.])


# Conclusion to this approach:
1. Obviously it didn't work
2. The main flaw with this approach could be the use of sub-word tokenization, and then using word2vec to find the vectors for each of the sub-word, then using mean value as the input to the neural network

In [None]:
torch.save(model.state_dict(), '/content/answer_ranker.model')

In [None]:
# main train_loop
num_epochs = 2
eval_interval = 500

train_eval_interval = 100
valid_eval_interval = 500

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_train = 0
    for i, (inputs, labels) in enumerate(train_loader, 1):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs).squeeze(dim = 1)
        outputs = torch.sigmoid(model(inputs)).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Compute training accuracy every eval_interval iterations
        if i % eval_interval == 0:
            predictions = torch.sigmoid(outputs) > 0.5
            correct_train += torch.sum(predictions == labels.byte()).item()

    # Validation
    model.eval()
    with torch.no_grad():
        valid_loss = 0.0
        correct_valid = 0
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs).squeeze(dim = 1)
            outputs = torch.sigmoid(model(inputs)).squeeze(1)
            valid_loss += criterion(outputs, labels.squeeze(dim = 1)).item()

            # Compute validation accuracy
            predictions = torch.sigmoid(outputs) > 0.5
            correct_valid += torch.sum(predictions == labels.byte()).item()

    avg_train_loss = total_loss / len(train_loader)
    avg_valid_loss = valid_loss / len(valid_loader)
    accuracy_train = correct_train / len(train_dataset)
    accuracy_valid = correct_valid / len(valid_dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}, '
          f'Training Loss: {avg_train_loss:.4f}, Training Accuracy: {accuracy_train:.4f}, '
          f'Validation Loss: {avg_valid_loss:.4f}, Validation Accuracy: {accuracy_valid:.4f}')