<a href="https://colab.research.google.com/github/kaledai069/Answer-Validity-Checker-with-Word-Vectorizer-Neural-Nets/blob/master/Alternative_Solution_Ranker_Vectorizer_Trainer_%26_Dataset_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install -q sentencepiece
!pip install -q pyspellchecker
!pip install -q h5py

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Important Imports

In [22]:
import pandas as pd
import re
import numpy as np
import os
import time
import sentencepiece as spm
import random
import string
import nltk
import h5py

from gensim.models import Word2Vec
from spellchecker import SpellChecker
from nltk.corpus import treebank, brown
from tqdm import tqdm

tqdm.pandas()
nltk.download('brown')
nltk.download('treebank')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

## Preparing Answer-Dataset

In [4]:
answer_list_path = "/content/gdrive/MyDrive/First Pass Model/all_answer_list.tsv"

with open(answer_list_path, 'r') as f:
  lines = f.readlines()

answer_list = []
for i in tqdm(range(len(lines)), ncols = 100):
  pattern_omitted_word = re.sub('[~0-9\n]', '', re.sub(r'\t""""""\t', '', lines[i]))
  answer_format = pattern_omitted_word.replace(' ', '')
  answer_list.append(answer_format)

100%|███████████████████████████████████████████████████| 458929/458929 [00:01<00:00, 251060.91it/s]


In [None]:
with open("/content/corpus.txt", 'w') as f:
  f.write("\n".join(answer_list))

### Training Sentence-Piece Tokenizer for sub-word recognization

In [5]:
# training sub-word tokenizer
# sp = spm.SentencePieceTrainer.Train("--input=/content/corpus.txt --model_prefix=sp_model --vocab_size=3000")

# loding trained sp-model for sub-word tokenized words
sp_model = spm.SentencePieceProcessor()
sp_model.Load("/content/sp_model.model")

True

#### Post-processing output string from the SP model

In [None]:
sub_word_answers_list = []
for answer in tqdm(answer_list, ncols = 100):
  sub_words = sp_model.EncodeAsPieces(answer)
  try:
    if len(sub_words[0]) > 1:
      sub_words[0] = sub_words[0][1:]
    elif len(sub_words[0]) == 1:
      sub_words = sub_words[1:]
    sub_word_answers_list.append(sub_words)
  except IndexError:
    print(answer, sub_words)

print(len(sub_word_answers_list))

 22%|███████████▍                                       | 103150/458929 [00:00<00:03, 106615.96it/s]

 []


 28%|██████████████                                     | 126963/458929 [00:01<00:03, 110480.82it/s]

 []


100%|███████████████████████████████████████████████████| 458929/458929 [00:04<00:00, 107975.31it/s]

458927





In [6]:
def generate_answer_embedding(answer, sp_model, word_vec_model):
  test_sub_words = sp_model.EncodeAsPieces(answer)
  # just a part of preprocessing step
  if len(test_sub_words[0]) > 1:
    test_sub_words[0] = test_sub_words[0][1:]
  elif len(test_sub_words[0]) == 1:
    test_sub_words = test_sub_words[1:]

  embeddings = [word_vec_model.wv[word] for word in test_sub_words if word in word_vec_model.wv]
  if embeddings:
    final_word_vector = np.mean(embeddings, axis=0)
  else:
    final_word_vector = np.zeros(100)

  return final_word_vector

In [7]:
word_vec_model = Word2Vec.load("/content/word2vec_model")

#### Partial-Dataset with only Positive-Answer listing

In [28]:
dataset_as_list = []
for i, answer in tqdm(enumerate(answer_list), ncols = 100):
  try:
    dataset_as_list.append((answer, generate_answer_embedding(answer, sp_model, word_vec_model), 1))
  except IndexError:
    print(answer)

83162it [00:03, 25107.25it/s]




111446it [00:04, 24479.52it/s]




458929it [00:19, 23028.49it/s]


In [29]:
df = pd.DataFrame(dataset_as_list, columns = ["answer", "embedding", "label"])
df.to_csv("/content/partial_dataset.csv")

## Fetching Unique Words from mini-corpus
- 'brown'
- 'treebank'
- 'wiki-2'

In [8]:
def is_all_alphabet(input_string):
    pattern = '^[a-z]+$'
    match = re.match(pattern, input_string.lower())
    return match is not None

#### Using 'Penn Treebank', 'Brown' & 'Wiki-2' Corpus to build unique word repo to be added as positive samples

In [9]:
# fetching all the words from the treebank corpus
tagged_words = treebank.tagged_words()

words_list = []
for (word, tag) in tagged_words:
  hypen_words = word.split('-')
  if len(hypen_words) > 1:
    for hy_word in hypen_words:
      if is_all_alphabet(hy_word):
        words_list.append(hy_word.lower())
  else:
    if is_all_alphabet(word):
      words_list.append(word.lower())

words_list = [word for word in words_list if len(word) >= 3 and len(word) < 28]

In [10]:
# fetching all the words from the brown corpus
categories = brown.categories()
brown_words_list = []
for category in categories:
  sentences = brown.sents(categories = category)
  for sentence in sentences:
    for word in sentence:
      if is_all_alphabet(word):
        brown_words_list.append(word.lower())

brown_words_list = [word for word in brown_words_list if len(word) >= 3 and len(word) < 28]
all_words_list = words_list + brown_words_list
print("Total number of words: ", len(all_words_list))

Total number of words:  846819


In [11]:
words_df = pd.DataFrame(all_words_list, columns = ['Word'])
words_df.drop_duplicates(subset = 'Word', keep = 'first', inplace = True)
print("Total number of unique words from 'brown' & 'treebank'", len(words_df))

Total number of unique words from 'brown' & 'treebank' 41780


In [14]:
def fetch_words_from_set(set_path):
  wiki_words_list = []
  with open(set_path, 'r') as file:
    all_lines = file.readlines()

  word_count = 0
  for line in all_lines:
    all_words = line.split(' ')
    for word in all_words:
      if is_all_alphabet(word):
        wiki_words_list.append(word.lower())

  wiki_words_list = [word for word in wiki_words_list if len(word) >= 3 and len(word) < 28]
  return wiki_words_list

wiki_train_list = fetch_words_from_set("/content/wiki.train.raw")
wiki_valid_list = fetch_words_from_set("/content/wiki.valid.raw")
wiki_test_list = fetch_words_from_set("/content/wiki.test.raw")
all_wiki_words = wiki_train_list + wiki_valid_list + wiki_test_list

print("Total number of words in the 'wiki-2': ", len(all_wiki_words))

Total number of words in the 'wiki-2':  1662599


In [15]:
all_words_list += all_wiki_words
words_df = pd.DataFrame(all_words_list, columns = ['Word'])
words_df.drop_duplicates(subset = 'Word', keep = 'first', inplace = True)
print("Total number of unique words from 'brown', 'treebank' & 'wiki-2' Dataset: ", len(words_df))

# words_df.to_csv("/content/unique_answer_list.txt", header=False, index=False)

Total number of unique words from 'brown', 'treebank' & 'wiki-2' Dataset:  81181


## Postive and Negative sample generation

In [16]:
# modification to be applied to positive answer to generate negative answers

def replace_random_chars(input_string, num_chars_to_replace):
    positions_to_replace = random.sample(range(len(input_string)), num_chars_to_replace)

    replaced_string = list(input_string)
    for position in positions_to_replace:
        replaced_string[position] = random.choice(string.ascii_letters.lower())

    return ''.join(replaced_string)

# random character omission from the input string

def remove_random_chars(input_string, num_chars_to_remove):
  positions_to_remove = random.sample(range(len(input_string)), 1)

  for i, pos in enumerate(positions_to_remove):
    input_string = input_string[:pos - i] + input_string[pos - i + 1 :]

  return input_string

In [17]:
def check_exist_of_word(word, ans_list):
  return word in ans_list

In [18]:
# checking if the answer is a single valid word

spell = SpellChecker()

dataset_list = []

for answer in tqdm(answer_list, ncols = 120):
  if answer != '':
    # original single answer
    dataset_list.append((answer, generate_answer_embedding(answer, sp_model, word_vec_model), 1))

    is_valid_word = spell.known([answer])

    # generating negative answers with single valid words
    if is_valid_word:
      # get two negative answers for single valid word
      for _ in range(2):
        if len(answer) >= 10:
          # concurrent two letter replacement
          neg_answer_with_replace = replace_random_chars(answer, 2 )

        else:
          # a single letter replacement
          neg_answer_with_replace = replace_random_chars(answer, 1)
        dataset_list.append((neg_answer_with_replace, generate_answer_embedding(neg_answer_with_replace, sp_model, word_vec_model), 0))

      if len(answer) > 5:
        neg_answer_with_omission = remove_random_chars(answer, 1)
        dataset_list.append((neg_answer_with_omission, generate_answer_embedding(neg_answer_with_omission, sp_model, word_vec_model), 0))

    # generate negative answers with unsegmented answers
    else:
      neg_answer_with_replace_list = []
      neg_answer_with_omission_list = []

      for _ in range(2):
        if len(answer) >= 15:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 3))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 2))

        if len(answer) >= 10:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 2))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 1))

        if len(answer) >= 5:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 1))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 1))

      for neg_answer in neg_answer_with_replace_list + neg_answer_with_omission_list:
        dataset_list.append((neg_answer, generate_answer_embedding(neg_answer, sp_model, word_vec_model), 0))

100%|█████████████████████████████████████████████████████████████████████████| 458929/458929 [02:54<00:00, 2631.82it/s]


In [19]:
for word in tqdm(words_df['Word'], ncols = 120):
  dataset_list.append((word, generate_answer_embedding(word, sp_model, word_vec_model), 1))

100%|██████████████████████████████████████████████████████████████████████████| 81181/81181 [00:02<00:00, 27802.65it/s]


In [20]:
dataset_df = pd.DataFrame(dataset_list, columns = ['Answer', 'Embedding', 'Label'])

print("Dataset size before removing duplicates: ", len(dataset_df))
dataset_df.drop_duplicates(subset = 'Answer', keep = 'first', inplace = True)
print("Dataset size after removing duplicates: ", len(dataset_df))

# dataset_df.to_csv("/content/answer dataset.csv")

Dataset size before removing duplicates:  3308843
Dataset size after removing duplicates:  2971683


In [48]:
features_data = np.array(dataset_df['Embedding'].tolist())
target_data = np.array(dataset_df['Label'].tolist())

assert features_data.shape[0] == target_data.shape[0]

In [52]:
with h5py.File('/content/answer_dataset.h5', 'w') as hdf:
    hdf.create_dataset('Embedding', data = features_data)
    hdf.create_dataset('Label', data = target_data)

#### Copying final-dataset to the G-Drive

In [53]:
import shutil
src_path = "/content/answer_dataset.h5"
dest_path = "/content/gdrive/MyDrive"
shutil.copy(src_path, dest_path)

'/content/gdrive/MyDrive/answer_dataset.h5'