In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

import os
import time 
import json 
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
import pandas as pd

In [20]:
class Vocabulary(object):
    def __init__(self, lil_tokens, max_vocab_size=10_000):
        self.lil_tokens = lil_tokens
        self.max_vocab_size = max_vocab_size
        self.tokens = []
        self.ids = {}

        # add special tokens 
        self.tokens.append('<bos>')
        self.tokens.append('<eos>')
        self.tokens.append('<pad>')
        self.tokens.append('<unk>')

        # add all the tokens 
        self.build_vocab()
    
    def build_vocab(self):
        all_tokens = [token for l_tokens in self.lil_tokens for token in l_tokens]
        counter = Counter(all_tokens)
        most_common = counter.most_common(self.max_vocab_size - len(self.tokens))
        self.tokens += [item[0] for item in most_common]
        self.ids = {token: id for id, token in enumerate(self.tokens)}

    def get_id(self, w):
        return self.ids[w]

    def get_token(self, id):
        return self.tokens[id]

    def decode_idx2token(self, list_id):
        return [self.tokens[i] for i in list_id]

    def encode_token2idx(self, list_token):
        return [self.ids[tok] if tok in self.ids else '<unk>' for tok in list_token]

    def __len__(self):
        return len(self.tokens)

def read_data(file_path):
    with open(file_path, 'r') as f:
        dataset = []
        for line in f:
            dataset.append(line.strip().split(' '))
    return dataset

def token2index_dataset(dataset_lil, vocab):
    index_lil = []
    for data in dataset_lil:
        data = ['<bos>'] + data + ['<eos>']
        index_data = vocab.encode_token2idx(data)
        index_lil.append(index_data)

    return index_lil

def load_qa_data(file_path):
    # read data 
    answer_lil = read_data(file_path['source'])
    question_lil = read_data(file_path['target'])

    # save list of words 
    main_df = pd.DataFrame()
    main_df['source_data'] = answer_lil
    main_df['target_data'] = question_lil

    # build dictionary for source and target 
    answer_vocab = Vocabulary(answer_lil, 45_000)
    question_vocab = Vocabulary(question_lil, 28_000)

    # convert words to idx for each dataset
    main_df['source_indized'] = token2index_dataset(answer_lil, answer_vocab)
    main_df['target_indized'] = token2index_dataset(question_lil, question_vocab)

    return main_df, answer_vocab, question_vocab


class QAPair(Dataset):
    def __init__(self, file_path):
        self.main_df, self.answer_vocab, self.question_vocab = load_qa_data(file_path)

    def __len__(self):
        return len(self.main_df)

    def __getitem__(self, idx):
        return (self.main_df.iloc[idx]['source_indized'], self.main_df.iloc[idx]['target_indized'])

Fake data test

In [12]:
fake_data = [['i', 'love', 'you'], ['you', 'hate', 'me'], ['you', 'love', 'and', 'hate', 'him']]

fake_vocab = Vocabulary(fake_data, 8)
print(fake_vocab.tokens)
print(fake_vocab.ids)
print(fake_vocab.encode_token2idx(['i', 'love' , 'you', 'so', 'much']))

['<bos>', '<eos>', '<pad>', '<unk>', 'you', 'love', 'hate', 'i']
{'<bos>': 0, '<eos>': 1, '<pad>': 2, '<unk>': 3, 'you': 4, 'love': 5, 'hate': 6, 'i': 7}
[7, 5, 4, '<unk>', '<unk>']


In [16]:
fake_pairs = [
    (['this', 'is', 'answer'], ['what', 'is', 'the', 'question']),
    (['i', 'do', 'not', 'know'], ['when', 'are', 'assignments', 'over' ]),
]

ans_vocab, q_vocab = load_qa_pairs(fake_pairs)

In [19]:
q_vocab.tokens

['<bos>',
 '<eos>',
 '<pad>',
 '<unk>',
 'what',
 'is',
 'the',
 'question',
 'when',
 'are',
 'assignments',
 'over']

In [4]:
ans = [[1,2,3], [4,5,6]]
qs = [[1,2,3,4], [5,6,7,8]]

print(type(zip(ans, qs)))

<class 'zip'>


In [13]:
data = read_data('data/processed/tgt-test.txt')
data

[['to',
  'whom',
  'did',
  'the',
  'virgin',
  'mary',
  'allegedly',
  'appear',
  'in',
  '1858',
  'in',
  'lourdes',
  'france',
  '?'],
 ['what',
  'is',
  'in',
  'front',
  'of',
  'the',
  'notre',
  'dame',
  'main',
  'building',
  '?'],
 ['the',
  'basilica',
  'of',
  'the',
  'sacred',
  'heart',
  'at',
  'notre',
  'dame',
  'is',
  'beside',
  'to',
  'which',
  'structure',
  '?'],
 ['what', 'is', 'the', 'grotto', 'at', 'notre', 'dame', '?'],
 ['what',
  'sits',
  'on',
  'top',
  'of',
  'the',
  'main',
  'building',
  'at',
  'notre',
  'dame',
  '?'],
 ['when',
  'did',
  'the',
  'scholastic',
  'magazine',
  'of',
  'notre',
  'dame',
  'begin',
  'publishing',
  '?'],
 ['how',
  'often',
  'is',
  'notre',
  'dame',
  "'s",
  'the',
  'juggler',
  'published',
  '?'],
 ['what',
  'is',
  'the',
  'daily',
  'student',
  'paper',
  'at',
  'notre',
  'dame',
  'called',
  '?'],
 ['how',
  'many',
  'student',
  'news',
  'papers',
  'are',
  'found',
  'at',
 

In [18]:
test_file_path = {
    'source': "data/processed/src-test.txt",
    'target': "data/processed/tgt-test.txt"
}
df, sv, tv = load_qa_data(test_file_path)

In [19]:
df

Unnamed: 0,source_data,target_data,source_indized,target_indized
0,"[it, is, a, replica, of, the, grotto, at, lour...","[to, whom, did, the, virgin, mary, allegedly, ...","[0, 30, 21, 11, 11790, 6, 4, 7791, 26, 9426, 5...","[0, 10, 290, 11, 5, 4285, 4286, 4287, 499, 8, ..."
1,"[immediately, in, front, of, the, main, buildi...","[what, is, in, front, of, the, notre, dame, ma...","[0, 1185, 9, 2375, 6, 4, 253, 254, 8, 4609, 30...","[0, 6, 12, 8, 2084, 7, 5, 63, 62, 214, 175, 4, 1]"
2,"[next, to, the, main, building, is, the, basil...","[the, basilica, of, the, sacred, heart, at, no...","[0, 496, 10, 4, 253, 254, 21, 4, 5754, 6, 4, 5...","[0, 5, 3334, 7, 5, 3335, 1408, 37, 63, 62, 12,..."
3,"[immediately, behind, the, basilica, is, the, ...","[what, is, the, grotto, at, notre, dame, ?]","[0, 1185, 1001, 4, 5754, 21, 4, 7791, 5, 11, 1...","[0, 6, 12, 5, 4289, 37, 63, 62, 4, 1]"
4,"[atop, the, main, building, 's, gold, dome, is...","[what, sits, on, top, of, the, main, building,...","[0, 3107, 4, 253, 254, 19, 1816, 9428, 21, 11,...","[0, 6, 5993, 22, 203, 7, 5, 214, 175, 37, 63, ..."
...,...,...,...,...
11872,"[notable, athletes, include, swimmer, sharron,...","[what, is, the, occupation, of, trevor, franci...","[0, 1053, 3839, 172, 9080, 20387, 20388, 5, 20...","[0, 6, 12, 5, 1342, 7, 10914, 1377, 4, 1]"
11873,"[canadian, politician, and, legal, scholar, ch...","[what, is, the, current, nationality, of, form...","[0, 150, 6512, 8, 763, 3957, 1204, 20392, 2039...","[0, 6, 12, 5, 546, 1324, 7, 519, 59, 10915, 23..."
11874,"[america, based, actor, donald, moffat, ,, who...","[what, united, states, vice, president, did, d...","[0, 318, 204, 1442, 4465, 15567, 5, 488, 1278,...","[0, 6, 146, 144, 2530, 140, 11, 5989, 5990, 25..."
11875,"[america, based, actor, donald, moffat, ,, who...","[in, what, film, did, donald, moffat, play, pr...","[0, 318, 204, 1442, 4465, 15567, 5, 488, 1278,...","[0, 8, 6, 124, 11, 5989, 5990, 258, 140, 5945,..."
