In [1]:
import re, os, string, typing, gc, json
import pandas as pd 
import numpy as np
from collections import Counter
import torch.nn.functional as F
from torch import nn
import pickle

import spacy
import en_core_web_sm
nlp = en_core_web_sm.load(disable = ['parser','ner','tagger'])
from tqdm.auto import tqdm, trange

import torch

In [2]:
def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [3]:
# Load the raw train and dev JSON files (same order as the target names).
train_data, valid_data = (
    load_json(json_path)
    for json_path in (
        '/kaggle/input/squad-dataset/squad_train.json',
        '/kaggle/input/squad-dataset/squad_dev.json',
    )
)

In [4]:
def parse_data(data):
    """Flatten a nested QA dump into one record per (question, answer) pair.

    ``data['data']`` is walked as articles -> ``paragraphs`` -> ``qas`` ->
    ``answers``. Each answer yields a dict with keys ``id``, ``context``,
    ``question``, ``label`` and ``answer``, where ``label`` is
    ``[answer_start, answer_start + len(answer_text)]`` (character offsets,
    end exclusive). Questions with an empty ``answers`` list produce no record.
    """

    def iter_records():
        for article in data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    qa_id = qa['id']
                    question = qa['question']
                    for answer in qa['answers']:
                        text = answer['text']
                        start = answer['answer_start']
                        yield {
                            "id": qa_id,
                            "context": context,
                            "question": question,
                            "label": [start, start + len(text)],
                            "answer": text,
                        }

    return list(iter_records())

In [5]:
# Flatten each raw dump into a list of records, then wrap each list in a DataFrame.
train_list, valid_list = parse_data(train_data), parse_data(valid_data)
train_df, valid_df = pd.DataFrame(train_list), pd.DataFrame(valid_list)

In [6]:
# Preview the first rows of the parsed training frame.
train_df.head()

Unnamed: 0,id,context,question,label,answer
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"[515, 541]",Saint Bernadette Soubirous
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"[188, 213]",a copper statue of Christ
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"[279, 296]",the Main Building
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,"[381, 420]",a Marian place of prayer and reflection
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,"[92, 126]",a golden statue of the Virgin Mary


In [7]:
def lower_df(df):
    """Lower-case the ``context``, ``question`` and ``answer`` columns of ``df``.

    The frame is modified in place and nothing is returned. Only these three
    columns are touched; offsets stored in any other column are not adjusted.
    """
    for column in ("context", "question", "answer"):
        df[column] = df[column].apply(lambda text: text.lower())

In [8]:
# lower_df rewrites the text columns of each frame in place and returns nothing,
# so the frames are not reassigned here.
lower_df(train_df)
lower_df(valid_df)

In [9]:
# Gather the distinct contexts and questions of both splits; this is the text
# the vocabularies are built from. `total` tracks the summed distinct counts.
vocab_text, total = [], 0
for df in (train_df, valid_df):
    unique_contexts = list(df['context'].unique())
    unique_questions = list(df['question'].unique())
    vocab_text += unique_contexts
    vocab_text += unique_questions
    total += df['context'].nunique() + df['question'].nunique()

print("Unique sentences:", len(vocab_text))

Unique sentences: 118822


# Word Vocab

In [10]:
# Collect the text of every spaCy token found in the vocabulary corpus.
words = []
for sent in tqdm(vocab_text):
    words.extend(word.text for word in nlp(sent, disable=['parser','tagger','ner']))

# Order tokens by descending frequency (ties keep first-seen order) and
# reserve index 0 for '<unk>' and index 1 for '<pad>'.
word_counter = Counter(words)
word_vocab = ['<unk>', '<pad>'] + [token for token, _ in word_counter.most_common()]

# Forward (token -> id) and reverse (id -> token) lookup tables.
word2idx = dict(zip(word_vocab, range(len(word_vocab))))
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

  0%|          | 0/118822 [00:00<?, ?it/s]



# Char Vocab

In [11]:
# Collect every character of the vocabulary corpus.
chars = []
for sent in tqdm(vocab_text):
    chars.extend(sent)

char_counter = Counter(chars)
# Characters ordered by descending frequency (ties keep first-seen order).
char_vocab = sorted(char_counter, key=char_counter.get, reverse=True)
high_freq_char = [char for char, count in char_counter.items() if count>=20]
# Keep only characters seen at least 20 times. Filtering the sorted list
# (instead of intersecting two sets) keeps the frequency order and makes the
# resulting ids identical from run to run: set iteration order for strings
# changes between interpreter sessions, which would make the pickled char2idx
# differ on every execution.
char_vocab = [char for char in char_vocab if char_counter[char] >= 20]
# Reserve index 0 for unknown characters and index 1 for padding.
char_vocab.insert(0,'<unk>')
char_vocab.insert(1,'<pad>')
char2idx = {char:idx for idx, char in enumerate(char_vocab)}

  0%|          | 0/118822 [00:00<?, ?it/s]

# Context to Ids

In [12]:
def context_to_ids(text, word2idx):
    """Tokenize ``text`` with spaCy and map every token to its vocabulary id.

    Args:
        text: The context string to encode.
        word2idx: Mapping from token text to integer id.

    Returns:
        A list with one id per spaCy token, in token order.

    Raises:
        KeyError: If a token is missing from ``word2idx`` and the mapping has
            no ``'<unk>'`` entry to fall back on.
    """
    context_tokens = [w.text for w in nlp(text, disable=['parser','tagger','ner'])]

    # Tokens missing from the vocabulary fall back to the reserved '<unk>' id
    # when the mapping provides one, instead of aborting the whole encoding.
    unk_id = word2idx.get('<unk>')
    context_ids = []
    for word in context_tokens:
        if word in word2idx:
            context_ids.append(word2idx[word])
        elif unk_id is not None:
            context_ids.append(unk_id)
        else:
            raise KeyError(word)

    return context_ids

In [13]:
# Encode the context of every row as a list of vocabulary ids. context_to_ids is
# applied row by row; %time reports the CPU and wall time of each statement.
%time train_df['context_ids'] = train_df.context.apply(context_to_ids, word2idx=word2idx)
%time valid_df['context_ids'] = valid_df.context.apply(context_to_ids, word2idx=word2idx)



CPU times: user 18min 1s, sys: 332 ms, total: 18min 1s
Wall time: 18min 2s




CPU times: user 7min 23s, sys: 173 ms, total: 7min 23s
Wall time: 7min 24s


# Question to Ids

In [14]:
def question_to_ids(text, word2idx):
    """Tokenize ``text`` with spaCy and map every token to its vocabulary id.

    Args:
        text: The question string to encode.
        word2idx: Mapping from token text to integer id.

    Returns:
        A list with one id per spaCy token, in token order.

    Raises:
        KeyError: If a token is missing from ``word2idx`` and the mapping has
            no ``'<unk>'`` entry to fall back on.
    """
    question_tokens = [w.text for w in nlp(text, disable=['parser','tagger','ner'])]

    # Tokens missing from the vocabulary fall back to the reserved '<unk>' id
    # when the mapping provides one, instead of aborting the whole encoding.
    unk_id = word2idx.get('<unk>')
    question_ids = []
    for word in question_tokens:
        if word in word2idx:
            question_ids.append(word2idx[word])
        elif unk_id is not None:
            question_ids.append(unk_id)
        else:
            raise KeyError(word)

    return question_ids

In [15]:
# Encode the question of every row as a list of vocabulary ids (one question_to_ids call per row).
train_df['question_ids'] = train_df.question.apply(question_to_ids, word2idx=word2idx)
valid_df['question_ids'] = valid_df.question.apply(question_to_ids, word2idx=word2idx)



# Errors (rows whose answer span cannot be aligned to context tokens)

In [16]:
def test_indices(df, idx2word):
    """Find rows whose character-level answer span does not align with tokens.

    For every row the context is tokenized with spaCy and the character offsets
    in ``row['label']`` are looked up among the token start / end offsets. When
    both offsets are found, the tokens stored in ``row['context_ids']`` at those
    positions are compared with the first and last token of ``row['answer']``.

    Args:
        df: DataFrame with ``answer``, ``context``, ``label`` and
            ``context_ids`` columns.
        idx2word: Mapping from vocabulary id to token text.

    Returns:
        A tuple ``(start_value_error, end_value_error, assert_error)`` of lists
        holding the index labels of rows where, respectively, the answer start
        offset matches no token start, the answer end offset matches no token
        end, or the boundary tokens differ from the answer's boundary tokens
        (rows whose answer has no tokens are reported here as well).
    """
    start_value_error = []
    end_value_error = []
    assert_error = []

    # iterrows() has no length, so pass the total to get a real progress bar.
    for index, row in tqdm(df.iterrows(), total=len(df)):

        answer_tokens = [w.text for w in nlp(row['answer'], disable=['parser','tagger','ner'])]
        if not answer_tokens:
            # Nothing to compare against: flag the row rather than crashing.
            assert_error.append(index)
            continue

        context_span = [(word.idx, word.idx + len(word.text)) for word in nlp(row['context'], disable=['parser','tagger','ner'])]
        # An empty context has no spans to unzip; both lookups below then fail.
        starts, ends = zip(*context_span) if context_span else ((), ())

        answer_start, answer_end = row['label']

        # Reset per row so a failed lookup can never reuse the index that was
        # found for an earlier row.
        start_idx = None
        end_idx = None

        try:
            start_idx = starts.index(answer_start)
        except ValueError:
            start_value_error.append(index)
        try:
            end_idx = ends.index(answer_end)
        except ValueError:
            end_value_error.append(index)

        if start_idx is None or end_idx is None:
            # Already reported above; there are no valid positions to compare.
            continue

        context_ids = row['context_ids']
        try:
            tokens_match = (
                idx2word[context_ids[start_idx]] == answer_tokens[0]
                and idx2word[context_ids[end_idx]] == answer_tokens[-1]
            )
        except (KeyError, IndexError):
            # Unknown id or position outside context_ids counts as a mismatch.
            tokens_match = False

        if not tokens_match:
            assert_error.append(index)

    return start_value_error, end_value_error, assert_error



def get_error_indices(df, idx2word):
    """Return the set of row index labels reported by any check of ``test_indices``."""
    error_groups = test_indices(df, idx2word)
    return set().union(*error_groups)

In [17]:
# Remove every training row whose answer span failed an alignment check.
train_err = get_error_indices(train_df, idx2word)
train_df = train_df.drop(train_err)

# Same clean-up for the validation split.
valid_err = get_error_indices(valid_df, idx2word)
valid_df = valid_df.drop(valid_err)

0it [00:00, ?it/s]



0it [00:00, ?it/s]

# Labels

In [18]:
def index_answer(row, idx2word):
    """Convert a row's character-level answer span into token positions.

    Args:
        row: Object exposing ``context`` (str) and ``label`` (a
            ``[char_start, char_end]`` pair) as attributes.
        idx2word: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``[start_idx, end_idx]``: the position of the context token that starts
        at ``char_start`` and of the context token that ends at ``char_end``.

    Raises:
        ValueError: If either offset does not coincide with a token boundary.
    """
    context_span = [(word.idx, word.idx + len(word.text)) for word in nlp(row.context, disable=['parser','tagger','ner'])]
    starts, ends = zip(*context_span)

    answer_start, answer_end = row.label
    start_idx = starts.index(answer_start)
    end_idx = ends.index(answer_end)

    # The answer text is not needed to locate the span, so it is not tokenized here.
    return [start_idx, end_idx]

In [19]:
# Compute the token-level [start, end] label of every remaining row and store
# it next to the character-level label.
train_label_idx = train_df.apply(index_answer, axis=1, idx2word=idx2word)
train_df['label_idx'] = train_label_idx

valid_label_idx = valid_df.apply(index_answer, axis=1, idx2word=idx2word)
valid_df['label_idx'] = valid_label_idx



In [20]:
# Persist the processed frames.
for frame, frame_path in ((train_df, 'bidaftrain.pkl'), (valid_df, 'bidafvalid.pkl')):
    frame.to_pickle(frame_path)

# Persist the word and character lookup tables.
for mapping, mapping_path in ((word2idx, 'bidafw2id.pickle'), (char2idx, 'bidafc2id.pickle')):
    with open(mapping_path, 'wb') as handle:
        pickle.dump(mapping, handle)

In [21]:
def get_glove_dict(path_to_glove):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is split on whitespace: the first field is the word and
    the remaining fields are parsed as its ``float32`` vector.

    Args:
        path_to_glove: Path to the UTF-8 encoded vectors file.

    Returns:
        Dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    glove_dict = {}
    # The context manager closes the file; no explicit close() is needed.
    with open(path_to_glove, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            glove_dict[values[0]] = np.asarray(values[1:], "float32")

    return glove_dict

In [22]:
# Load the word -> vector table from the GloVe text file.
glove_dict = get_glove_dict("/kaggle/input/glove6b100dtxt/glove.6B.100d.txt")

In [23]:
def create_weights_matrix(glove_dict, vocab=None, embedding_dim=100):
    """Build an embedding matrix whose rows follow the vocabulary order.

    Args:
        glove_dict: Mapping from word to its embedding vector.
        vocab: Sequence of words, one per output row. Defaults to the
            notebook-level ``word_vocab``.
        embedding_dim: Number of columns of the matrix (default 100).

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab), embedding_dim)``. Row ``i``
        holds the vector of ``vocab[i]``; words absent from ``glove_dict`` keep
        an all-zero row.

    Raises:
        ValueError: If a stored vector does not fit a row of ``embedding_dim``
            values. This used to be swallowed, silently leaving zero rows.
    """
    if vocab is None:
        vocab = word_vocab

    weights_matrix = np.zeros((len(vocab), embedding_dim))
    for i, word in enumerate(vocab):
        vector = glove_dict.get(word)
        # Only a missing word is skipped; any other problem must surface.
        if vector is not None:
            weights_matrix[i] = vector

    return weights_matrix

In [24]:
# Build the embedding matrix from the loaded GloVe vectors.
weights_matrix = create_weights_matrix(glove_dict)

In [25]:
# Write the embedding matrix to 'bidafglove_tv.npy'.
np.save('bidafglove_tv.npy', weights_matrix)