In [37]:
import os
import sys
import gzip
import json_lines
import logging
import collections
import json
import pickle
import multiprocessing
import argparse
import math
from tqdm import tqdm
from collections import Counter, OrderedDict, defaultdict as ddict
import numpy as np


def read_squad(path):

    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    data_dict = {'question': [], 'context': [], 'id': [], 'answer': []}
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if len(qa['answers']) == 0:
                    data_dict['question'].append(question)
                    data_dict['context'].append(context)
                    data_dict['id'].append(qa['id'])
                else:
                    for answer in  qa['answers']:
                        data_dict['question'].append(question)
                        data_dict['context'].append(context)
                        data_dict['id'].append(qa['id'])
                        data_dict['answer'].append(answer)
    id_map = ddict(list)
    for idx, qid in enumerate(data_dict['id']):
        id_map[qid].append(idx)

    data_dict_collapsed = {'question': [], 'context': [], 'id': []}
    if data_dict['answer']:
        data_dict_collapsed['answer'] = []
    for qid in id_map:
        ex_ids = id_map[qid]
        data_dict_collapsed['question'].append(data_dict['question'][ex_ids[0]])
        data_dict_collapsed['context'].append(data_dict['context'][ex_ids[0]])
        data_dict_collapsed['id'].append(qid)
        if data_dict['answer']:
            all_answers = [data_dict['answer'][idx] for idx in ex_ids]
            data_dict_collapsed['answer'].append({'answer_start': [answer['answer_start'] for answer in all_answers],
                                                  'text': [answer['text'] for answer in all_answers]})
    return data_dict_collapsed
##################################################################
def read_squad_examples(input_file, debug=False):
    # Read data
#     unproc_data = []
#     with gzip.open(input_file, 'rt', encoding='utf-8') as f:  # opening file in binary(rb) mode
#         for item in json_lines.reader(f):
#             # print(item) #or use print(item['X']) for printing specific data
#             unproc_data.append(item)

    # Delete header
#     unproc_data = unproc_data[1:]
#     if debug:
#         unproc_data = unproc_data[:100]
    
#     print(unproc_data)
    with open(input_file, 'rb') as f:
        squad_dict = json.load(f)
        unproc_data = squad_dict['data']
        
    ###################### Make Examples ######################
    examples = []
    for item in unproc_data:
        for passage in item['paragraphs']:
            # 1. Get Context
            doc_tokens = []
            for token in item['context_tokens']:
                # BERT has only [SEP] in it's word piece vocabulary. because we keps all separators char length 5
                # we can replace all of them with [SEP] without modifying the offset
                if token[0] in ['[TLE]', '[PAR]', '[DOC]']:
                    token[0] = '[SEP]'
                doc_tokens.append(token[0])

            # 2. qas
            for qa in item['qas']:
                qas_id = qa['qid']
                question_text = qa['question']

                answer_lst = []  # Check for duplicate question
                for answer in qa['detected_answers']:
                    orig_answer_text = answer['text']
                    # We could find so many duplicate "Detected Answer"...It needs to be erased
                    if orig_answer_text in answer_lst:
                        continue
                    else:
                        answer_lst.append(orig_answer_text)

                    # Only take the first span
                    start_position = answer['token_spans'][0][0]
                    end_position = answer['token_spans'][0][1]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position)
                    examples.append(example)
    return examples

class SquadExample(object):
    """
    A single training/test example for the Squad dataset.
    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
#                  doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
#         self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)
        s += ", question_text: %s" % (self.question_text)
#         s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s

In [43]:
with open('/home/zhanj289/cs224n_robust_qa/adv_model/datasets/train/newsqa', 'rb') as f:
        squad_dict = json.load(f)

In [75]:
squad_dict['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [72]:
squad_dict['data'][190]['paragraphs']

[{'context': 'Editor\'s note: Campbell Brown anchors CNN\'s "Campbell Brown: No Bias, No Bull" at 8 p.m. ET Mondays through Fridays. She delivered this commentary during the "Cutting through the Bull" segment of Tuesday night\'s broadcast.\n\nCNN\'s Campbell Brown says "having no life" isn\'t a requirement for a man to get a job.\n\n(CNN) -- How many times have politicians been warned about the dangers of an open microphone? And yet, on Tuesday, the lectern mic at the National Governors Conference picked up this little nugget from Pennsylvania\'s Democratic Gov. Ed Rendell.\n\nHe\'s having a conversation near the lectern about President-elect Barack Obama\'s choice for to lead the Homeland Security Department, Arizona Gov. Janet Napolitano. Here is what Rendell said about Napolitano:\n\nRendell: Janet\'s perfect for that job. Because for that job, you have to have no life. Janet has no family. Perfect. She can devote, literally, 19-20 hours a day to it\n\nWow. Now, I\'m sure Gov. Napol

In [46]:
unproc_data = []
with gzip.open('/home/zhanj289/cs224n_robust_qa/adv_model/data/train/NewsQA.jsonl.gz', 'rt', encoding='utf-8') as f:  # opening file in binary(rb) mode
    for item in json_lines.reader(f):
        # print(item) #or use print(item['X']) for printing specific data
        unproc_data.append(item)

In [77]:
unproc_data[2]

{'context': 'WASHINGTON (CNN) -- One of the Marines shown in a famous World War II photograph raising the U.S. flag on Iwo Jima was posthumously awarded a certificate of U.S. citizenship on Tuesday.\n\nThe Marine Corps War Memorial in Virginia depicts Strank and five others raising a flag on Iwo Jima.\n\nSgt. Michael Strank, who was born in Czechoslovakia and came to the United States when he was 3, derived U.S. citizenship when his father was naturalized in 1935. However, U.S. Citizenship and Immigration Services recently discovered that Strank never was given citizenship papers.\n\nAt a ceremony Tuesday at the Marine Corps Memorial -- which depicts the flag-raising -- in Arlington, Virginia, a certificate of citizenship was presented to Strank\'s younger sister, Mary Pero.\n\nStrank and five other men became national icons when an Associated Press photographer captured the image of them planting an American flag on top of Mount Suribachi on February 23, 1945.\n\nStrank was killed in 

In [None]:
read_squad('/home/zhanj289/cs224n_robust_qa/adv_model/datasets/train/race')

In [None]:
read_squad_examples('/home/zhanj289/cs224n_robust_qa/adv_model/data/train/NewsQA.jsonl.gz', debug = True)

In [78]:
a ='dsasadasdsadas'

In [79]:
a[-3:]

'das'

In [80]:
'race'[-4:]

'race'

In [168]:
def read_squad(path):
    document_id = path[-4:]
#     path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    data_dict = {'question': [], 'context': [], 'id': [], 'answer': [], 'doc': []}
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if len(qa['answers']) == 0:
                    data_dict['question'].append(question)
                    data_dict['context'].append(context)
                    data_dict['id'].append(qa['id'])
                    ###
                    data_dict['doc'].append(document_id)
                else:
                    for answer in  qa['answers']:
                        data_dict['question'].append(question)
                        data_dict['context'].append(context)
                        data_dict['id'].append(qa['id'])
                        data_dict['answer'].append(answer)
                        ###
                        data_dict['doc'].append(document_id)
                        
    id_map = ddict(list)
    
    for idx, qid in enumerate(data_dict['id']):
        id_map[qid].append(idx)
#     print(len(id_map))
#     print(id_map)
    
    data_dict_collapsed = {'question': [], 'context': [], 'id': [], 'doc': []}
    if data_dict['answer']:
        data_dict_collapsed['answer'] = []
    for qid in id_map:
        ex_ids = id_map[qid]
        
#         print(qid)
#         print(ex_ids)
        data_dict_collapsed['question'].append(data_dict['question'][ex_ids[0]])
        data_dict_collapsed['context'].append(data_dict['context'][ex_ids[0]])
        data_dict_collapsed['id'].append(qid)
        #
        data_dict_collapsed['doc'].append(data_dict['doc'][0])
        if data_dict['answer']:
            all_answers = [data_dict['answer'][idx] for idx in ex_ids]
            data_dict_collapsed['answer'].append({'answer_start': [answer['answer_start'] for answer in all_answers],
                                                  'text': [answer['text'] for answer in all_answers]})
            
    return data_dict_collapsed

In [172]:
dataset_dict= read_squad('/home/zhanj289/cs224n_robust_qa/datasets/indomain_train/nat_questions')

In [173]:
len(dataset_dict['doc'])

50000

In [147]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def prepare_train_data(dataset_dict, tokenizer):
    tokenized_examples = tokenizer(dataset_dict['question'],
                                   dataset_dict['context'],
                                   truncation="only_second",
                                   stride=128,
                                   max_length=384,
                                   return_overflowing_tokens=True,
                                   return_offsets_mapping=True,
                                   padding='max_length')
    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]
    
    print(tokenized_examples.keys())

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    tokenized_examples['id'] = []
    
    inaccurate = 0
    for i, offsets in enumerate(tqdm(offset_mapping)):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answer = dataset_dict['answer'][sample_index]
        # Start/end character index of the answer in the text.
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        tokenized_examples['id'].append(dataset_dict['id'][sample_index])
        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
            # Note: we could go after the last offset if the answer is the last word (edge case).
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)
            # assertion to check if this checks out
            context = dataset_dict['context'][sample_index]
            offset_st = offsets[tokenized_examples['start_positions'][-1]][0]
            offset_en = offsets[tokenized_examples['end_positions'][-1]][1]
            if context[offset_st : offset_en] != answer['text'][0]:
                inaccurate += 1
        
        
        
    doc_map = {v: k for k,v in enumerate(set(dataset_dict['doc']))}
    tokenized_examples['labels']  = [doc_map[ele] for ele in dataset_dict['doc'] ]

#     print(len(tokenized_examples))  #7           
#     print(len(tokenized_examples['start_positions']))  
#     print(tokenized_examples['start_positions'][1:10])  
#     print(len(tokenized_examples['end_positions']))
#     print(len(tokenized_examples['labels'] ))
#     print(tokenized_examples['labels'][1:10])
    
    total = len(tokenized_examples['id'])
    print(f"Preprocessing not completely accurate for {inaccurate}/{total} instances")
    return tokenized_examples

In [148]:
tokenized_examples = prepare_train_data(dataset_dict, tokenizer)

100%|██████████| 196/196 [00:00<00:00, 22618.27it/s]

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
Preprocessing not completely accurate for 0/196 instances





In [136]:
tokenized_examples.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions', 'id', 'labels'])

In [154]:
len(tokenized_examples['labels'])

127

In [179]:
test= (1,2,3)

In [180]:
a, b,c, d = test

ValueError: not enough values to unpack (expected 4, got 3)