In [1]:
import pandas as pd
import os
import unicodedata
from tqdm import tqdm

In [2]:
# Location of the dev-set JSON file that is loaded into a DataFrame in a later cell.
devsetPath = 'data/devset.json'
# Directory whose entries are listed to build the list of input files.
datapath = 'data/wiki-pages-text/'

In [3]:
# Collect the regular files directly inside `datapath`, sorted so the processing
# order is deterministic (os.listdir returns entries in arbitrary order).
# os.path.join keeps the paths valid whether or not `datapath` ends with a
# separator, and the isfile filter leaves out sub-directories, which cannot be
# opened as text files.
filesInDataPath = sorted(
    os.path.join(datapath, fileName)
    for fileName in os.listdir(datapath)
    if os.path.isfile(os.path.join(datapath, fileName))
)

In [4]:
# Read the dev set with the top-level JSON keys as the row index, then move
# that index into an ordinary column so rows are numbered 0..n-1.
devset = pd.read_json(devsetPath, orient='index').reset_index()

Helper Functions

In [5]:
def normalize(inputString):
    """Return an ASCII-only approximation of ``inputString``.

    The text is decomposed with Unicode NFKD normalization, which splits
    accented characters into a base character plus combining marks, and every
    code point that cannot be encoded as ASCII is then dropped.

    Parameters
    ----------
    inputString : str
        Text to normalize.

    Returns
    -------
    str
        The ASCII-only text; for example ``'café'`` becomes ``'cafe'``.
        ASCII characters, including apostrophes and backslashes, are kept
        unchanged.
    """
    asciiBytes = unicodedata.normalize('NFKD', inputString).encode('ascii', 'ignore')
    # Decode the bytes instead of editing their repr(): the repr escapes
    # backslashes and switches to double quotes when the text contains an
    # apostrophe, so stripping "b'" and "'" from it corrupts such strings.
    return asciiBytes.decode('ascii')

In [6]:
def loadVocab(filesInDataPath):
    """Build a lookup from "<page title> <sentence number>" to sentence text.

    Every line of every file is split on its first two spaces into a page
    title, a sentence number and the remaining text. The title and number are
    joined with a single space, passed through ``normalize`` and used as the
    dictionary key; the remaining text (including any trailing newline) is the
    value. When the same key occurs more than once, the last occurrence wins.

    Parameters
    ----------
    filesInDataPath : list of str
        Paths of the text files to read. A progress bar advances once per file.

    Returns
    -------
    dict
        Mapping of normalized "<title> <number>" keys to sentence text.
    """
    vocab = {}
    for shardFile in tqdm(filesInDataPath):
        # Explicit encoding so the result does not depend on the platform's
        # default locale. NOTE(review): assumes the files are UTF-8 - confirm.
        with open(shardFile, 'r', encoding='utf-8') as openedFile:
            for line in openedFile:
                fields = line.split(' ', 2)
                if len(fields) != 3:
                    # Blank or truncated line: there is no title/number/text
                    # triple to index, so skip it rather than abort the load.
                    continue
                pageTitle, sentenceNo, pageText = fields
                try:
                    # Round-tripping through int canonicalizes numeric ids
                    # (e.g. '01' -> '1') before they become part of the key.
                    sentenceNo = int(sentenceNo)
                except ValueError:
                    # Non-numeric second field: keep it verbatim in the key.
                    pass
                key = normalize(pageTitle + ' ' + str(sentenceNo))
                vocab[key] = pageText
    return vocab

In [7]:
def getEvidenceText(vocab, listofLists):
    """Resolve evidence references to their sentence texts.

    Parameters
    ----------
    vocab : dict
        Mapping of "<title> <number>" keys to text.
    listofLists : sequence
        Evidence entries; the first element of each entry is the page title
        and the second is the sentence number.

    Returns
    -------
    list or None
        One text per entry, in input order, or ``None`` when ``listofLists``
        is empty.

    Raises
    ------
    KeyError
        If an entry is found neither under its raw key nor under its
        normalized key.
    """
    if len(listofLists) == 0:
        return None

    def lookupText(entry):
        # Try the key exactly as given first; only fall back to the
        # normalized form when that exact key is absent.
        rawKey = entry[0] + ' ' + str(entry[1])
        try:
            return vocab[rawKey]
        except KeyError:
            return vocab[normalize(rawKey)]

    return [lookupText(entry) for entry in listofLists]

Load the vocabulary and attach evidence text to the dev set

In [None]:
# Build the key -> sentence text lookup from every file in filesInDataPath.
vocab = loadVocab(filesInDataPath)
# For each row, look up the text of every entry in its 'evidence' value and
# store the resulting list (None when the evidence list is empty) in a new column.
devset['evidence_text'] = devset['evidence'].apply(lambda x: getEvidenceText(vocab, x))