# Extracting Sentences

Sometimes we want to train a model only with sentences to make sentence representation. To do that, we convert the training and dev dataset's sentences from context and questions in a separate file. 

In [8]:
import os
import json


def get_sentence_spans(context):
    from nltk.data import load
    tokenizer = load('tokenizers/punkt/{0}.pickle'.format('english'))
    sentence_span = tokenizer.span_tokenize(context)
    return sentence_span

def get_sentence_index(answer_span, sentence_spans):
    """
    return the sentence index. It returns -1 if the answer span could not be found on a single sentence. 
    """
    idx = -1
    for sent_idx, sent_span in enumerate(sentence_spans):
            if sent_span[0] <= answer_span[0] and answer_span[1] <=sent_span[1]+1:
                idx = sent_idx
                break
    return idx




work_dir = "..\\data\\"
dev_filenames = ["dev-v1.1.json", "train-v1.1.json"]

for dev_filename in dev_filenames:
    with open(os.path.join(work_dir, dev_filename)) as data_file:    
        dev_data = json.load(data_file)


    print("train datafile info:")
    print("\tversion:", dev_data["version"])
    print()

    data = dev_data["data"]

    sentences_context = []
    sentences_question = []
    for article in data:
        title = article["title"]
        # print(title)
        paragraphs = article["paragraphs"]
        for paragraph in paragraphs:
            context = paragraph["context"]
            spans = get_sentence_spans(context)

            # print context sentence by sentence
            for idx, sent_span in enumerate(spans):
                sentences_context.append(context[sent_span[0]:sent_span[1]+1])


            qas = paragraph["qas"]
            for qa in qas:
                id = qa["id"]
                question = qa["question"]
                answers = qa["answers"]

                sentences_question.append(question)

print("total number of sentences extracted from context:", len(sentences_context))
print("total number of sentences extracted from question:", len(sentences_question))
sentences_question[:5]    

train datafile info:
	version: 1.1

train datafile info:
	version: 1.1

total number of sentences extracted from context: 93595
total number of sentences extracted from question: 87599


['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'What is in front of the Notre Dame Main Building?',
 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
 'What is the Grotto at Notre Dame?',
 'What sits on top of the Main Building at Notre Dame?']

## Convert in into encoder decoder format

Create a file that contains 3 tab separated entity.

sentence1 sentence2 dummy_label

In [11]:
with open("sentences_context.txt", "w",encoding='utf-8') as f:
    for sent in sentences_context:
        f.write(sent+"\n")
        
with open("sentences_question.txt", "w",encoding='utf-8') as f:
    for sent in sentences_question:
        f.write(sent+"\n")

with open("sentences_all.txt", "w",encoding='utf-8') as f:
    for sent in sentences_context + sentences_question:
        f.write(sent+"\n")
