In [4]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import re

import random
import json
import sys
import random
sys.path.append('../')
from nltk.data import load
tokenizer = load('tokenizers/punkt/{0}.pickle'.format('english'))
tokenizer._params.abbrev_types.add('..')
tokenizer._params.abbrev_types.add('No')
tokenizer._params.abbrev_types.add('no')
tokenizer._params.abbrev_types.add('Dr')
tokenizer._params.abbrev_types.add('dr')
tokenizer._params.abbrev_types.add('op')
tokenizer._params.abbrev_types.add('J.S.')

def default_sentence_split(passage):
    return tokenizer.tokenize(passage)
extra_train = pd.read_csv('../datasets/processed_new_data.csv',index_col=None,encoding='utf-8').dropna()
squad_train = pd.read_csv('../datasets/train-v2.0.csv',index_col=None).dropna()
squad_test = pd.read_csv('../datasets/test-v2.0.csv',index_col=None).dropna()

train_raw_data = squad_train[['question', 'is_impossible', 'title', 'context', 'answer',
                                  'answer_start', 'answer_end']]
test_raw_data = squad_test[['question', 'is_impossible', 'title', 'context', 'answer',
                                'answer_start', 'answer_end']]

In [6]:
#import spacy library
import spacy
  
#load core english library
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer.add_special_case("No.",[{"ORTH":"No."}])
nlp.tokenizer.add_special_case("Op.",[{"ORTH":"Op."}])
nlp.tokenizer.add_special_case('..',[{"ORTH":".."}])
nlp.tokenizer.add_special_case('No',[{"ORTH":"No"}])
nlp.tokenizer.add_special_case('no',[{"ORTH":"no"}])
nlp.tokenizer.add_special_case('Dr.',[{"ORTH":"Dr."}])
nlp.tokenizer.add_special_case('dr.',[{"ORTH":"dr."}])
nlp.tokenizer.add_special_case('J.S.',[{"ORTH":"J.S."}])
def spacy_sentence_tokenizer(passage):
    doc = nlp(passage)
    return [s.text for s in doc.sents]



In [7]:

def spacy_search(answer,passage,start_idx):
    end_idx = start_idx + len(answer)
    sentences = spacy_sentence_tokenizer(passage)
    sentence_marked =[]
    correct = ""
    running_len = 0
    for s in sentences:
        ll= len(s)
        start = running_len 
        end= len(s) + running_len
        running_len = end+1

        
        if end_idx <= end and answer in s:
            correct=s
    return correct,sentences
def extract_passage_sentence(answer,passage,start_idx):
    #answer_section = default_sentence_split(answer)
    #print(answer_section)
    end_idx = start_idx + len(answer)
    sentences = default_sentence_split(passage)
    sentence_marked =[]
    correct = ""
    running_len = 0
    for s in sentences:
        ll= len(s)
        start = running_len 
        end= len(s) + running_len
        running_len = end+1
        if end_idx <= end and answer in s:
            correct=s
    # set up a fall back
    correct_ = correct
    if correct =='':
        correct_c,sentences = spacy_search(answer,passage,start_idx)
        correct = correct_
        #sentences = [s for s in spacy_search(answer,passage,start_idx) if answer in s]
        #correct = random.choice(sentences) if len(sentences)>0 else ''
    if correct_ =='':
        #print(answer)
        answer_section = default_sentence_split(answer)
        #print(answer_section)
        sentences_ = [s for s in sentences if any([a in s for a in answer_section] )]
        correct =' '.join(sentences_)


    return correct,sentences 

In [8]:
import tqdm
from nltk.util import ngrams
from src.dataset_processor import QuestionGenerationData
from src.config import GenerationTasks
def selectContext(articles, answer_index, n=4):
    if not isinstance(answer_index,list):
        answer_index = [answer_index]
    vals = list(ngrams(range(len(articles)), n))
    possible_contexts = []
    for l in vals:
        if all([a in l for a in answer_index]):
            possible_contexts.append(l)
    if len(possible_contexts)>0:
        return random.choice(possible_contexts)
    return None

def processSquadData(datapack: pd.DataFrame,verbose=False):
    datapack_strip = datapack.values
    dataset = []
    for idx, dat in tqdm.tqdm(enumerate(datapack_strip )) if verbose else enumerate(datapack_strip ):
        answer_sentence,sentences = extract_passage_sentence(dat[4],dat[3],dat[5])
        answer_sentences = default_sentence_split(answer_sentence)
        answer_sentence_index = [idx for idx,s in enumerate(sentences) if s in answer_sentences]
        selected_context_idxs = selectContext(sentences, answer_sentence_index , n=random.choice([4,3,4]))

        answer_fact = dat[3]
        if  selected_context_idxs is not None:
            answer_fact =  ' '.join([sentences[s] for s in selected_context_idxs])
        
        vani_dob_q = QuestionGenerationData(task=GenerationTasks.vanilla_question_gen,
                                            input_text= answer_fact, 
                                            output_text=dat[0],
                                            contextual_text= answer_sentence.strip())
        context_questions = QuestionGenerationData(task=GenerationTasks.context_question_gen,
                                                   input_text= dat[3], 
                                                   output_text= dat[0],
                                                   contextual_text= dat[4].strip())
        dataset.append(context_questions)
        dataset.append(vani_dob_q)
    return dataset
        
        


In [9]:
squad_test_extended = processSquadData(test_raw_data)

In [10]:
squad_train_extended = processSquadData(train_raw_data,verbose=True)

130315it [00:09, 13729.70it/s]


In [79]:
from dataclass_csv import DataclassReader, dateformat,DataclassWriter

with open("../curated_data/squad_train.csv", "w",encoding='utf-8') as f:
    w = DataclassWriter(f, squad_train_extended, QuestionGenerationData)
    w.write()

with open("../curated_data/squad_dev.csv", "w",encoding='utf-8') as f:
    w = DataclassWriter(f, squad_test_extended, QuestionGenerationData)
    w.write()

In [87]:
#from src.dataset_processor  import load_dataset
def load_dataset(data_path: str):
    pack =[]
    with open(data_path,encoding='utf-8') as f:
        dataset = DataclassReader(f, QuestionGenerationData)
        for row in dataset:
            pack.append(row)
    return pack

In [91]:
yu= load_dataset("../curated_data/squad_train.csv")

In [92]:
yu==squad_train_extended

True

In [103]:
def process_extra(extra_data):
    data = extra_data.values
    dataset = []
    for idx, dat in tqdm.tqdm(enumerate(data)):  # type: ignore
        fact, answer_sentence = dat[3], dat[2]

        vani_dob_q = QuestionGenerationData(task=GenerationTasks.vanilla_question_gen,
                                            input_text= fact, 
                                            output_text=dat[1].strip(),
                                            contextual_text= answer_sentence.strip())
        
        dataset.append(vani_dob_q)

    return dataset

In [105]:
processed_extra[0]

QuestionGenerationData(task='<generate_questions>', input_text="Meta Platforms, Inc., doing business as Meta and formerly named Facebook, Inc., and TheFacebook, Inc., is an American multinational technology conglomerate based in Menlo Park, California. Meta is one of the world's most valuable companies. It is considered one of the Big Five American information technology companies, alongside Alphabet, Amazon, Apple, and Microsoft. Meta's products and services include Facebook, Messenger, Facebook Watch, and Meta Portal. It has also acquired Oculus, Giphy, Mapillary, Kustomer, Presize and has a 9.99% stake in Jio Platforms.", output_text='According to the article which companies make up the top five in terms of technology?', contextual_text="Meta is one of the world's most valuable companies. It is considered one of the Big Five American information technology companies, alongside Alphabet, Amazon, Apple, and Microsoft.")

In [104]:
processed_extra = process_extra(extra_train)

129it [00:00, ?it/s]


In [15]:
with open("../curated_data/extra_data.csv", "w",encoding='utf-8') as f:
    w = DataclassWriter(f, processed_extra, QuestionGenerationData)
    w.write()

In [19]:
for (index_label, row_series) in train_raw_data.iterrows():
    #print(k)
    break

In [20]:
example = pd.DataFrame.from_dict(row_series)

In [24]:
example.loc['answer']

0    in the late 1990s
Name: answer, dtype: object

In [32]:
squad_train_extended = processSquad(train_raw_data)

KeyboardInterrupt: 

In [30]:
import random


def extract_passage_sentence_debug(answer,passage,start_idx):
    end_idx = start_idx + len(answer)
    sentences = spacy_sentence_tokenizer(passage)
    sentence_marked =[]
    correct = ""
    running_len = 0
    for s in sentences:
        ll= len(s)
        start = running_len 
        end= len(s) + running_len
        running_len = end+1

        print(start,'Yes',end,end_idx)
        if start_idx>= start:
            #correct=s
            print(start,'Yes',end,end_idx)
            #print(start,s,end)
        
        elif answer in s:
            print('Found')
        if end_idx <= end and answer in s:
            correct=s
            print('End Here')
            print(start,s,end)
            correct=s

        #if start_idx>= start and end>=end_idx: 
        #    correct=s
    # set up a fall back
    if correct=='':
        sentences = [s for s in default_sentence_split(passage) if answer in s]
        correct = random.choice(sentences) if len(sentences)>0 else ''

    return correct

In [63]:
squad_train.shape

(130315, 8)

In [61]:
for d in squad_train.:
    

AttributeError: 'tuple' object has no attribute 'answer'

In [65]:
idk= idx
squad_train.answer.values[idk],squad_train.context.values[idk],squad_train.answer_start.values[idk]

('ps',
 'The symbol $, usually written before the numerical amount, is used for the U.S. dollar (as well as for many other currencies). The sign was the result of a late 18th-century evolution of the scribal abbreviation "ps" for the peso, the common name for the Spanish dollars that were in wide circulation in the New World from the 16th to the 19th centuries. These Spanish pesos or dollars were minted in Spanish America, namely in Mexico City, Potosí, Bolivia; and Lima, Peru. The p and the s eventually came to be written over each other giving rise to $.',
 213)

In [73]:
squad_test.columns

Index(['Unnamed: 0', 'question', 'is_impossible', 'title', 'context', 'answer',
       'answer_start', 'answer_end'],
      dtype='object')

In [37]:
tokenizer.tokenize(squad_train.context[idk])

['The preludes, many of which are very brief (some consisting of simple statements and developments of a single theme or figure), were described by Schumann as "the beginnings of studies".',
 'Inspired by J.S.',
 "Bach's The Well-Tempered Clavier, Chopin's preludes move up the circle of fifths (rather than Bach's chromatic scale sequence) to create a prelude in each major and minor tonality.",
 'The preludes were perhaps not intended to be played as a group, and may even have been used by him and later pianists as generic preludes to others of his pieces, or even to music by other composers, as Kenneth Hamilton suggests: he has noted a recording by Ferruccio Busoni of 1922, in which the Prelude Op.',
 '28 No.',
 '7 is followed by the Étude Op.',
 '10 No.',
 '5.']

In [32]:
default_sentence_split(squad_train.context[idk])

['The preludes, many of which are very brief (some consisting of simple statements and developments of a single theme or figure), were described by Schumann as "the beginnings of studies".',
 'Inspired by J.S.',
 "Bach's The Well-Tempered Clavier, Chopin's preludes move up the circle of fifths (rather than Bach's chromatic scale sequence) to create a prelude in each major and minor tonality.",
 'The preludes were perhaps not intended to be played as a group, and may even have been used by him and later pianists as generic preludes to others of his pieces, or even to music by other composers, as Kenneth Hamilton suggests: he has noted a recording by Ferruccio Busoni of 1922, in which the Prelude Op. 28 No. 7 is followed by the Étude Op. 10 No. 5.']

In [49]:
spacy_sentence_tokenizer(squad_train.context[idk])

['The preludes, many of which are very brief (some consisting of simple statements and developments of a single theme or figure), were described by Schumann as "the beginnings of studies".',
 "Inspired by J.S. Bach's The Well-Tempered Clavier, Chopin's preludes move up the circle of fifths (rather than Bach's chromatic scale sequence) to create a prelude in each major and minor tonality.",
 'The preludes were perhaps not intended to be played as a group, and may even have been used by him and later pianists as generic preludes to others of his pieces, or even to music by other composers, as Kenneth Hamilton suggests: he has noted a recording by Ferruccio Busoni of 1922, in which the Prelude Op. 28 No. 7 is followed by the Étude Op. 10 No. 5.']

In [137]:
squad_train.context[idk].index(squad_train.answer[idk])

633

In [124]:
squad_train.answer[idk] in squad_train.context[idk]

True

In [66]:
extract_passage_sentence(squad_train.answer.values[idk],squad_train.context.values[idk],squad_train.answer_start.values[idk])

'The sign was the result of a late 18th-century evolution of the scribal abbreviation "ps" for the peso, the common name for the Spanish dollars that were in wide circulation in the New World from the 16th to the 19th centuries.'

In [71]:
for idx in range(squad_test.shape[0]):
    
    oo=extract_passage_sentence(squad_train.answer.values[idk],squad_train.context.values[idk],squad_train.answer_start.values[idk])
    if len(oo)<1:
        break

In [72]:
idx,squad_test.shape


(11872, (11873, 8))

In [77]:
def spacy_sentence_tokenizer(passage):
    doc = nlp(passage)
    return [s for s in doc.sents]

In [92]:
sent.

27

In [94]:
doc = nlp(squad_train.context[idk])
#to print sentences
for sent in doc.sents:
  print(' '.join([t.text for t in sent]))

From September 1823 to 1826 Chopin attended the Warsaw Lyceum , where he received organ lessons from the Czech musician Wilhelm Würfel during his first year .
In the autumn of 1826 he began a three - year course under the Silesian composer Józef Elsner at the Warsaw Conservatory , studying music theory , figured bass and composition.[n 3 ]
Throughout this period he continued to compose and to give recitals in concerts and salons in Warsaw .
He was engaged by the inventors of a mechanical organ , the " eolomelodicon " , and on this instrument in May 1825 he performed his own improvisation and part of a concerto by Moscheles .
The success of this concert led to an invitation to give a similar recital on the instrument before Tsar Alexander I , who was visiting Warsaw ; the Tsar presented him with a diamond ring .
At a subsequent eolomelodicon concert on 10 June 1825 , Chopin performed his Rondo Op . 1 .
This was the first of his works to be commercially published and earned him his first

In [98]:
sent.text

'This was the first of his works to be commercially published and earned him his first mention in the foreign press, when the Leipzig Allgemeine Musikalische Zeitung praised his "wealth of musical ideas".'

In [95]:
len('From September 1823 to 1826 Chopin attended the Warsaw Lyceum , where he received organ lessons from the Czech musician Wilhelm Würfel during his first year .')

158

In [84]:
len('From September 1823 to 1826 Chopin attended the Warsaw Lyceum, where he received organ lessons from the Czech musician Wilhelm Würfel during his first year.')

156

In [70]:
extract_passage_sentence(squad_train.answer[idk],squad_train.context[idk],squad_train.answer_start[idk])

KeyError: 33487

In [38]:
idx

33

In [2]:
squad_train.head()

Unnamed: 0.1,Unnamed: 0,question,is_impossible,title,context,answer,answer_start,answer_end
0,0,When did Beyonce start becoming popular?,False,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,-1
1,1,What areas did Beyonce compete in when she was...,False,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,-1
2,2,When did Beyonce leave Destiny's Child and bec...,False,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,-1
3,3,In what city and state did Beyonce grow up?,False,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,-1
4,4,In which decade did Beyonce become famous?,False,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,-1
