In [2]:
from utils import translate_text

In [3]:
import json
# Load the silver facts. The file is opened in a `with` block so the handle is
# closed deterministically, and decoded as UTF-8 (the JSON standard encoding)
# rather than whatever the platform default happens to be.
with open("../silver_facts.json", encoding='utf-8') as facts_file:
    json_file = json.load(facts_file)

# translate_text comes from the project's utils module; each item it yields is
# unpacked below into a (question, answer) pair of strings.
t_qa = translate_text(json_file)

# Dump the pairs as alternating lines: question, then its answer.
with open('examples.txt', 'w', encoding='utf-8') as f:
    for q, a in t_qa:
        f.write(q + '\n')
        f.write(a + '\n')



In [24]:
from spacy_conll import init_parser


# Build the English spaCy pipeline ("en_core_web_sm") through spacy_conll, so that
# parsed docs expose the `._.conll_str` extension that convert_conll reads.
# include_headers=True requests the CoNLL header lines in that string output.
nlp = init_parser("en_core_web_sm", "spacy", include_headers=True)

def convert_conll(s):
    """Parse ``s`` with the module-level ``nlp`` pipeline and return its CoNLL text.

    The first two lines of the parser output (the header lines requested via
    ``include_headers=True``) and the empty trailing element left by the final
    newline are dropped. The dependency label ``ROOT`` is lower-cased to
    ``root``, and the result is terminated by a blank line so that consecutive
    outputs can be written back to back into one file.

    Args:
        s: Raw text to parse.

    Returns:
        The tab-separated token lines joined by newlines, followed by ``'\\n\\n'``.
    """
    token_lines = nlp(s)._.conll_str.split('\n')[2:-1]

    normalised = []
    for line in token_lines:
        columns = line.split('\t')
        # Only touch the DEPREL column (8th field). A blanket str.replace on the
        # whole output would also rewrite token forms/lemmas that happen to
        # contain "ROOT", corrupting the sentence text.
        if len(columns) > 7:
            columns[7] = columns[7].replace('ROOT', 'root')
            line = '\t'.join(columns)
        normalised.append(line)

    return '\n'.join(normalised) + '\n\n'


# with open('examples.txt', 'r') as f_in:
#     with open('examples.conllu', 'w') as f_out:
#         for line in f_in:
#             x = convert_conll(line)
#             tmp = x.split('\n')
#             x = x.split('\n')[3:-4]
#             x = '\n'.join(x)
#             f_out.write(x)
#             f_out.write('\n')
#             f_out.write('\n')


# Write the question followed by its answer span; the dict-building cell relies
# on this strict question/answer alternation.
# The file is read back as UTF-8 further down, so write it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open('examples.conllu', 'w', encoding='utf-8') as f_out:
    for text in ("Who died in 1285?", "Zhenjin"):
        f_out.write(convert_conll(text))


In [25]:
from copy import deepcopy
import codecs
from mosestokenizer import MosesDetokenizer
from conllu import parse
from rules import Question, AnswerSpan
# import pattern

In [26]:
# Single shared detokenizer, built with the library's default arguments.
# print_sentence calls it with a list of token forms to get a sentence string back;
# qa2d calls it on the output of Question.format_declr() (presumably also a token list).
detokenizer = MosesDetokenizer()



In [27]:
# Read the whole CoNLL-U file as UTF-8 text, then hand it to conllu's parser.
print('Parsing conllu file...')
with codecs.open('examples.conllu', 'r', encoding='utf-8') as f:
    raw_conllu = f.read()
conllu_file = parse(raw_conllu)

Parsing conllu file...


In [28]:
# Build the lookup dict from the parsed sentences, which alternate
# question, answer, question, answer, ...
#   examples[n]             -> n-th question
#   examples['<n>_answer']  -> its answer span
ids = range(len(conllu_file) // 2)

# A trailing question without an answer used to crash with an IndexError on
# ids[count]; report it and skip it instead.
if len(conllu_file) % 2:
    print("Warning: odd number of parsed sentences ({}); "
          "ignoring the last unpaired one.".format(len(conllu_file)))

examples = {}
for pair_idx in ids:
    examples[pair_idx] = conllu_file[2 * pair_idx]
    examples[str(pair_idx) + '_answer'] = conllu_file[2 * pair_idx + 1]

# Number of complete question/answer pairs stored.
count = len(ids)

In [29]:
def qa2d(idx):
    """Turn the question/answer pair stored under ``idx`` into one detokenized string.

    Prints the question's token list, wraps copies of the question and answer
    tokens in ``Question`` / ``AnswerSpan`` (from the project's ``rules`` module),
    inserts the answer into the question and detokenizes ``format_declr()``.

    Args:
        idx: Integer key of the question in ``examples``; the answer is looked
            up under ``str(idx) + '_answer'``.

    Returns:
        The detokenized result, or ``''`` (after printing a message) when the
        question or the answer span reports ``isvalid`` as false.
    """
    question_tokens = list(examples[idx])
    print(question_tokens)

    # Work on deep copies so the entries stored in `examples` are not modified.
    question = Question(deepcopy(question_tokens))
    if question.isvalid:
        answer_tokens = list(examples[str(idx) + '_answer'])
        answer = AnswerSpan(deepcopy(answer_tokens))
        if answer.isvalid:
            question.insert_answer_default(answer)
            return detokenizer(question.format_declr())
        print("Answer span {} is not valid.".format(idx))
    else:
        print("Question {} is not valid.".format(idx))
    return ''

In [30]:
def print_sentence(idx):
    """Return the sentence stored under ``examples[idx]`` as a detokenized string.

    Despite the name, nothing is printed; the caller prints the returned value.

    Args:
        idx: Key into ``examples``.

    Returns:
        The result of passing the tokens' ``'form'`` values to ``detokenizer``.
    """
    # Iterate over the tokens once instead of rebuilding list(examples[idx])
    # for every index, which made the original quadratic in sentence length.
    return detokenizer([token['form'] for token in examples[idx]])

In [31]:
# `examples` holds two entries per pair (question + answer), so half its size
# is the number of pairs to transform.
total = len(examples) // 2
print("Transforming {} examples.".format(total))

for pair_idx in range(total):
    # qa2d is called first because it prints the raw question tokens itself.
    declarative = qa2d(pair_idx)
    print(print_sentence(pair_idx))
    # An empty string signals that the question or the answer was rejected.
    if declarative != '':
        print(declarative)
    print('----------')
 

Transforming 1 examples.
[{'id': 1, 'form': 'Who', 'lemma': 'who', 'upos': 'PRON', 'xpos': 'WP', 'feats': None, 'head': 2, 'deprel': 'nsubj', 'deps': None, 'misc': None}, {'id': 2, 'form': 'died', 'lemma': 'die', 'upos': 'VERB', 'xpos': 'VBD', 'feats': {'Tense': 'Past', 'VerbForm': 'Fin'}, 'head': 0, 'deprel': 'root', 'deps': None, 'misc': None}, {'id': 3, 'form': 'in', 'lemma': 'in', 'upos': 'ADP', 'xpos': 'IN', 'feats': None, 'head': 2, 'deprel': 'prep', 'deps': None, 'misc': None}, {'id': 4, 'form': '1285', 'lemma': '1285', 'upos': 'NUM', 'xpos': 'CD', 'feats': {'NumType': 'Card'}, 'head': 3, 'deprel': 'pobj', 'deps': None, 'misc': None}, {'id': 5, 'form': '?', 'lemma': '?', 'upos': 'PUNCT', 'xpos': '.', 'feats': {'PunctType': 'Peri'}, 'head': 2, 'deprel': 'punct', 'deps': None, 'misc': None}]
Who died in 1285?
Zhenjin died in 1285.
----------


In [32]:
# def get_tokens(token_list):
#     return [t['form'] for t in list(token_list)]

# get_tokens(examples[0])

# Display the parsed tokens of the first question (key 0) for inspection.
list(examples[0])

[{'id': 1,
  'form': 'Who',
  'lemma': 'who',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': None,
  'head': 2,
  'deprel': 'nsubj',
  'deps': None,
  'misc': None},
 {'id': 2,
  'form': 'died',
  'lemma': 'die',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': {'Tense': 'Past', 'VerbForm': 'Fin'},
  'head': 0,
  'deprel': 'root',
  'deps': None,
  'misc': None},
 {'id': 3,
  'form': 'in',
  'lemma': 'in',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': None,
  'head': 2,
  'deprel': 'prep',
  'deps': None,
  'misc': None},
 {'id': 4,
  'form': '1285',
  'lemma': '1285',
  'upos': 'NUM',
  'xpos': 'CD',
  'feats': {'NumType': 'Card'},
  'head': 3,
  'deprel': 'pobj',
  'deps': None,
  'misc': None},
 {'id': 5,
  'form': '?',
  'lemma': '?',
  'upos': 'PUNCT',
  'xpos': '.',
  'feats': {'PunctType': 'Peri'},
  'head': 2,
  'deprel': 'punct',
  'deps': None,
  'misc': None}]