In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re


def get_nq_tokens(simplified_nq_example):
  """Return the tokens stored in a simplified NQ example.

  Args:
    simplified_nq_example: Mapping with a `document_text` entry holding the
      document as one string of tokens separated by single spaces.

  Returns:
    The list obtained by splitting `document_text` on the space character.

  Raises:
    ValueError: If `simplified_nq_example` has no `document_text` key.
  """
  if "document_text" not in simplified_nq_example:
    # The trailing space after "NQ" matters: adjacent string literals are
    # concatenated verbatim, and without it the message reads "NQexample".
    raise ValueError("`get_nq_tokens` should be called on a simplified NQ "
                     "example that contains the `document_text` field.")

  # Split on a literal space rather than on arbitrary whitespace, so empty
  # tokens survive (two consecutive spaces yield an empty string).
  return simplified_nq_example["document_text"].split(" ")


def simplify_nq_example(nq_example):
  """Build the simplified form of a Natural Questions example.

  The document tokens are joined into a single space-separated
  `document_text` string, and the `start_byte` / `end_byte` entries are
  dropped from every long answer candidate and every annotation span.

  `nq_example` itself is left unmodified: each span and annotation in the
  result is a fresh shallow copy of the corresponding input dict.

  Args:
    nq_example: Mapping providing `question_text`, `example_id`,
      `document_url`, `document_tokens` (a sequence of mappings each holding
      a `token` string), `long_answer_candidates` (a sequence of span
      mappings) and `annotations` (a sequence of mappings each holding a
      `long_answer` span and a `short_answers` sequence of spans).

  Returns:
    A new dict with the keys `question_text`, `example_id`, `document_url`,
    `document_text`, `long_answer_candidates` and `annotations`.

  Raises:
    ValueError: If `get_nq_tokens` does not recover as many tokens from the
      generated `document_text` as `document_tokens` contains. Note that an
      empty `document_tokens` always triggers this, because splitting an
      empty string yields one (empty) token.
  """
  byte_offset_keys = ("start_byte", "end_byte")

  def _clean_token(token):
    # A space inside a token would later be read as a token boundary when
    # `document_text` is split, so it is replaced with an underscore.
    return token["token"].replace(" ", "_")

  def _remove_html_byte_offsets(span):
    # Build a filtered copy instead of deleting keys in place, so the
    # caller's span is not altered. Remaining keys keep their order.
    return {
        key: value for key, value in span.items()
        if key not in byte_offset_keys
    }

  def _clean_annotation(annotation):
    # Shallow-copy first so the reassignments below do not touch the
    # caller's annotation; all other annotation fields are carried over.
    cleaned = dict(annotation)
    cleaned["long_answer"] = _remove_html_byte_offsets(
        annotation["long_answer"])
    cleaned["short_answers"] = [
        _remove_html_byte_offsets(sa) for sa in annotation["short_answers"]
    ]
    return cleaned

  text = " ".join(_clean_token(t) for t in nq_example["document_tokens"])

  simplified_nq_example = {
      "question_text": nq_example["question_text"],
      "example_id": nq_example["example_id"],
      "document_url": nq_example["document_url"],
      "document_text": text,
      "long_answer_candidates": [
          _remove_html_byte_offsets(c)
          for c in nq_example["long_answer_candidates"]
      ],
      "annotations": [_clean_annotation(a) for a in nq_example["annotations"]],
  }

  # Sanity check: tokenizing the simplified text must give back exactly as
  # many tokens as the original example had.
  if len(get_nq_tokens(simplified_nq_example)) != len(
      nq_example["document_tokens"]):
    raise ValueError("Incorrect number of tokens.")

  return simplified_nq_example

In [None]:
# Convert every gzipped JSON-lines file in `input_dir` to its simplified form,
# writing one "simplified-<name>" gzip file per input file into `output_dir`.

# Cell-local imports: no import of these modules is visible above this cell,
# and importing a module that is already loaded is a cheap no-op.
import gzip
import json
import os

input_dir = "v1.0/train"
output_dir = "Newsimplified_natural_questions"

os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".jsonl.gz"):
        continue

    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, f"simplified-{filename}")

    print(f"Processing {filename}...")
    with gzip.open(input_file, 'rt', encoding='utf-8') as infile, \
            gzip.open(output_file, 'wt', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # A blank line is not a JSON record; json.loads would raise
                # on it and abort the whole run.
                continue

            nq_example = json.loads(line)  # Parse JSON
            try:
                # Only the simplification is expected to raise ValueError
                # (token count mismatch); keep the try body that narrow.
                simplified_example = simplify_nq_example(nq_example)
            except ValueError as e:
                print(f"Skipping example due to error: {e}")
                continue

            outfile.write(json.dumps(simplified_example) + '\n')  # Save