# Data preprocessing for MLQA or SQuAD-style datasets

In [None]:
import os
import numpy as np
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset and every exported file
# in this notebook live on Drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset downloading
# NOTE(review): gdown is not invoked in any cell shown here — confirm it is still needed.
!pip install -q gdown

# System libraries required by FAISS
!sudo apt-get -qq install libopenblas-dev
!sudo apt-get -qq install libomp-dev

# Sentence encoder (provides SentenceTransformer)
!pip install -U -q sentence-transformers
# Vector index
!pip install -q faiss==1.5.3

# Word segmentation
# NOTE(review): pyvi is not imported in any cell shown here — confirm it is still needed.
!pip install -q pyvi

# Sentence segmentation (provides sent_tokenize)
!pip install -q underthesea

debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package libomp5:amd64.
(Reading database ... 155320 files and directories currently installed.)
Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ...
Unpacking libomp5:amd64 (5.0.1-1) ...
Selecting previously unselected package libomp-dev.
Preparing to unpack .../libomp-dev_5.0.1-1_amd64.deb ...
Unpacking libomp-dev (5.0.1-1) ...
Setting up libomp5:amd64 (5.0.1-1) ...
Setting up libomp-dev (5.0.1-1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.3) ...
/sbin/ldconfig.real: /usr/local/lib/python3.7/dist-packag

In [None]:
import pandas as pd
import numpy as np
import json

from os import path

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained multilingual sentence-embedding model by name.
# `encoder` is used further down to embed every sentence of the corpus.
encoder = SentenceTransformer('distiluse-base-multilingual-cased-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
# Base directory on Google Drive: the dataset is read from here and all exports are written here.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (Drive is mounted at /content/drive) — confirm.
ROOT_DIR = "drive/MyDrive/School/Academic/Research/exporting/MLQA/Test/"

# Input dataset and export file names (each export name is joined with ROOT_DIR when written)
DATASET = path.join(ROOT_DIR, "test-context-vi-question-vi.json")   # testset of the MLQA
EXPORT_SENTENCE_TEXT = "sentences.json"  # sentences grouped per document (list of lists)

EXPORT_DOCS_FILENAME = "docs.json"  # flat list of all sentences
EXPORT_DOC_RANGE_MAP = "doc_range_map.json"  # document id -> {"start", "end"} positions in the flat list
EXPORT_EMBEDDED_SENTENCES_FILENAME = "embedded_sentences.numpy.out"  # sentence embeddings written by np.savetxt
EXPORT_FAISS_INDEX = "sentence.index"  # serialized FAISS index

In [None]:
from typing import List, Dict
def get_content(data: List[Dict]) -> List:
  """Collect the context text of every paragraph in the dataset.

  Args:
    data: Dataset entries; each entry has a "paragraphs" list whose items
      carry a "context" string.

  Returns:
    All "context" values in one flat list, in dataset order (entry by entry,
    then paragraph by paragraph).
  """
  return [
    paragraph["context"]
    for article in data
    for paragraph in article["paragraphs"]
  ]

In [None]:
%%time
from underthesea import sent_tokenize

def preprocess(contents: List[str]) -> List:
  """Split every document into its sentences.

  Args:
    contents: Raw document texts.

  Returns:
    A list parallel to ``contents``; item ``i`` is the list of sentences that
    ``sent_tokenize`` produced for ``contents[i]``.
  """
  # If the embedding model requires word segmentation (e.g., PhoBERT), apply
  # word_tokenize(sent, format="text") to each sentence here.
  return [list(sent_tokenize(document)) for document in contents]

CPU times: user 34 µs, sys: 0 ns, total: 34 µs
Wall time: 43.2 µs


In [None]:
# Load dataset.
# Decode as UTF-8: the contexts are Vietnamese ("context-vi") and JSON is
# exchanged as UTF-8, whereas the "ascii" codec raises UnicodeDecodeError on the
# first non-ASCII byte.
with open(DATASET, encoding="utf-8") as f:
  corpus = json.load(f)
data = corpus["data"]

# Collect one context string per paragraph
contents = get_content(data)
print('Number of documents:', len(contents))

# Split every context into sentences
sentences = preprocess(contents)
# preprocess must return exactly one sentence list per document
assert len(contents) == len(sentences)

Number of documents: 4795


In [None]:
import json

# Persist the per-document sentence lists as indented UTF-8 JSON; non-ASCII
# characters are written as-is instead of being \u-escaped.
export_sentences_filename = os.path.join(ROOT_DIR, EXPORT_SENTENCE_TEXT)

with open(export_sentences_filename, "w+", encoding="utf-8") as f:
  json.dump(sentences, f, indent=4, ensure_ascii=False)

~~***Caution***: document indices start from **1**, not **0**, so `enumerate(iterable, start=1)` must be used.~~
(Obsolete: the cell below numbers documents from **0**.)

In [None]:
from itertools import chain

# Flatten the per-document sentence lists into one list (`docs`) and record, for
# every document id, the inclusive [start, end] positions its sentences occupy
# in that flat list.
doc_range_map = {}
docs = []
# an entry is a list (of sentences)
for doc_id, entry in enumerate(sentences, start=0):
  start = len(docs)
  # list.extend mutates in place and returns None, so its result must not be
  # assigned back to `docs`.
  docs.extend(entry)
  doc_range_map[doc_id] = {
      "start": start,
      # Inclusive index of the document's last sentence; for a document with
      # no sentences this is start - 1 (an empty range).
      "end": len(docs) - 1,
  }

# Every sentence must have landed in the flat list exactly once
assert len(docs) == sum(len(entry) for entry in sentences)

print("N doc_range_map:", len(doc_range_map))
print("N docs:", len(docs))

N doc_range_map: 4795
N docs: 30245


In [None]:
# Persist the flat sentence list and the document -> sentence-range map as JSON.
docs_filename = os.path.join(
    ROOT_DIR,
    EXPORT_DOCS_FILENAME,
)
doc_range_map_filename = os.path.join(
    ROOT_DIR,
    EXPORT_DOC_RANGE_MAP,
)
# ensure_ascii=False emits raw non-ASCII characters, so the encoding is pinned
# to UTF-8 instead of relying on the platform default (same as sentences.json).
with open(docs_filename, "w+", encoding="utf-8") as f:
  f.write(json.dumps(docs, ensure_ascii=False))

# The integer document ids become string keys in the JSON object; readers have
# to convert them back with int() if they need integer keys.
with open(doc_range_map_filename, "w+", encoding="utf-8") as f:
  f.write(json.dumps(doc_range_map, ensure_ascii=False))

In [None]:
# Embed every sentence of the flat `docs` list, 100 sentences per batch, with a progress bar.
# NOTE(review): row i of the result is presumably the embedding of docs[i] — confirm ordering.
embedded_sentences = encoder.encode(sentences=docs, batch_size=100, show_progress_bar=True)

Batches:   0%|          | 0/303 [00:00<?, ?it/s]

In [None]:
# Persist the sentence embeddings next to the other exports.
embedded_sentences_filename = os.path.join(
    ROOT_DIR,
    EXPORT_EMBEDDED_SENTENCES_FILENAME,
)
# np.savetxt writes the array as plain text, so read it back with np.loadtxt.
np.savetxt(embedded_sentences_filename, embedded_sentences)

In [None]:
import faiss
# Embedding dimensionality, read from the second axis of the embedding matrix.
FEATURE_SIZE = embedded_sentences.shape[1]

# Build an inner-product (IP) FAISS index over the sentence embeddings.
# NOTE(review): inner product equals cosine similarity only for L2-normalized
# vectors; the embeddings are not normalized in this notebook — confirm that is intended.
sentence_index = faiss.IndexFlatIP(FEATURE_SIZE)
# NOTE(review): the comment below says IndexFlatIP needs no training, so this
# call looks redundant — confirm and consider removing it.
sentence_index.train(embedded_sentences)
# Check whether the index reports itself as trained;
# by default, IndexFlatIP does not need to be trained beforehand.
print('Is the model trained? ', str(sentence_index.is_trained))
# Add the dense vectors (embedded_sentences) to the FAISS index
sentence_index.add(embedded_sentences)
# Report how many vectors the index now holds (printed twice, with different labels)
print('Number of entries in the index: ', str(sentence_index.ntotal))
print("N total:", sentence_index.ntotal)

Is the model trained?  True
Number of entries in the index:  30245
N total: 30245


In [None]:
# Serialize the populated FAISS index to ROOT_DIR so it can be reloaded later
# without re-embedding the corpus.
faiss.write_index(
    sentence_index,
    os.path.join(ROOT_DIR, EXPORT_FAISS_INDEX),
)