## Preprocessing

In [2]:
import os
import sys
import argparse

from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('ROOT_DIR'))

# Data directories are read from environment variables (load_dotenv() runs earlier in this cell).
RAW_DATA_DIR = os.environ.get('RAW_DATA_DIR')
PROCESSED_DATA_DIR = os.environ.get('PROCESSED_DATA_DIR')
TESTING_DATA_DIR = os.environ.get('TESTING_DATA_DIR')

# JSONL files inside the processed folder that collect the test contexts and queries.
PROCESSED_TESTING_CONTEXTS_PATH = os.path.join(PROCESSED_DATA_DIR, "contexts-test.jsonl")
PROCESSED_TESTING_QUERIES_PATH = os.path.join(PROCESSED_DATA_DIR, "queries-test.jsonl")

# Dataset name -> language code of that dataset.
DATA_LANG_DICT = {
    "squad": "en",
    "korquad": "ko",
    "fquad": "fr",
    "germanquad": "de",
    "uitviquad": "vi",
}

### Set Up

In [1]:

def create_context_and_query_files(contexts_filename = PROCESSED_TESTING_CONTEXTS_PATH, queries_filename = PROCESSED_TESTING_QUERIES_PATH):
    """Create the contexts and queries JSONL files as empty files.

    Each path is opened in write mode, so an existing file is truncated to
    zero length. A confirmation line is printed after each file is written.

    Args:
        contexts_filename: path of the contexts JSONL file to create.
        queries_filename: path of the queries JSONL file to create.
    """
    targets = (
        ("---Created contexts file at {filename}", contexts_filename),
        ("---Created queries file at {filename}", queries_filename),
    )
    for message, path in targets:
        # Opening with "w" creates the file or empties an existing one.
        with open(path, "w") as handle:
            handle.write("")
        print(message.format(filename=path))

def _remove_files(directory, suffixes=None):
    """Delete the regular files (and symlinks) directly inside `directory`.

    Args:
        directory: folder whose entries are examined (not recursive).
        suffixes: optional tuple of filename suffixes; when given, only files
            whose name ends with one of them are deleted.

    Sub-directories are left in place and a file that has already vanished is
    ignored. Any other OSError (e.g. permission denied) propagates, so stale
    files cannot silently survive the cleanup.
    """
    for filename in os.listdir(directory):
        if suffixes is not None and not filename.endswith(suffixes):
            continue
        path = os.path.join(directory, filename)
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        try:
            os.remove(path)
        except FileNotFoundError:
            pass


def main():
    """Prepare the processed and testing folders for a fresh run.

    - PROCESSED_DATA_DIR: created if missing; otherwise previously generated
      test artefacts (names ending in "-test-translated.jsonl",
      "-test-neg.json" or "-test.json") are deleted.
    - TESTING_DATA_DIR: created if missing; otherwise every file in it is
      deleted.
    - Finally the empty contexts/queries files are (re)created.

    Directories and files are handled through the os module rather than shell
    commands, so paths containing spaces or shell metacharacters are safe.
    """
    #create data directories or delete previous testing files
    if not os.path.exists(PROCESSED_DATA_DIR):
        # makedirs also creates any missing parent folders.
        os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    else:
        print(">> Starting delete existing files in processed folder")
        print()
        _remove_files(
            PROCESSED_DATA_DIR,
            suffixes=("-test-translated.jsonl", "-test-neg.json", "-test.json"),
        )

    if not os.path.exists(TESTING_DATA_DIR):
        os.makedirs(TESTING_DATA_DIR, exist_ok=True)

    else:
        print(">> Starting delete existing files in testing input folder")
        print()
        _remove_files(TESTING_DATA_DIR)

    #check for context and query file, create if not exist
    create_context_and_query_files()

if __name__ == "__main__":
    main()

>> Starting delete existing files in processed folder

>> Starting delete existing files in testing input folder

---Created contexts file at /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl
---Created queries file at /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl


### Convert to Tabular form

In [5]:
from services.convert_data import convert_quad

def main():
    """Run convert_quad for every dataset listed in DATA_LANG_DICT.

    For each dataset name, the raw file "<name>-test.json" in RAW_DATA_DIR and
    the same-named destination in PROCESSED_DATA_DIR are passed to
    convert_quad together with the shared contexts and queries JSONL paths.
    """
    for dataset in DATA_LANG_DICT:
        test_filename = "{data}-test.json".format(data=dataset)
        convert_quad(
            os.path.join(RAW_DATA_DIR, test_filename),
            os.path.join(PROCESSED_DATA_DIR, test_filename),
            PROCESSED_TESTING_CONTEXTS_PATH,
            PROCESSED_TESTING_QUERIES_PATH,
        )

if __name__ == '__main__':
    main()

Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/squad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/squad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl
Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/korquad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/korquad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl
Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/fquad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/fquad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /V

### Translate data

In [None]:
from services.translate_queries import translate_all
            
def main():
    """Run translate_all once for every dataset listed in DATA_LANG_DICT.

    For each dataset, translate_all receives the tabular file
    "<name>-test.json", the dataset's own language, the output path
    "<name>-test-translated.jsonl" and the list of the other four languages.
    """
    all_languages = ["en", "de", "ko", "fr", "vi"]
    for dataset, source_lang in DATA_LANG_DICT.items():
        tabular_path = os.path.join(PROCESSED_DATA_DIR, "{data}-test.json".format(data=dataset))
        translated_path = os.path.join(PROCESSED_DATA_DIR, "{data}-test-translated.jsonl".format(data=dataset))
        # Every language except the one the dataset is already written in.
        target_langs = [code for code in all_languages if code != source_lang]

        # NOTE(review): the queries path is passed for both of the last two
        # arguments - confirm against translate_all's signature that this is intended.
        translate_all(
            tabular_path,
            source_lang,
            translated_path,
            target_langs,
            PROCESSED_TESTING_QUERIES_PATH,
            PROCESSED_TESTING_QUERIES_PATH,
        )

if __name__ == '__main__':
    main()

### Prepare Testing Data

In [12]:
import jsonlines

def jsonl_to_tsv(data_filepath, output_data_filepath):
    """
    Convert a JSONL file with format {id:_,data:_} to a TSV file with one
    `id<TAB>data` row per record.

    For every record, the value of the key ending in "_id" becomes the first
    column and the value of the other key becomes the second column (if a
    record has several such keys, the last one wins).

    The data text is flattened so that each record occupies exactly one
    two-column row: newlines and carriage returns are removed and tabs are
    replaced by a single space.

    Records without an "_id" key, or whose data is missing, empty or
    whitespace-only, are skipped.

    Args:
        data_filepath: path of the JSONL file to read (UTF-8).
        output_data_filepath: path of the TSV file to write (UTF-8); an
            existing file is overwritten.
    """
    # Both handles are managed by `with`, so they are closed even if a record
    # fails to parse half-way through the file.
    with open(data_filepath, "r", encoding="utf-8") as data_file, \
            open(output_data_filepath, "w", encoding="utf-8") as output_data_file:
        for record in jsonlines.Reader(data_file):
            # Reset per record: a record that lacks a field must never inherit
            # the id or text of the previous record.
            record_id = None
            text = None
            for key, value in record.items():
                if key.endswith("_id"):
                    record_id = value
                else:
                    # A raw tab would add a third column and a carriage return
                    # can be read back as a line break; neutralise both.
                    text = value.replace("\n", "").replace("\r", "").replace("\t", " ")
            if record_id is None or not text or text.isspace():
                continue
            output_data_file.write("{id}\t{data}\n".format(id = record_id, data = text))
    print("Converted {file} to {output_file}".format(file = data_filepath, output_file = output_data_filepath))

# Export the collected contexts and queries from the processed folder as TSV files in the testing folder.
for jsonl_name, tsv_name in (("contexts-test.jsonl", "contexts.tsv"), ("queries-test.jsonl", "queries.tsv")):
    jsonl_to_tsv(os.path.join(PROCESSED_DATA_DIR, jsonl_name), os.path.join(TESTING_DATA_DIR, tsv_name))

Converted /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl to /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/contexts.tsv
Converted /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl to /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/queries.tsv


## Testing

In [2]:
import os
import sys
import argparse

from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('ROOT_DIR'))

# Data directories are read from environment variables (load_dotenv() runs earlier in this cell).
RAW_DATA_DIR = os.environ.get('RAW_DATA_DIR')
PROCESSED_DATA_DIR = os.environ.get('PROCESSED_DATA_DIR')
TESTING_DATA_DIR = os.environ.get('TESTING_DATA_DIR')

# JSONL files inside the processed folder that collect the test contexts and queries.
PROCESSED_TESTING_CONTEXTS_PATH = os.path.join(PROCESSED_DATA_DIR, "contexts-test.jsonl")
PROCESSED_TESTING_QUERIES_PATH = os.path.join(PROCESSED_DATA_DIR, "queries-test.jsonl")

# Dataset name -> language code of that dataset.
DATA_LANG_DICT = {
    "squad": "en",
    "korquad": "ko",
    "fquad": "fr",
    "germanquad": "de",
    "uitviquad": "vi",
}

### Create ground truth

In [3]:
import jsonlines

def get_ground_truth(data_filepath):
    """Load the expected retrieval result for every query in a JSONL file.

    Args:
        data_filepath: path of a JSONL file whose records contain the keys
            "query_id", "context_id", "query_lang" and "context_lang".

    Returns:
        dict mapping each query_id to a dict with the record's "context_id",
        "query_lang" and "context_lang". If a query_id occurs more than once,
        the last record wins.
    """
    kept_fields = ("context_id", "query_lang", "context_lang")
    truth_by_query = {}
    with jsonlines.open(data_filepath) as reader:
        for record in reader:
            entry = {field: record[field] for field in kept_fields}
            truth_by_query[record["query_id"]] = entry
    return truth_by_query


# Merge the ground truth of every dataset's translated file into one lookup keyed by query id.
ground_truth = {}
for data_name in DATA_LANG_DICT:
    translated_filename = "{data}-test-translated.jsonl".format(data=data_name)
    ground_truth.update(get_ground_truth(os.path.join(PROCESSED_DATA_DIR, translated_filename)))
        

### Set up for testing

In [4]:
import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

# Settings shared by the indexing and searching cells.
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300   # truncate passages at 300 tokens
index_name = f'{nbits}bits'

# NOTE(review): machine-specific absolute path to the trained checkpoint.
checkpoint = '/Volumes/Users/ly_k1/Documents/mColBERT/experiments/default/none/2024-04/13/23.13.02/checkpoints/colbert'

# Load the TSV files from the testing folder (queries first, then the passage collection).
queries = Queries(path=os.path.join(TESTING_DATA_DIR, "queries.tsv"))
collection = Collection(path=os.path.join(TESTING_DATA_DIR, "contexts.tsv"))

f'Loaded {len(queries):,} queries and {len(collection):,} passages'

[Apr 22, 11:52:33] #> Loading the queries from /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/queries.tsv ...
[Apr 22, 11:52:33] #> Got 121945 queries. All QIDs are unique.

[Apr 22, 11:52:33] #> Loading collection...
0M 


'Loaded 121,945 queries and 4,830 passages'

### Indexing

In [6]:
# Build the index for `collection` with the trained checkpoint, inside the
# 'testing-official' experiment run context.
with Run().context(RunConfig(nranks=1, experiment='testing-official')):  # nranks specifies the number of GPUs to use.
    # Same passage length limit and bit width as defined in the set-up cell.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits)

    indexer = Indexer(checkpoint=checkpoint, config=config)
    # NOTE(review): overwrite=True presumably replaces an existing index with the same name - confirm.
    indexer.index(name=index_name, collection=collection, overwrite=True)
indexer.get_index() # You can get the absolute path of the index, if needed.

### Searching

In [7]:
# Create a Searcher for the index named `index_name`, using the same
# experiment name ('testing-official') as the indexing cell.
with Run().context(RunConfig(experiment='testing-official')):
    searcher = Searcher(index=index_name)

[Apr 22, 11:53:16] #> Loading collection...
0M 
[Apr 22, 11:53:17] #> Loading codec...
[Apr 22, 11:53:17] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Apr 22, 11:53:17] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Apr 22, 11:53:18] #> Loading IVF...
[Apr 22, 11:53:18] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 844.09it/s]

[Apr 22, 11:53:18] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 63.91it/s]


In [8]:
# Search every query with k=10 and convert the result to a dict. The cells
# below iterate it as query id -> ranking, reading the passage id from
# position 0 of each ranking entry.
res = searcher.search_all(queries, k=10).todict()

121945it [05:51, 347.29it/s]


In [7]:
# Number of ground-truth queries per (query language, context language) pair;
# used as the denominator of the accuracy tables.
sum_dict = {
    query_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0)
    for query_lang in ["en", "de", "ko", "fr", "vi"]
}

for row in ground_truth.values():
    sum_dict[row["query_lang"]][row["context_lang"]] += 1

In [8]:
# Top-10 hits per (query language, context language) pair: a query counts as a
# hit when its ground-truth context id appears anywhere in its ranking.
top10_dict = {
    query_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0)
    for query_lang in ["en", "de", "ko", "fr", "vi"]
}

res_dict = res

for qid, rankings in res_dict.items():
    gold = ground_truth[qid]
    retrieved_pids = [hit[0] for hit in rankings]
    if gold["context_id"] not in retrieved_pids:
        continue
    top10_dict[gold["query_lang"]][gold["context_lang"]] += 1

# Hit rate = hits / number of ground-truth queries for the same language pair.
accuracy = {
    lang: {
        context_lang: top10_dict[lang][context_lang] / total
        for context_lang, total in per_context.items()
    }
    for lang, per_context in sum_dict.items()
}
accuracy

{'en': {'en': 0.8887417218543047,
  'de': 0.7735934664246824,
  'ko': 0.8089712504329754,
  'fr': 0.7782308657465495,
  'vi': 0.8518658122879759},
 'de': {'en': 0.8573320719016083,
  'de': 0.8008166969147006,
  'ko': 0.6629719431936266,
  'fr': 0.7474905897114178,
  'vi': 0.7719562759140596},
 'ko': {'en': 0.8142857142857143,
  'de': 0.7100725952813067,
  'ko': 0.9468306200207828,
  'fr': 0.6831869510664994,
  'vi': 0.7463249151903506},
 'fr': {'en': 0.8573320719016083,
  'de': 0.7490925589836661,
  'ko': 0.6705923103567717,
  'fr': 0.8296737766624843,
  'vi': 0.7779871843196381},
 'vi': {'en': 0.8340586565752128,
  'de': 0.7218693284936479,
  'ko': 0.7185659854520263,
  'fr': 0.7082810539523212,
  'vi': 0.9140595552205051}}

In [9]:
# Top-5 hits per (query language, context language) pair: a query counts as a
# hit when its ground-truth context id is among the first 5 ranking entries.
top5_dict = {
    query_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0)
    for query_lang in ["en", "de", "ko", "fr", "vi"]
}

res_dict = res

for qid, rankings in res_dict.items():
    gold = ground_truth[qid]
    retrieved_pids = [hit[0] for hit in rankings[:5]]
    if gold["context_id"] not in retrieved_pids:
        continue
    top5_dict[gold["query_lang"]][gold["context_lang"]] += 1

# Hit rate = hits / number of ground-truth queries for the same language pair.
accuracy = {
    lang: {
        context_lang: top5_dict[lang][context_lang] / total
        for context_lang, total in per_context.items()
    }
    for lang, per_context in sum_dict.items()
}
accuracy

{'en': {'en': 0.867833491012299,
  'de': 0.7617967332123412,
  'ko': 0.7753723588500173,
  'fr': 0.743099121706399,
  'vi': 0.8198266113833396},
 'de': {'en': 0.8319772942289498,
  'de': 0.7921960072595281,
  'ko': 0.6179425008659508,
  'fr': 0.705771643663739,
  'vi': 0.7263475310968714},
 'ko': {'en': 0.7771050141911069,
  'de': 0.6928312159709619,
  'ko': 0.9350536889504676,
  'fr': 0.6358218318695107,
  'vi': 0.7108933283075763},
 'fr': {'en': 0.830558183538316,
  'de': 0.7318511796733213,
  'ko': 0.6271215794942847,
  'fr': 0.8036386449184442,
  'vi': 0.7327553712777988},
 'vi': {'en': 0.8026490066225166,
  'de': 0.707350272232305,
  'ko': 0.6827156217526844,
  'fr': 0.6590338770388958,
  'vi': 0.8993592159819073}}

In [11]:
# Top-1 hits per (query language, context language) pair: a query counts as a
# hit when its ground-truth context id is the first ranking entry.
top1_dict = {
    query_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0)
    for query_lang in ["en", "de", "ko", "fr", "vi"]
}

res_dict = res

for qid, rankings in res_dict.items():
    gold = ground_truth[qid]
    retrieved_pids = [hit[0] for hit in rankings[:1]]
    if gold["context_id"] not in retrieved_pids:
        continue
    top1_dict[gold["query_lang"]][gold["context_lang"]] += 1

# Hit rate = hits / number of ground-truth queries for the same language pair.
accuracy = {
    lang: {
        context_lang: top1_dict[lang][context_lang] / total
        for context_lang, total in per_context.items()
    }
    for lang, per_context in sum_dict.items()
}
accuracy

{'en': {'en': 0.7487228003784295,
  'de': 0.6746823956442831,
  'ko': 0.6222722549359196,
  'fr': 0.5699498117942283,
  'vi': 0.6532227666792311},
 'de': {'en': 0.6923368022705771,
  'de': 0.7245916515426497,
  'ko': 0.4620713543470731,
  'fr': 0.541405269761606,
  'vi': 0.5454202789295137},
 'ko': {'en': 0.6161778618732261,
  'de': 0.6048094373865699,
  'ko': 0.8481122272254936,
  'fr': 0.4598494353826851,
  'vi': 0.5367508480964945},
 'fr': {'en': 0.689120151371807,
  'de': 0.6424682395644283,
  'ko': 0.4624177346726706,
  'fr': 0.6609159347553325,
  'vi': 0.5702977761025254},
 'vi': {'en': 0.6463576158940397,
  'de': 0.6188747731397459,
  'ko': 0.5261517145826117,
  'fr': 0.4883939774153074,
  'vi': 0.7810026385224275}}