## Preprocessing

In [2]:
import os
import sys
import argparse

from dotenv import load_dotenv
# Load settings from a local .env file into the process environment (python-dotenv).
load_dotenv()
# Put the project root on the import path; presumably this is what lets `services.*` be imported later — confirm.
# NOTE(review): os.getenv returns None when ROOT_DIR is unset, and that None would be appended to sys.path.
sys.path.append(os.getenv('ROOT_DIR'))

# Data locations, all read from environment variables. An unset variable yields None,
# in which case the os.path.join calls below raise TypeError.
RAW_DATA_DIR = os.getenv('RAW_DATA_DIR')
PROCESSED_DATA_DIR = os.getenv('PROCESSED_DATA_DIR')
TESTING_DATA_DIR = os.getenv('TESTING_DATA_DIR')
# JSONL files, inside the processed folder, that hold the test contexts and test queries.
PROCESSED_TESTING_CONTEXTS_PATH = os.path.join(PROCESSED_DATA_DIR,"contexts-test.jsonl")
PROCESSED_TESTING_QUERIES_PATH = os.path.join(PROCESSED_DATA_DIR,"queries-test.jsonl")
# Dataset name -> language code of that dataset.
DATA_LANG_DICT = {"squad":"en", "korquad":"ko", "fquad":"fr", "germanquad":"de", "uitviquad":"vi"}

### Set Up

In [1]:

def create_context_and_query_files(contexts_filename = PROCESSED_TESTING_CONTEXTS_PATH, queries_filename = PROCESSED_TESTING_QUERIES_PATH):
    """
    Create the contexts and queries JSONL files as empty files.

    Each path is opened in write mode, so a file that already exists is truncated.
    A confirmation line is printed after each file has been created.

    Args:
        contexts_filename: path of the contexts JSONL file.
        queries_filename: path of the queries JSONL file.
    """
    targets = (
        (contexts_filename, "---Created contexts file at {filename}"),
        (queries_filename, "---Created queries file at {filename}"),
    )
    for path, message in targets:
        with open(path, "w") as handle:
            # Opening in "w" mode is what empties the file; the write adds nothing.
            handle.write("")
        print(message.format(filename=path))

def _remove_files(directory, suffixes=None):
    """Delete the files directly inside `directory`, optionally only those whose name ends with one of `suffixes`."""
    for filename in os.listdir(directory):
        if suffixes is not None and not filename.endswith(suffixes):
            continue
        path = os.path.join(directory, filename)
        # Sub-directories are left alone (the previous `rm -f` could not remove them either).
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)

def main():
    """
    Prepare the processed and testing data folders for a fresh run.

    - PROCESSED_DATA_DIR is created if missing; otherwise the files left by a previous
      run (names ending in "-test-translated.jsonl", "-test-neg.json" or "-test.json")
      are deleted.
    - TESTING_DATA_DIR is created if missing; otherwise every file in it is deleted.
    - Finally the contexts and queries JSONL files are (re)created empty.

    Directories and files are handled with the `os` module rather than shell commands,
    so paths containing spaces or shell metacharacters work, missing parent directories
    are created, and a failure to create or delete raises OSError instead of being
    silently ignored.
    """
    #create data directories or delete previous testing files
    if not os.path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    else:
        print(">> Starting delete existing files in processed folder")
        print()
        _remove_files(PROCESSED_DATA_DIR, suffixes=("-test-translated.jsonl", "-test-neg.json", "-test.json"))

    if not os.path.exists(TESTING_DATA_DIR):
        os.makedirs(TESTING_DATA_DIR, exist_ok=True)
    else:
        print(">> Starting delete existing files in testing input folder")
        print()
        _remove_files(TESTING_DATA_DIR)

    #check for context and query file, create if not exist
    create_context_and_query_files()

# Run the set-up when this cell executes (inside a notebook kernel __name__ is "__main__").
if __name__ == "__main__":
    main()

>> Starting delete existing files in processed folder

>> Starting delete existing files in testing input folder

---Created contexts file at /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl
---Created queries file at /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl


### Convert to Tabular form

In [5]:
from services.convert_data import convert_quad

def main():
    """
    Convert every raw test set named in DATA_LANG_DICT to its tabular form.

    For each dataset, "<name>-test.json" is read from RAW_DATA_DIR and the converted
    file is written under the same name in PROCESSED_DATA_DIR. The shared contexts and
    queries JSONL paths are also handed to `convert_quad`.
    """
    for dataset in DATA_LANG_DICT:
        # The raw and the converted file share one file name; only the folder differs.
        test_filename = "{data}-test.json".format(data=dataset)
        convert_quad(
            os.path.join(RAW_DATA_DIR, test_filename),
            os.path.join(PROCESSED_DATA_DIR, test_filename),
            PROCESSED_TESTING_CONTEXTS_PATH,
            PROCESSED_TESTING_QUERIES_PATH,
        )

if __name__ == '__main__':
    main()

Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/squad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/squad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl
Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/korquad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/korquad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl
Completed conversion of /Volumes/Users/ly_k1/Documents/mColBERT/data/raw/fquad-test.json
Saved to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/fquad-test.json
Added new contexts and queries to /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl and /V

### Translate data

In [6]:
from services.translate_queries import translate_all
            
def main():
    """
    Translate the queries of every dataset into all of the other languages.

    For each (dataset, language) pair in DATA_LANG_DICT, the tabular file
    "<name>-test.json" in PROCESSED_DATA_DIR is passed to `translate_all` together with
    its source language, the output path "<name>-test-translated.jsonl", and the list of
    target languages (every supported language except the source one).
    """
    all_languages = ["en", "de", "ko", "fr", "vi"]
    for dataset, source_lang in DATA_LANG_DICT.items():
        tabular_path = os.path.join(PROCESSED_DATA_DIR, "{data}-test.json".format(data=dataset))
        translated_path = os.path.join(PROCESSED_DATA_DIR, "{data}-test-translated.jsonl".format(data=dataset))
        target_langs = [lang for lang in all_languages if lang != source_lang]

        # NOTE(review): the queries path is passed for both of the last two arguments;
        # confirm against translate_all's signature that this is intended.
        translate_all(
            tabular_path,
            source_lang,
            translated_path,
            target_langs,
            PROCESSED_TESTING_QUERIES_PATH,
            PROCESSED_TESTING_QUERIES_PATH,
        )

if __name__ == '__main__':
    main()

Translating /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/squad-test.json
Finished translating 10 rows from en to de
Finished translating 20 rows from en to de
Finished translating 30 rows from en to de
Finished translating 40 rows from en to de
Finished translating 50 rows from en to de
Finished translating 60 rows from en to de
Finished translating 70 rows from en to de
Finished translating 80 rows from en to de
Finished translating 90 rows from en to de
Finished translating 100 rows from en to de
Finished translating 110 rows from en to de
Finished translating 120 rows from en to de
Finished translating 130 rows from en to de
Finished translating 140 rows from en to de
Finished translating 150 rows from en to de
Finished translating 160 rows from en to de
Finished translating 170 rows from en to de
Finished translating 180 rows from en to de
Finished translating 190 rows from en to de
Finished translating 200 rows from en to de
Finished translating 210 rows from en to de
Fi

### Prepare Testing Data

In [12]:
import jsonlines

def jsonl_to_tsv(data_filepath, output_data_filepath):
    """
    Convert a JSONL file of {<name>_id: _, <data>: _} records to a TSV file of id<TAB>data lines.

    For every record, the value of the key ending in "_id" is the row id and the value of
    the other key is the row text. Newlines are removed from the text and tabs are replaced
    by a space so that each record stays on one line with exactly two columns.

    Records are skipped when they have no id, no text, or a text that is empty or only
    whitespace. A record never inherits a field from the previous record.

    Args:
        data_filepath: path of the JSONL file to read.
        output_data_filepath: path of the TSV file to write (overwritten if it exists).
    """
    # Both files are managed by the `with` block so they are closed even if a record fails.
    with open(data_filepath, "r") as data_file, open(output_data_filepath, "w") as output_data_file:
        reader = jsonlines.Reader(data_file)
        for record in reader:
            # Reset per record so a missing field cannot reuse the previous record's value.
            record_id = None
            text = None
            for key, value in record.items():
                if key.endswith("_id"):
                    record_id = value
                else:
                    # A literal tab inside the text would add an extra column to the row.
                    text = value.replace("\n", "").replace("\t", " ")
            if record_id is None or text is None:
                continue
            if text and not text.isspace():
                output_data_file.write("{id}\t{data}\n".format(id = record_id, data = text))
    print("Converted {file} to {output_file}".format(file = data_filepath, output_file = output_data_filepath))

# Export the contexts first, then the queries, from the processed folder to the testing folder as TSV.
for jsonl_name, tsv_name in (("contexts-test.jsonl", "contexts.tsv"), ("queries-test.jsonl", "queries.tsv")):
    jsonl_to_tsv(os.path.join(PROCESSED_DATA_DIR, jsonl_name), os.path.join(TESTING_DATA_DIR, tsv_name))

Converted /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/contexts-test.jsonl to /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/contexts.tsv
Converted /Volumes/Users/ly_k1/Documents/mColBERT/data/processed/queries-test.jsonl to /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/queries.tsv


## Testing

### Create ground truth

In [13]:
import jsonlines

def get_ground_truth(data_filepath):
    """
    Build a lookup from query id to its expected context and languages.

    Args:
        data_filepath: path of a JSONL file whose records carry the keys
            "query_id", "context_id", "query_lang" and "context_lang".

    Returns:
        dict mapping each "query_id" to a dict holding that record's "context_id",
        "query_lang" and "context_lang". A later record with the same query id
        replaces an earlier one.
    """
    kept_fields = ("context_id", "query_lang", "context_lang")
    lookup = {}
    with jsonlines.open(data_filepath) as records:
        for record in records:
            expected = {field: record[field] for field in kept_fields}
            lookup[record["query_id"]] = expected
    return lookup


# Merge the ground truth of every translated dataset into one lookup keyed by query id.
ground_truth = {}
for dataset_name in DATA_LANG_DICT:
    translated_path = os.path.join(PROCESSED_DATA_DIR, "{data}-test-translated.jsonl".format(data=dataset_name))
    ground_truth.update(get_ground_truth(translated_path))
        

### Set up for testing

In [6]:
import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

# Index settings: bits used to encode each dimension, and the token length at which passages are truncated.
nbits, doc_maxlen = 2, 300
index_name = f'{nbits}bits'

# NOTE(review): absolute path on the author's machine; adjust before running elsewhere.
checkpoint = '/Volumes/Users/ly_k1/Documents/mColBERT/experiments/default/none/2024-04/13/23.13.02/checkpoints/colbert'

# Load the TSV files produced by the preparation step (queries first, then passages).
queries = Queries(path=os.path.join(TESTING_DATA_DIR,"queries.tsv"))
collection = Collection(path=os.path.join(TESTING_DATA_DIR,"contexts.tsv"))

f'Loaded {len(queries):,} queries and {len(collection):,} passages'

[Apr 15, 23:16:46] #> Loading the queries from /Volumes/Users/ly_k1/Documents/mColBERT/data/testing/queries.tsv ...
[Apr 15, 23:16:46] #> Got 121945 queries. All QIDs are unique.

[Apr 15, 23:16:46] #> Loading collection...
0M 


'Loaded 121,945 queries and 4,830 passages'

In [9]:
# Build the index named `index_name` over `collection`, using the model at `checkpoint`.
with Run().context(RunConfig(nranks=1, experiment='testing-official')):  # nranks specifies the number of GPUs to use.
    # doc_maxlen and nbits are the values chosen in the set-up cell.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits)

    indexer = Indexer(checkpoint=checkpoint, config=config)
    # overwrite=True presumably replaces an existing index of the same name — TODO confirm against the ColBERT docs.
    indexer.index(name=index_name, collection=collection, overwrite=True)
indexer.get_index() # You can get the absolute path of the index, if needed.



[Apr 15, 23:17:36] #> Creating directory /Volumes/Users/ly_k1/Documents/mColBERT/notebooks/experiments/testing-official/indexes/2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
    "mask_punctua

[W socket.cpp:426] [c10d] The server socket cannot be initialized on [::]:12749 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [::ffff:127.0.0.1]:12749 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [::ffff:127.0.0.1]:12749 (errno: 97 - Address family not supported by protocol).


[Apr 15, 23:17:40] [0] 		 # of sampled PIDs = 4830 	 sampled_pids[:3] = [3412, 83, 2446]
[Apr 15, 23:17:40] [0] 		 #> Encoding 4830 passages..
[Apr 15, 23:17:50] [0] 		 avg_doclen_est = 208.28944396972656 	 len(local_sample) = 4,830
[Apr 15, 23:17:52] [0] 		 Creaing 8,192 partitions.
[Apr 15, 23:17:52] [0] 		 *Estimated* 1,006,038 embeddings.
[Apr 15, 23:17:52] [0] 		 #> Saving the indexing plan to /Volumes/Users/ly_k1/Documents/mColBERT/notebooks/experiments/testing-official/indexes/2bits/plan.json ..
Clustering 956038 points in 128D to 8192 clusters, redo 1 times, 4 iterations
  Preprocessing in 0.04 s
  Iteration 3 (0.83 s, search 0.74 s): objective=396381 imbalance=1.227 nsplit=0       
[Apr 15, 23:17:54] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Apr 15, 23:17:55] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[0.052, 0.048, 0.05, 0.047, 0.049, 0.047, 0.043, 0.049, 0.

0it [00:00, ?it/s]

[Apr 15, 23:18:06] [0] 		 #> Saving chunk 0: 	 4,830 passages and 1,006,038 embeddings. From #0 onward.


1it [00:10, 10.32s/it]
100%|██████████| 1/1 [00:00<00:00, 567.33it/s]
100%|██████████| 8192/8192 [00:00<00:00, 223808.41it/s]


[Apr 15, 23:18:06] [0] 		 #> Checking all files were saved...
[Apr 15, 23:18:06] [0] 		 Found all files!
[Apr 15, 23:18:06] [0] 		 #> Building IVF...
[Apr 15, 23:18:06] [0] 		 #> Loading codes...
[Apr 15, 23:18:06] [0] 		 Sorting codes...
[Apr 15, 23:18:06] [0] 		 Getting unique codes...
[Apr 15, 23:18:06] #> Optimizing IVF to store map from centroids to list of pids..
[Apr 15, 23:18:06] #> Building the emb2pid mapping..
[Apr 15, 23:18:06] len(emb2pid) = 1006038
[Apr 15, 23:18:06] #> Saved optimized IVF to /Volumes/Users/ly_k1/Documents/mColBERT/notebooks/experiments/testing-official/indexes/2bits/ivf.pid.pt
[Apr 15, 23:18:06] [0] 		 #> Saving the indexing metadata to /Volumes/Users/ly_k1/Documents/mColBERT/notebooks/experiments/testing-official/indexes/2bits/metadata.json ..
#> Joined...


'/Volumes/Users/ly_k1/Documents/mColBERT/notebooks/experiments/testing-official/indexes/2bits'

In [10]:
# Open a searcher on the index built earlier. The experiment name matches the one used
# for indexing ('testing-official'); presumably that is how the index is located — confirm.
with Run().context(RunConfig(experiment='testing-official')):
    searcher = Searcher(index=index_name)

[Apr 15, 23:19:29] #> Loading collection...
0M 
[Apr 15, 23:19:31] #> Loading codec...
[Apr 15, 23:19:31] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Apr 15, 23:19:31] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Apr 15, 23:19:31] #> Loading IVF...
[Apr 15, 23:19:31] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 3209.11it/s]

[Apr 15, 23:19:31] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 69.41it/s]


In [11]:
# Search every query and keep the result as a dict (iterated later as query id -> ranking entries).
# k=10 presumably asks for the 10 best passages per query — confirm against the ColBERT docs.
res = searcher.search_all(queries, k=10).todict()

121945it [05:43, 354.74it/s]


In [23]:
# top10_dict[query_lang][context_lang] counts the queries whose expected context
# appears among the retrieved passages.
top10_dict = {q_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0) for q_lang in ["en", "de", "ko", "fr", "vi"]}

res_dict = res

for qid, rankings in res_dict.items():
    # The first item of each ranking entry is the passage id.
    retrieved_pids = [entry[0] for entry in rankings]
    truth = ground_truth[qid]
    if truth["context_id"] in retrieved_pids:
        top10_dict[truth["query_lang"]][truth["context_lang"]] += 1

In [25]:
# sum_dict[query_lang][context_lang] counts all ground-truth queries for that language pair.
sum_dict = {q_lang: dict.fromkeys(["en", "de", "ko", "fr", "vi"], 0) for q_lang in ["en", "de", "ko", "fr", "vi"]}

for row in ground_truth.values():
    sum_dict[row["query_lang"]][row["context_lang"]] += 1

In [24]:
# Show the hit counts, keyed by query language and then by context language.
top10_dict

{'en': {'en': 9394, 'de': 1705, 'ko': 4671, 'fr': 2481, 'vi': 2260},
 'de': {'en': 9062, 'de': 1765, 'ko': 3828, 'fr': 2383, 'vi': 2048},
 'ko': {'en': 8607, 'de': 1565, 'ko': 5467, 'fr': 2178, 'vi': 1980},
 'fr': {'en': 9062, 'de': 1651, 'ko': 3872, 'fr': 2645, 'vi': 2064},
 'vi': {'en': 8816, 'de': 1591, 'ko': 4149, 'fr': 2258, 'vi': 2425}}

In [26]:
# Hit rate per (query language, context language): hits divided by the number of queries for that pair.
result_dict = {
    query_lang: {
        context_lang: top10_dict[query_lang][context_lang] / total
        for context_lang, total in totals_by_context.items()
    }
    for query_lang, totals_by_context in sum_dict.items()
}
result_dict

{'en': {'en': 0.8887417218543047,
  'de': 0.7735934664246824,
  'ko': 0.8089712504329754,
  'fr': 0.7782308657465495,
  'vi': 0.8518658122879759},
 'de': {'en': 0.8573320719016083,
  'de': 0.8008166969147006,
  'ko': 0.6629719431936266,
  'fr': 0.7474905897114178,
  'vi': 0.7719562759140596},
 'ko': {'en': 0.8142857142857143,
  'de': 0.7100725952813067,
  'ko': 0.9468306200207828,
  'fr': 0.6831869510664994,
  'vi': 0.7463249151903506},
 'fr': {'en': 0.8573320719016083,
  'de': 0.7490925589836661,
  'ko': 0.6705923103567717,
  'fr': 0.8296737766624843,
  'vi': 0.7779871843196381},
 'vi': {'en': 0.8340586565752128,
  'de': 0.7218693284936479,
  'ko': 0.7185659854520263,
  'fr': 0.7082810539523212,
  'vi': 0.9140595552205051}}