### Customize your local configuration

Create a .env file in this folder and configure your local path to following properties: 

```
ROOT_PATH =                     # path to root folder where alignment result to process as well output benchmark directory is located
ALIGNMENT_RESULTS_FOLDER_NAME = # folder name with alignment results 
BENCHMARK_FOLDER_NAME  =        # folder name with benchmark results

DATASET_WIKI =                  # path to wiki-manual
DATASET_NEWSELA =               # path to newsela-manual
DATASET_SICK =                  # path to Sick
DATASET_STS12 =                 # path to STS
DATASET_STS13 =                 # path to STS
DATASET_STS14 =                 # path to STS
DATASET_STS15 =                 # path to STS
DATASET_STS16 =                 # path to STS
DATASET_STSBENCHMARK =          # path to STS Benchmark
DATASET_RTE1_CD =               # path to RTE CD
DATASET_RTE1_PP =               # path to RTE PP
DATASET_PARA =                  # path to PARA
DATASET_ONESTOPENGLISH =        # path to ONESTOPENGLISH   
```

In [4]:
import os
from dotenv import  load_dotenv
load_dotenv()
ROOT_PATH = os.getenv('ROOT_PATH')                                          
ALIGNMENT_RESULTS_FOLDER_NAME = os.getenv('ALIGNMENT_RESULTS_FOLDER_NAME')   
BENCHMARK_FOLDER_NAME = os.getenv('BENCHMARK_FOLDER_NAME')              

In [2]:
DATASET_WIKI  = os.getenv('DATASET_WIKI')
DATASET_NEWSELA  = os.getenv('DATASET_NEWSELA') 
DATASET_SICK  = os.getenv('DATASET_SICK') 
DATASET_STS12  = os.getenv('DATASET_STS12') 
DATASET_STS13  = os.getenv('DATASET_STS13') 
DATASET_STS14  = os.getenv('DATASET_STS14') 
DATASET_STS15  = os.getenv('DATASET_STS15') 
DATASET_STS16  = os.getenv('DATASET_STS16') 
DATASET_STSBENCHMARK  = os.getenv('DATASET_STSBENCHMARK') 
DATASET_RTE1_CD  = os.getenv('DATASET_RTE1_CD') 
DATASET_RTE1_PP  = os.getenv('DATASET_RTE1_PP') 
DATASET_PARA  = os.getenv('DATASET_PARA') 
DATASET_ONESTOPENGLISH  = os.getenv('DATASET_ONESTOPENGLISH')   

TEST_DATA_SETS_DETAILS = {
    "wiki":[os.path.join(DATASET_WIKI, "test.tsv"), (0,3,4)], 
    "newsela":[os.path.join(DATASET_NEWSELA, "test.tsv"), (0,3,4)],
    "sick":[os.path.join(DATASET_SICK, "sick_test_alignment.txt"),  (0,1,2)],
    "sts12":[os.path.join(DATASET_STS12, "mteb_sts12-sts.txt"),  (0,1,2)],
    "sts13":[os.path.join(DATASET_STS13, "mteb_sts13-sts.txt"),  (0,1,2)],
    "sts14":[os.path.join(DATASET_STS14, "mteb_sts14-sts.txt"),  (0,1,2)],
    "sts15":[os.path.join(DATASET_STS15, "mteb_sts15-sts.txt"),  (0,1,2)],
    "sts16":[os.path.join(DATASET_STS16, "mteb_sts16-sts.txt"),  (0,1,2)], 
    "stsbenchmark":[os.path.join(DATASET_STSBENCHMARK, "mteb_stsbenchmark-sts.txt"),  (0,1,2)], 
    "rte1-cd":[os.path.join(DATASET_RTE1_CD, "rte1_cd.txt"),  (0,1,2)],  
    "rte1-pp":[os.path.join(DATASET_RTE1_PP, "rte1_pp.txt"),  (0,1,2)], 
    "para" :[os.path.join(DATASET_PARA, "para_test.txt"),  (0,1,2)],  
    "OneStopEnglish" :[os.path.join(DATASET_ONESTOPENGLISH, "OneStopEnglishCorpus_Sentence-Aligned.txt"), (0,1,2)]     
}

In [3]:
import random
import pathlib
import tqdm
import glob
import random
from pathlib import Path   
path_to_files = os.path.join(ROOT_PATH, ALIGNMENT_RESULTS_FOLDER_NAME)
random.seed(10)

def read_ds(path_to_ds, index_label=0, index_src=3, index_trg=4, previous_version=False, return_full_line=False):
    
    source = []
    target = []
    aligned = []
    full_lines = []
    is_eof = False
    if not previous_version:
        with open(path_to_ds, 'r', encoding='utf-8') as f:
            for line in f:
                if len(line.strip()) > 0:
                    items = line.split("\t")
                    aligned.append(items[index_label].strip())
                    source.append(items[index_src].strip())
                    target.append(items[index_trg].strip())
                    full_lines.append(line.strip())
    else:
        file = open(path_to_ds, 'r', encoding='utf-8')
        while not is_eof:
            line = file.readline()
            if len(line.strip()) > 0:
                items = line.split("\t")
                aligned.append(items[index_label].strip())
                source.append(items[index_src].strip())
                target.append(items[index_trg].strip())
                full_lines.append(line.strip())
            else:
                is_eof = True 
    if return_full_line:
        return source, target, aligned, full_lines
    else:
        return source, target, aligned    

def split_filename(file_name):
    file_name = file_name.split(".")[0]
    return file_name.split("_")

def read_unique(path_to_files, index_label, index_src, index_trg):
    sentences = []
    for path_to_file in path_to_files:
        source, target, aligned = read_ds(path_to_file, index_label=index_label, index_src=index_src, index_trg=index_trg)
        for index in range(len(source)):
            sentences.append(f'{aligned[index].strip()}\t{source[index].strip()}\t{target[index].strip()}')
    uniqe = set(sentences)
    return uniqe  

def fix_crf_file_name(path_to_files):
    files_to_process = sorted(pathlib.Path(path_to_files).glob('*_chaojiang06_*.txt'))
    for fileName in files_to_process:
        os.rename(fileName, str(fileName).replace("_chaojiang06_", "_chaojiang06-"))
    files_to_process = sorted(pathlib.Path(path_to_files).glob('*_rte1_*.txt'))
    for fileName in files_to_process:
        os.rename(fileName, str(fileName).replace("_rte1_", "_rte1-"))   
    
            
def get_negatives(path_to_files):
    output = {}
    negatives = []
    current = set()
    files_to_process = sorted(pathlib.Path(path_to_files).glob('*.txt'))
    for txt_file in  tqdm.tqdm_notebook(files_to_process):
        file_name = Path(txt_file).name
        file_name_attr = split_filename(file_name)
        method, ds, error_type = file_name_attr[1], file_name_attr[-2], file_name_attr[-1]
        lines = []
        with open(txt_file, 'r', encoding='utf-8') as f:
            for line in f:
                lines.append(line.strip())    
        uniqlines = set(lines)
        to_add = uniqlines.difference(current)
        output["\t"+method.upper()+"\t"+ds.upper()+"\t"+error_type.upper()] = to_add
        current.update(to_add)
            
    for item in output.keys():
        lines = output[item]
        for line in lines: 
            line = line.strip()
            if len(line)>0:
                negatives.append(line+item)
    return negatives

def get_positives(path_to_files, percentTPFN=0.05):
    ds_set = {}
    ds_fptn_set = {}
    all_records = []
    for ds in TEST_DATA_SETS_DETAILS.keys():
        unique_ds =  read_unique([TEST_DATA_SETS_DETAILS[ds][0]], index_label=TEST_DATA_SETS_DETAILS[ds][1][0], index_src=TEST_DATA_SETS_DETAILS[ds][1][1], index_trg=TEST_DATA_SETS_DETAILS[ds][1][2])
        ds_set[ds] = unique_ds
        all_records.extend(list(unique_ds))
        
        file_paths =  glob.glob(os.path.join(path_to_files, f'*{ds}_F*.txt'))
        unique_ds_fptn = read_unique(file_paths, index_label=0, index_src=1, index_trg=2)
        ds_fptn_set[ds] = unique_ds_fptn

            

    positives = []
    for key in ds_set.keys():
        remainsentences = list(ds_fptn_set[key] ^ ds_set[key])
        positive_per_ds = int(len(ds_fptn_set[key]) * percentTPFN )
        TN_items, TP_items = [], []
        for item in remainsentences:
            alignment = item.split("\t")[0]
            if "aligned" ==alignment:
                TP_items.append(item+"\t-1\t0.0\tGROUND-TRUTH\t"+key.upper()+"\tTP")
            else:
                TN_items.append(item+"\t-1\t0.0\tGROUND-TRUTH\t"+key.upper()+"\tTN")
        
        positives.extend(random.choices(TP_items, k=min(int(positive_per_ds/2), len(TP_items))))
        positives.extend(random.choices(TN_items, k=min(int(positive_per_ds/2), len(TN_items))))                
    return positives    

to_write = []
fix_crf_file_name(path_to_files)
negatives = get_negatives(path_to_files)
positives = get_positives(path_to_files)
to_write.extend(negatives)
to_write.extend(positives)



with open(os.path.join(ROOT_PATH,BENCHMARK_FOLDER_NAME,"benchmark_doublets.out"), 'w', encoding='utf-8') as file_handler:
    for item in to_write:
        file_handler.write("{}\n".format(item))   

ids_to_remove = []
final_set = set()
with open(os.path.join(ROOT_PATH,BENCHMARK_FOLDER_NAME,"benchmark_doublets.out"), 'r', encoding='utf-8') as f:
    for index, line in enumerate(f):
        items = line.split("\t")   
        key = items[0]+"\t"+items[1]+"\t"+items[2]
        if key in final_set:
           ids_to_remove.append(index)
        else: 
            final_set.add(key)
print(f'Found {len(ids_to_remove)} to remove') 

with open(os.path.join(ROOT_PATH,BENCHMARK_FOLDER_NAME,"benchmark.out"), 'w', encoding='utf-8') as file_handler:
    for index, item in enumerate(to_write):
        if index not in ids_to_remove:
            file_handler.write("{}\n".format(item))   
       

count_errors = 0
ds = {}
with open(os.path.join(ROOT_PATH,BENCHMARK_FOLDER_NAME,"benchmark.out"), 'r', encoding='utf-8') as f:
    for index, line in enumerate(f):
        items = line.split("\t")   
        if len(items)!=8:
            print(f'Error in {index} - {len(items)} - {line}')
            count_errors+=1
        if items[-2] not in ds:
                ds[items[-2]] = 0
        ds[items[-2]] +=1

print("DS stats")    
print(f'Total nbr of records: {index}')
for key in ds:
    print(f'\t{key} -> {ds[key]}')

print(f'Nbr of errors: {count_errors}')
print(f'Benchmark done ')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt_file in  tqdm.tqdm_notebook(files_to_process):


  0%|          | 0/338 [00:00<?, ?it/s]

Found 68247 to remove
DS stats
Total nbr of records: 18774
	ONESTOPENGLISH -> 156
	NEWSELA -> 1412
	PARA -> 838
	RTE1-CD -> 109
	RTE1-PP -> 33
	SICK -> 3827
	STS12 -> 2394
	STS13 -> 1054
	STS14 -> 3373
	STS15 -> 1910
	STS16 -> 890
	STSBENCHMARK -> 1133
	WIKI -> 1646
Nbr of errors: 0
Benchmark done 
