In [None]:
from underthesea import sent_tokenize, word_tokenize

# Local directory holding the PhoBERT-base fairseq files. The checkpoint
# 'model.pt' is loaded from it below, and 'bpe.codes' is read from it later
# in this notebook.
PhoBERT_base_fairseq_path = \
"/Users/n2t2k/Documents/Studying/Master/Thesis/InProgress/Coding/ORIGIN_RUN_ALL_edge-oriented-graph-master-studying/PhoBERT_base_fairseq"
from fairseq.models.roberta import RobertaModel
# Load the pretrained checkpoint through fairseq's RoBERTa loader.
phoBERT = RobertaModel.from_pretrained(PhoBERT_base_fairseq_path, checkpoint_file='model.pt')
phoBERT.eval()  # disable dropout (or leave in train mode to finetune)
import torch
from fairseq.models.roberta import alignment_utils
from fairseq.models import roberta 
from typing import Tuple, List
from fairseq.data.encoders.fastbpe import fastBPE


In [None]:
from collections import Counter
from typing import List
import torch

def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Map every word of an external tokenization onto the BPE tokens spelling it.

    Args:
        roberta: model interface; only ``roberta.task.source_dictionary.string``
            and ``roberta.bpe.decode`` are used here.
        bpe_tokens (torch.LongTensor): 1-D tensor of BPE ids of shape `(T_bpe)`
            whose first id is 0 (the leading ``<s>``); both are asserted.
        other_tokens (List[str]): words of shape `(T_words)` to align against.

    Returns:
        List[List[int]]: one entry per word, listing the positions inside
        *bpe_tokens* (position 0 being ``<s>``) that cover that word.

    Raises:
        AssertionError: if the input shape/first id is unexpected, or if the
            concatenated BPE pieces differ from the concatenated words.
        StopIteration: if there is no non-empty BPE piece at all.
        Exception: if a word and the current BPE piece cannot be matched.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0

    # symbols that are passed through as-is instead of being BPE-decoded
    untouched = {"<s>", ""}

    # ids -> dictionary symbols -> decoded text pieces, trimmed of outer whitespace
    symbols = [roberta.task.source_dictionary.string([tok_id]) for tok_id in bpe_tokens]
    pieces = []
    for sym in symbols:
        text = sym if sym in untouched else roberta.bpe.decode(sym)
        pieces.append(text.strip())
    words = [str(tok).strip() for tok in other_tokens]

    # the leading <s> never takes part in the alignment
    pieces = pieces[1:]
    assert "".join(pieces) == "".join(words)

    # (position, piece) pairs of the non-empty pieces; numbering starts at 1
    # because slot 0 belongs to <s>
    pending = ((i, p) for i, p in enumerate(pieces, start=1) if p != "")
    pos, piece = next(pending)

    alignment = []
    for word in words:
        rest = word
        covered = []
        while True:
            if rest.startswith(piece):
                # the whole piece is consumed by this word; move to the next piece
                covered.append(pos)
                rest = rest[len(piece):]
                pos, piece = next(pending, (None, None))
            elif piece.startswith(rest):
                # the piece is longer than what is left of the word: keep its
                # tail for the following word(s)
                covered.append(pos)
                piece = piece[len(rest):]
                rest = ""
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(rest, piece))
            if rest == "":
                break
        assert len(covered) > 0
        alignment.append(covered)
    assert len(alignment) == len(words)

    return alignment


In [None]:

def extract_aligned_roberta(
    roberta,
    sentence: str,
    tokens: List[str],
    return_all_hiddens=False,
):
    '''Return one feature vector per entry of *tokens* for *sentence*.

    Code inspired from:
       https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Inputs:
    1. roberta: fairseq roberta model interface (``encode`` and
       ``extract_features`` are called on it)
    2. sentence: the sentence as a string
    3. tokens: the word tokenization of the sentence to align the features to
    4. return_all_hiddens: forwarded unchanged to ``roberta.extract_features``

    Outputs: the aligned features with the first and last rows removed
    (the <s> and </s> positions).
    '''
    # debug trace of the inputs
    print("* "*50)
    print(sentence)
    print(tokens)

    # BPE-encode the sentence, then map each word to the BPE positions covering it
    token_ids = roberta.encode(sentence)
    word_to_bpe = align_bpe_to_words(roberta, token_ids, tokens)

    # model features, with the batch dimension dropped (batch size is 1)
    hidden = roberta.extract_features(
        token_ids, return_all_hiddens=return_all_hiddens
    ).squeeze(0)

    word_feats = alignment_utils.align_features_to_words(roberta, hidden, word_to_bpe)
    # exclude <s> and </s> tokens
    return word_feats[1:-1]

# Initialize Byte Pair Encoding for PhoBERT
class BPE():
  # Path to 'bpe.codes' inside the PhoBERT model directory.
  bpe_codes = PhoBERT_base_fairseq_path+'/bpe.codes'

# The BPE instance is used as the argument object handed to fastBPE
# (presumably fastBPE reads `bpe_codes` from it — confirm against fairseq).
args = BPE()
phoBERT.bpe = fastBPE(args) #Incorporate the BPE encoder into PhoBERT

In [None]:

import numpy as np

def sentence_with_word_tokenize(sentence_org):
    """Return *sentence_org* with multi-syllable words joined by underscores.

    The sentence is first passed to ``word_tokenize(..., format="text")``.
    When the tokenized text has the same length as the input it is returned
    directly. Otherwise (the tokenizer changed the character count) the
    underscore-joined words found in the tokenized text are substituted back
    into the original sentence, so the result keeps the original characters
    and only swaps spaces for underscores inside those words.

    Compounds are substituted longest first, in a fixed order. The previous
    implementation iterated a ``set``, whose order varies between runs; when
    one compound contains another (e.g. "a b" inside "a b c"), replacing the
    shorter one first prevented the longer one from matching.

    Args:
        sentence_org (str): the raw sentence.

    Returns:
        str: the sentence with compounds underscore-joined, or "" if the
        rebuilt sentence does not have the original length.
    """
    sentence = word_tokenize(sentence_org, format="text")
    if len(sentence) == len(sentence_org):
        return sentence

    # underscore-joined words produced by the tokenizer
    compounds = {w for w in sentence.split(" ") if "_" in w}

    sentence_org_replace = sentence_org
    # longest first (ties broken alphabetically) for a reproducible result
    for joined in sorted(compounds, key=lambda w: (-len(w), w)):
        spaced = joined.replace("_", " ")
        sentence_org_replace = sentence_org_replace.replace(spaced, joined)

    # Safety net kept from the original code. NOTE(review): the substitution
    # above swaps equal-length strings, so this check cannot currently fail.
    if len(sentence_org_replace) == len(sentence_org):
        return sentence_org_replace
    return ""

def reduce_emb_vec(vt1):
  """Shrink an embedding by averaging groups of 4 values, then append 8 zeros.

  Args:
      vt1: a torch tensor whose number of elements is a multiple of 4.

  Returns:
      numpy.ndarray: the group means followed by 8 zeros. The original note
      "200-192" suggests a 192-value result being padded to 200 — this
      assumes a 768-dimensional input; confirm against the model.
  """
  values = vt1.detach().numpy()
  pooled = values.reshape(-1, 4).mean(axis=1)
  padding = np.zeros(8)  # 200-192
  return np.concatenate((pooled, padding))

def embedding_words_in_sents_full_shape(sentence):
  """Embed every space-separated token of *sentence* with PhoBERT.

  Args:
      sentence (str): tokens separated by single spaces.

  Returns:
      list of (token, vector) pairs, where each vector is the aligned
      feature of the token passed through ``reduce_emb_vec``.
  """
  words = sentence.split(" ")

  # debug trace (printed before the trailing empty token is dropped)
  print("* "*40)
  print(sentence)
  print("sentence ", sentence)
  print("tokens ", words)
  print("* "*50)

  # a sentence ending with a space, e.g. "Hello is this ", yields a last "" token
  if words and words[-1] == "":
    words.pop()

  vectors = extract_aligned_roberta(phoBERT, sentence, words, False)
  return [(word, reduce_emb_vec(vec)) for word, vec in zip(words, vectors)]

def write_append_data_to_txt_file(full_path_to_file, txt):
    """Append *txt* followed by a newline to the file at *full_path_to_file*.

    The file is opened in append mode, so it is created when missing.
    """
    with open(full_path_to_file, 'a') as handle:
        handle.write("{}\n".format(txt))
        
def clear_file(full_path_to_files_list):
    """Empty every file named in *full_path_to_files_list*.

    Each path is opened in write mode, which truncates an existing file and
    creates a missing one.
    """
    for path in full_path_to_files_list:
        with open(path, 'w'):
            pass


In [None]:
from os import listdir, remove
from os.path import isfile, join, isdir

# Root folder of the documents processed by this cell.
folder = \
"/Users/n2t2k/Documents/Studying/Master/Thesis/InProgress/Coding/ORIGIN_RUN_ALL_edge-oriented-graph-master-studying/dataProcessingOfficialCleaned/dev_processed/split_sentence_underthesea/docs"
# Doc codes whose embedding failed are appended here by the loop below.
# Original note: documents still fail on BERT UNK handling afterwards.
error_file = "/Users/n2t2k/Documents/Studying/Master/Thesis/InProgress/Coding/ORIGIN_RUN_ALL_edge-oriented-graph-master-studying/dataProcessingOfficialCleaned/dev_processed/split_sentence_underthesea/code/Error_list/BERT_UNK_ERR/err_code_docs.txt"

# Bookkeeping lists: doc codes embedded successfully / doc codes that failed.
valid_embedded_files_path = f'{folder}/common_info_embedd_files/valid_embedded_files_list.txt'
error_failed_to_embedded_files_path = f'{folder}/common_info_embedd_files/error_failed_to_embedded_files.txt'


# Start from empty bookkeeping files on every run of this cell.
clear_file([error_file, error_failed_to_embedded_files_path, valid_embedded_files_path])


# Output folder (one embedding file per doc code) and the tokenized input file.
OUT_word_embedded_folder = folder+"/word_embedded_files"
IN_tokened_files_path = folder+"/split_sentence_with_token_from_paragraph.txt"

# Sanity check: both values are expected to print True before the loop runs.
print(isdir(OUT_word_embedded_folder))
print(isfile(IN_tokened_files_path))


# Embed every document of the tokenized input file.
#
# Each input line is split on "$$##$$$$##$$": the first item is the doc code,
# the last item is dropped (the original note says it is the trailing "\n"),
# and the items in between are the sentences of the document.
# For each document an output file named after the doc code is written with
# one "<token> <v1> <v2> ..." line per token. The doc code is then appended to
# the valid list, or to both error lists when anything fails.
n = 0  # number of input lines read so far
with open(IN_tokened_files_path, 'r', encoding='utf-8') as tokened_file:
    for tokened_line in tokened_file:
        n += 1
        list_sents = tokened_line.split("$$##$$$$##$$")
        doc_code = list_sents[0]
        list_sents = list_sents[1:-1]  # drop the doc code and the last item "\n"

        doc_out_path = f"{OUT_word_embedded_folder}/{doc_code}"
        # start from an empty output file for this document
        clear_file([doc_out_path])

        print("doc_code", doc_code)
        try:
            # open the output once per document instead of once per token;
            # utf-8 is explicit because the tokens are Vietnamese text
            with open(doc_out_path, 'a', encoding='utf-8') as out:
                for sentence_org in list_sents:
                    print("sentence_org> ", sentence_org)
                    sentence = sentence_with_word_tokenize(sentence_org)
                    emb_vec_reduce = embedding_words_in_sents_full_shape(sentence)
                    for tk, em_w in emb_vec_reduce:
                        em_w_ls = " ".join(map(str, em_w.tolist()))
                        out.write(f"{tk} {em_w_ls}\n")
            write_append_data_to_txt_file(valid_embedded_files_path, doc_code)
        except Exception as exc:
            # Best effort per document: record the failure and carry on.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the run.
            write_append_data_to_txt_file(error_file, doc_code)
            write_append_data_to_txt_file(error_failed_to_embedded_files_path, doc_code)
            print(doc_code)
            print(f"  failed: {type(exc).__name__}: {exc}")






In [None]:
def write_append_data_to_txt_file(full_path_to_file, txt):
    """Append the text form of *txt* and a line break to *full_path_to_file*.

    Append mode is used, so a missing file is created.
    """
    with open(full_path_to_file, 'a') as handle:
        handle.writelines([format(txt), '\n'])
        
def clear_file(full_path_to_files_list):
    """Truncate (or create as empty) each file in *full_path_to_files_list*."""
    for target in full_path_to_files_list:
        # opening in 'w' mode already leaves the file empty
        open(target, 'w').close()


from os import listdir, remove

# Same document root and bookkeeping paths as the embedding cell.
folder = \
        f"/Users/n2t2k/Documents/Studying/Master/Thesis/InProgress/Coding/ORIGIN_RUN_ALL_edge-oriented-graph-master-studying/dataProcessingOfficialCleaned/dev_processed/split_sentence_underthesea/docs"
error_file = "/Users/n2t2k/Documents/Studying/Master/Thesis/InProgress/Coding/ORIGIN_RUN_ALL_edge-oriented-graph-master-studying/dataProcessingOfficialCleaned/dev_processed/split_sentence_underthesea/code/Error_list/BERT_UNK_ERR/err_code_docs.txt"
valid_embedded_files_path = f'{folder}/common_info_embedd_files/valid_embedded_files_list.txt'
# Directory holding the per-document embedding files.
replaced_underscore_words = f"{folder}/word_embedded_files/"
# Reset both bookkeeping lists. `error_failed_to_embedded_files_path`,
# `isfile` and `join` are not defined in this cell; they come from the
# earlier embedding cell, which must have been run first.
clear_file([valid_embedded_files_path, error_failed_to_embedded_files_path])

# Names of the regular files currently present in the embedding directory.
onlyfiles = [f for f in listdir(replaced_underscore_words) if isfile(join(replaced_underscore_words, f))]
print(onlyfiles)


# For every doc code listed in the error file: delete its embedding file (if
# one exists) and record the code in the failed-to-embed list.
#
# The code is stripped of its line break BEFORE the membership test, and the
# file is removed from the embedding directory. The earlier version compared
# the code with its trailing "\n" still attached and built the path from
# `folder`, so no file was ever removed.
with open(error_file, 'r') as ef:
    for error_line in ef:
        error_code = error_line.rstrip('\n')
        if not error_code:
            # ignore blank lines
            continue
        print(error_code, len(error_code))
        failed_path = join(replaced_underscore_words, error_code)
        # the isfile check keeps a repeated doc code from failing on the second remove
        if error_code in onlyfiles and isfile(failed_path):
            remove(failed_path)
        write_append_data_to_txt_file(error_failed_to_embedded_files_path, error_code)


        

In [None]:
# Build the list of successfully embedded documents: every file name in
# `onlyfiles` minus the doc codes recorded in the error file.
err_list = set()
with open(error_file, 'r') as ef:
    for error_code in ef:
        # strip only the line break; slicing with [:-1] would cut a real
        # character off a last line that has no trailing newline
        error_code = error_code.rstrip('\n')
        if error_code:
            err_list.add(error_code)
print(onlyfiles)
err_set = set(err_list)
onlyfiles_valid_set = set(onlyfiles)
print(len(onlyfiles_valid_set))
onlyfiles_valid_set -= err_set
print(len(onlyfiles_valid_set))
# sorted so the output file is identical from run to run
for valid_file in sorted(onlyfiles_valid_set):
    write_append_data_to_txt_file(valid_embedded_files_path, valid_file)

In [None]:
"""
only in valid_embedded_files_list 
    sentence
        check len if:
            len_total
            len[:10]
            len[-10:]
            
                remove error_failed_to_embedded_files.txt
"""