In [5]:
import random
import nltk
import re
import io
from tqdm import tqdm
import numpy as np

from underthesea import word_tokenize
from fairseq.models.roberta import RobertaModel
from fairseq.data.encoders.fastbpe import fastBPE  

In [7]:
def _load_vectors(fname, num_words=10000):
    """Load fasttext vector

    Args:
        fname (str): file path
        num_words (int, optional): Number of words to be loaded. Defaults to 10000.

    Returns:
        Dict: Dictionary of word
    """
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    fin.readline().split()
    data = {}
    for i, line in enumerate(tqdm(fin)):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array([float(val) for val in tokens[1:]])
        data[tokens[0]] /= np.linalg.norm(data[tokens[0]])
        if i > num_words:
            break
    return data

In [40]:
FASTTEXT_PATH = "../augment_model/cc.vi.300.vec"
fasttext_data = _load_vectors(FASTTEXT_PATH)

class BPE():
    bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'
    

10001it [00:01, 6386.85it/s]


In [42]:
phobert = RobertaModel.from_pretrained('../PhoBERT_base_fairseq', checkpoint_file='../model.pt')
args = BPE()
phobert.bpe = fastBPE(args)

2022-06-30 17:04:38 | INFO | fairseq.file_utils | loading archive file ../PhoBERT_base_fairseq


OSError: Model file not found: ../PhoBERT_base_fairseq/../model.pt

In [33]:
def find_similar_word(word, num_similar=1):
    """Find similar word given a word

    Args:
        word (str): Word need to find synonym
        num_similar (int, optional): Number of generated. Defaults to 1.

    Returns:
        List[str]: List of similar words
    """
    ls_similar_word = []
    
    if word not in fasttext_data:
        return []
    
    ref_v = fasttext_data[word]
    
    top_num = 20
    top_w = ['']*top_num
    top_s = [-1]*top_num

    for k in fasttext_data.keys():
        if word == k:
            continue
        score =  np.dot(ref_v, fasttext_data[k])
        if score < np.min(top_s):
            continue
        
        for i in range(top_num):
            if score >= top_s[i]:
                top_w[i+1:] = top_w[i:top_num-1]
                top_w[i] = k
                top_s[i+1:] = top_s[i:top_num-1]
                top_s[i] = score
                break
        
    count = 0
    for i in range(top_num):
        if count >= num_similar:
            return ls_similar_word
        
        if top_w[i].lower() == word:
            continue
        
        ls_similar_word.append(top_w[i].lower())
        count += 1
        
    return ls_similar_word

In [18]:
find_similar_word("tôi đi học", num_similar=5)

Tôi 0.8070541195329557
anh 0.6859056984666219
muốn 0.6335938538320789
nghĩ 0.6285581462391725
mình 0.6075433200828794
thì 0.6056370953612328
Chắc 0.5955072270317948
cậu 0.5888613733823663
cũng 0.5821518868888097
chị 0.5752282965892783
biết 0.5721677601666934
nhưng 0.5715116778715326
nên 0.5706017748580544
thấy 0.567863576400363
rồi 0.56736808957184
Nhưng 0.5669557116514035
ông 0.5668420945218877
nếu 0.563732383604816
bạn 0.5599824338728905
khi 0.5580050167609801


['anh', 'muốn', 'nghĩ', 'mình', 'thì']

In [36]:
import string

TOKENIZER_REGEX = re.compile(r'(\W)')

def tokenize(text):
    tokens = TOKENIZER_REGEX.split(text)
    return [t for t in tokens if len(t.strip()) > 0]

class RandomSynonymInsert:
    def __init__(self, p_aug=0.1, min_aug=1, max_aug=10):
        self.p_aug = p_aug
        self.min_aug = min_aug
        self.max_aug = max_aug 

    def _transform(self, token, n_tokens):
        print(f"Processing token: {token}")
        synonyms = find_similar_word(token, num_similar=5)
        print(synonyms)
        chosen_word = random.choice(synonyms)
        chosen_idx = random.choice(range(n_tokens))
        
        return chosen_word, chosen_idx

    def augment(self, text):
        tokens = tokenize(text)
        
        if len(tokens) < 3:
            return text
    
        augmented_tokens = []
        new_sent = tokens.copy()
        
        num_aug = 0
        for token in tokens:
            if num_aug == self.max_aug:
                break
            
            if token in string.punctuation or random.uniform(0, 1) > self.p_aug:
                continue

            chosen_word, chosen_idx = self._transform(token, len(new_sent))
            
            new_sent.insert(chosen_idx, chosen_word)
            augmented_tokens.append(token)
            num_aug += 1
        
        if num_aug < self.min_aug:
            for _ in range(num_aug, self.min_aug):
                token = "."
                while token not in string.punctuation and token not in augmented_tokens:
                    token = random.choice(tokens)
                
                chosen_word, chosen_idx = self._transform(token, len(new_sent))
                new_sent.insert(chosen_idx, chosen_word)
                
                tokens.insert(chosen_idx, chosen_word)     
    
        return " ".join(new_sent)

In [37]:
rand_syn = RandomSynonymInsert(p_aug=0.1, min_aug=1, max_aug=2)
rand_syn.augment("mang vô đi")

Processing token: vô
['hư', 'hễ', 'xô', 'nó', 'bả']


'mang nó vô đi'