#### Xoá những ký tự không cần thiết 

In [None]:
from spacy.lang.vi import Vietnamese
import re
import string

nlp = Vietnamese()

def text_normalize(text, remove_urls=True, remove_emails=True, remove_mentions=True, remove_hashtags=True, remove_numbers=False):
    """
    Clean a piece of Vietnamese text with regexes, then tokenize it and drop stopwords.

    Processing order: lowercase -> strip URLs / emails / HTML tags / @mentions ->
    drop the '#' of hashtags -> strip phone-number-like digit runs, currency
    symbols and ASCII punctuation -> turn every whitespace run into one space ->
    drop remaining non-printable characters -> shorten runs of 3+ identical
    characters to 2 -> optionally strip digits -> tokenize with the module-level
    spaCy pipeline ``nlp`` and filter the tokens.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing and yield ``''``.
        remove_urls: Strip ``http(s)://...`` and ``www....`` substrings.
        remove_emails: Strip anything shaped like ``something@something``.
        remove_mentions: Strip ``@username`` (the username is dropped too).
        remove_hashtags: Drop the leading ``#`` but keep the tag word.
        remove_numbers: Strip digit runs before tokenization. Tokens that still
            contain a digit after tokenization are dropped regardless of this
            flag, so the flag only decides whether the letters glued to digits
            survive as separate tokens.

    Returns:
        The surviving lowercase tokens joined by single spaces, or ``''`` when
        the input is missing/empty or when any step raises (the error is
        printed, not propagated).
    """
    # Missing values (None, or NaN coming from a pandas column) would otherwise
    # be stringified into the bogus tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    try:
        text = str(text).strip().lower()

        if remove_urls:
            text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
        if remove_emails:
            text = re.sub(r'\S+@\S+', ' ', text)
        # HTML tags are always removed.
        text = re.sub(r'<.*?>', ' ', text)
        if remove_mentions:
            text = re.sub(r'@\w+', ' ', text)
        if remove_hashtags:
            # Keep the tag word, drop only the '#'.
            text = re.sub(r'#(\w+)', r'\1', text)
        # Phone-number-like runs: 0 followed by 8+ digits, or any 9+ digit run.
        text = re.sub(r'\b0\d{8,}\b|\b\d{9,}\b', ' ', text)
        # Common currency symbols.
        text = re.sub(r'[\$€£¥₫]', ' ', text)

        # ASCII punctuation becomes a space so neighbouring words are not joined.
        # Repeated punctuation needs no separate collapsing step: none is left
        # after this substitution.
        text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)

        # Newlines, tabs and no-break spaces are not "printable"; convert all
        # whitespace to plain spaces BEFORE dropping non-printable characters,
        # otherwise the words on either side of a line break get glued together.
        text = re.sub(r'\s+', ' ', text)
        text = ''.join(ch for ch in text if ch.isprintable())

        # Reduce elongated characters (3+ repeats -> 2), e.g. heyyyy -> heyy.
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)

        if remove_numbers:
            text = re.sub(r'\d+', ' ', text)

        # Earlier substitutions may have introduced new runs of spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            # Nothing left to tokenize.
            return ''

        # Tokenize with spaCy (Vietnamese); keep tokens that are not stopwords,
        # punctuation or whitespace and that contain no digit.
        cleaned_tokens = [
            token.lower_
            for token in nlp(text)
            if not (token.is_stop or token.is_punct or token.is_space)
            and not any(char.isdigit() for char in token.text)
        ]
        return ' '.join(cleaned_tokens)

    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty result
        # so one bad row does not abort a whole batch.
        print("Error normalizing text:", e)
        return ''


#### Mask LM 

In [None]:
import random

def MLM(text, mask_prob=0.3):
    """
    Replace a random subset of whitespace-separated tokens with "[MASK]".

    Args:
        text: Input string; it is split on whitespace.
        mask_prob: Fraction of tokens to mask. The count is truncated to an
            integer, raised to at least one and capped at the token count.

    Returns:
        The tokens re-joined with single spaces, masked positions replaced by
        "[MASK]". If the text contains no tokens it is returned unchanged.
    """
    words = text.split()
    total = len(words)
    if total == 0:
        return text

    # At least one token is always masked, never more than there are tokens.
    quota = min(max(1, int(total * mask_prob)), total)
    chosen = set(random.sample(range(total), quota))

    return " ".join(
        "[MASK]" if position in chosen else word
        for position, word in enumerate(words)
    )

#### Cut mix phrase 

In [None]:
import random
import pandas as pd

def cut_mix_pair(sent1, sent2, tokenizer=None, min_frac=0.2, max_frac=0.8):
    """
    Build a new sentence from a random prefix of sent1 and a random suffix of sent2.

    Args:
        sent1: Sentence that contributes the leading tokens.
        sent2: Sentence that contributes the trailing tokens.
        tokenizer: Callable mapping a string to a list of tokens; when it is
            not given (or falsy) the sentences are split on whitespace.
        min_frac: Lower bound of each cut position, as a fraction of that
            sentence's token count.
        max_frac: Upper bound of each cut position, as a fraction of that
            sentence's token count.

    Returns:
        The selected tokens joined by single spaces, or "" if either sentence
        has no tokens. The suffix can be empty when the cut lands at the end of
        sent2 (for instance when sent2 has a single token).
    """
    if tokenizer:
        head_tokens = tokenizer(sent1)
        tail_tokens = tokenizer(sent2)
    else:
        head_tokens = sent1.split()
        tail_tokens = sent2.split()

    if not head_tokens or not tail_tokens:
        return ""

    def bounds(token_count):
        # Both bounds are at least 1; the upper bound never drops below the lower.
        low = max(1, int(token_count * min_frac))
        high = max(1, int(token_count * max_frac))
        return low, max(low, high)

    head_low, head_high = bounds(len(head_tokens))
    tail_low, tail_high = bounds(len(tail_tokens))

    # Draw the sent1 cut first, then the sent2 cut.
    head_cut = random.randint(head_low, head_high)
    tail_cut = random.randint(tail_low, tail_high)

    return " ".join(head_tokens[:head_cut] + tail_tokens[tail_cut:])

def augment_cutmix_df(df, text_col='text', label_col='label', n_aug_per_sample=1, tokenizer=None, random_state=None):
    """
    Augment a DataFrame with cut-mix sentences built from pairs sharing a label.

    For every label group, two distinct texts are sampled repeatedly and
    combined with ``cut_mix_pair``; each non-empty result becomes a new row
    carrying that label.

    Args:
        df: Source DataFrame.
        text_col: Name of the column holding the texts.
        label_col: Name of the column holding the labels.
        n_aug_per_sample: Number of cut-mix attempts per non-null text of a
            label group (attempts per group = n_aug_per_sample * group size).
        tokenizer: Passed through to ``cut_mix_pair``.
        random_state: When not None, seeds the global ``random`` module.

    Returns:
        The original rows followed by the augmented rows under a fresh
        0..n-1 index. The augmented rows contain only ``text_col`` and
        ``label_col``. If no row was generated, a copy of ``df`` is returned
        with its index untouched.
    """
    if random_state is not None:
        random.seed(random_state)

    new_rows = []
    for label_value, members in df.groupby(label_col):
        non_null = members[text_col].dropna()
        sentences = non_null.astype(str).tolist()
        # A pair needs two sentences; smaller groups contribute nothing.
        if len(sentences) < 2:
            continue

        attempts = n_aug_per_sample * len(sentences)
        for _ in range(attempts):
            first, second = random.sample(sentences, 2)
            mixed = cut_mix_pair(first, second, tokenizer=tokenizer)
            # An empty result means one of the sentences had no tokens.
            if mixed:
                new_rows.append({text_col: mixed, label_col: label_value})

    if new_rows:
        original_part = df.reset_index(drop=True)
        augmented_part = pd.DataFrame(new_rows).reset_index(drop=True)
        return pd.concat([original_part, augmented_part], ignore_index=True)

    return df.copy()