In [1]:
pip install underthesea pandas regex --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import re
from underthesea import text_normalize, word_tokenize

In [3]:
df_train = pd.read_csv( "./df_train.csv", encoding="utf-8")
df_dev   = pd.read_csv( "./df_dev.csv",   encoding="utf-8")
df_test  = pd.read_csv( "./df_test.csv",  encoding="utf-8")

print("Train columns:", df_train.columns.tolist())
print("Dev   columns:", df_dev.columns.tolist())
print("Test  columns:", df_test.columns.tolist())

Train columns: ['content', 'sentiment', 'topic']
Dev   columns: ['content', 'sentiment', 'topic']
Test  columns: ['content', 'sentiment', 'topic']


In [4]:
for df in (df_train, df_dev, df_test):
    if 'sentiment' in df.columns:
        df.rename(columns={'sentiment':'label'}, inplace=True)
    assert 'content' in df.columns and 'label' in df.columns, \
        f"Thi·∫øu c·ªôt ·ªü frame v·ªõi columns={df.columns.tolist()}"
    df['content'] = df['content'].fillna('').astype(str)

In [5]:
print("Loaded splits:", len(df_train), len(df_dev), len(df_test))

Loaded splits: 11426 1583 3166


In [6]:
df_misspell = pd.read_csv('./vietnamese-misspell.csv')
misspell_dict = dict(zip(df_misspell['wrong'], df_misspell['right']))

In [7]:
teencode_map = {}
with open("./vietnamese-teencode.txt", "r", encoding="utf8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) >= 2:
            teencode_map[parts[0]] = parts[1]
print("V√≠ d·ª• teencode_map:", list(teencode_map.items())[:10])

V√≠ d·ª• teencode_map: [('ctrai', 'con'), ('kh√¥g', 'kh√¥ng'), ('bme', 'b·ªë'), ('cta', 'ch√∫ng'), ('mih', 'm√¨nh'), ('mqh', 'm·ªëi'), ('cgai', 'con'), ('nh·ªØg', 'nh·ªØng'), ('mng', 'm·ªçi'), ('svtn', 'sinh')]


In [8]:
emoticon_dict = {
    ":)": "‚ò∫Ô∏è",
    ":))": "‚ò∫Ô∏è",
    ":)))": "‚ò∫Ô∏è",
    "=))": "üòÑ",
    "=)": "üòÑ",
    ":D": "üòÄ",
    ":DD": "üòÅ",
    ":<": "üòû",
    ":(": "‚òπÔ∏è",
    ":'(": "üò¢",
    ":'(": "üò¢",
    ":')": "üòÇ",
    ":')": "üòÇ",
    "<3": "‚ù§Ô∏è",
    "</3": "üíî",
    ":P": "üòõ",
    ":p": "üòõ",
    ":O": "üò≤",
    ":o": "üò≤",
    ";)": "üòâ",
    ";-)": "üòâ",
    ":3": "üò∫",
    ":^)": "üòä",
    "^_^": "üòä",
    "-_-": "üòë",
    ">_<": "üò£",
    "XD": "üòÜ",
    "xD": "üòÜ",
    "T_T": "üò≠",
    ";_;": "üò≠",
    ":|": "üòê",
    ":/": "üòï",
    ":-/": "üòï",
    ":-\\": "üòï",
    ":'D": "üòÜ",
    ":'D": "üòÜ",
    ":-*": "üòò",
    ":*": "üòò",
    "<<": "üòì",
    ">_>": "üòí",
    "<_<": "üòí",
    "^\\^": "üòÜ",
    "\\^_^/": "üéâ",
    "*^_^*": "üéâ",
    "\\o/": "üôå",
    "\\O/": "üôå",
    "O_O": "üò≥",
    "o_o": "üò≥",
    ">:O": "üò†",
    "^^": "‚ò∫Ô∏è"
}


In [9]:
def standardize_emoticon(text):
    # G·ªôp c√°c chu·ªói emoticon th∆∞·ªùng g·∫∑p v·ªÅ chu·∫©n (g·ªôp l·∫∑p, v√≠ d·ª• ^^ ^^ ^^ -> ^^)
    # G·ªôp emoticon ki·ªÉu :) v·ªÅ 1
    text = re.sub(r'((:\)+))', ':)', text)
    text = re.sub(r'((=\)+))', '=)', text)
    text = re.sub(r'((\^_?\^)+)', '^^', text)        # ^^, ^_^, ^^ ^^, ...
    text = re.sub(r'(<3+)', '<3', text)
    text = re.sub(r'(\)+)', ')', text)
    text = re.sub(r'(\(+)', '(', text)
    # G·ªôp m·ªçi chu·ªói ^^ li√™n ti·∫øp v·ªÅ 1 ^^ (k·ªÉ c·∫£ c√≥ c√°ch ra)
    text = re.sub(r'(\^\^)(\s+\^\^)+', '^^', text)
    # G·ªôp ((
    text = re.sub(r'(\(+)', '(', text)
    text = re.sub(r'(\)+)', ')', text)
    return text

In [10]:
def standardize_emoticon(text):
    # G·ªôp c√°c chu·ªói emoticon th∆∞·ªùng g·∫∑p v·ªÅ chu·∫©n (g·ªôp l·∫∑p, v√≠ d·ª• ^^ ^^ ^^ -> ^^)
    # G·ªôp emoticon ki·ªÉu :) v·ªÅ 1
    text = re.sub(r'((:\)+))', ':)', text)
    text = re.sub(r'((=\)+))', '=)', text)
    text = re.sub(r'((\^_?\^)+)', '^^', text)        # ^^, ^_^, ^^ ^^, ...
    text = re.sub(r'(<3+)', '<3', text)
    text = re.sub(r'(\)+)', ')', text)
    text = re.sub(r'(\(+)', '(', text)
    # G·ªôp m·ªçi chu·ªói ^^ li√™n ti·∫øp v·ªÅ 1 ^^ (k·ªÉ c·∫£ c√≥ c√°ch ra)
    text = re.sub(r'(\^\^)(\s+\^\^)+', '^^', text)
    # G·ªôp ((
    text = re.sub(r'(\(+)', '(', text)
    text = re.sub(r'(\)+)', ')', text)
    return text

In [11]:
def convert_emoticon(text, emoticon_dict):
    # Duy·ªát emoticon d√†i tr∆∞·ªõc
    for emo in sorted(emoticon_dict, key=len, reverse=True):
        # Ch·ªâ thay th·∫ø khi l√† nguy√™n m·ªôt c·ª•m (d√πng word boundary n·∫øu c·∫ßn)
        text = re.sub(re.escape(emo) + r'(?=\s|$)', emoticon_dict[emo], text)
    return text

In [12]:
def standardize_word(text, misspell_dict):
    # G·ªôp k√Ω t·ª± k√©o d√†i (ƒë·∫πpppp -> ƒë·∫πp)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    # Chu·∫©n h√≥a ch√≠nh t·∫£/vi·∫øt t·∫Øt
    for wrong, right in misspell_dict.items():
        text = re.sub(r'\b' + re.escape(wrong) + r'\b', right, text)
    return text

In [13]:
teencode_map = {}
with open("./vietnamese-teencode.txt", "r", encoding="utf8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) >= 2:
            teencode_map[parts[0]] = parts[1]

In [14]:
import re

def remove_duplicate_emoji(text):
    # X√≥a emoji tr√πng l·∫∑p li√™n ti·∫øp (vd: üòÑüòÑüòÑ -> üòÑ)
    emoji_pattern = re.compile(
        r'([\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF])\1+'
    )
    return emoji_pattern.sub(r'\1', text)


In [15]:
def preprocess_text(text, emoticon_dict, misspell_dict):
    text = str(text)
    text = standardize_emoticon(text)
    text = convert_emoticon(text, emoticon_dict)
    text = remove_duplicate_emoji(text)
    for abbr, full in teencode_map.items():
        pattern = rf"\b{re.escape(abbr)}\b"
        text = re.sub(pattern, full, text, flags=re.IGNORECASE)
    text = standardize_word(text, misspell_dict)
    return text

In [16]:
text = "∆∞·ªõc g√¨ sau n√†y v·ªÅ gi√† v·∫´n c√≥ th·ªÉ nh∆∞ c·ª• n√†y :)) :)))) :)))"
text = preprocess_text(text, emoticon_dict, misspell_dict)
print(text) 

∆∞·ªõc g√¨ sau n√†y v·ªÅ gi√† v·∫´n c√≥ th·ªÉ nh∆∞ c·ª• n√†y ‚ò∫Ô∏è ‚ò∫Ô∏è ‚ò∫Ô∏è


In [17]:
df_test['content_clean'] = df_test['content'].apply(lambda x: preprocess_text(x, emoticon_dict, misspell_dict))
df_test[['content', 'content_clean']].head(10)

Unnamed: 0,content,content_clean
0,n√≥i ti·∫øng anh l∆∞u lo√°t .,n√≥i ti·∫øng anh l∆∞u lo√°t .
1,gi√°o vi√™n r·∫•t vui t√≠nh .,gi√°o vi√™n r·∫•t vui t√≠nh .
2,c√¥ max c√≥ t√¢m .,c√¥ max c√≥ t√¢m .
3,"gi·∫£ng b√†i thu h√∫t , d√≠ d·ªèm .","gi·∫£ng b√†i thu h√∫t , d√≠ d·ªèm ."
4,"gi√°o vi√™n kh√¥ng gi·∫£ng d·∫°y ki·∫øn th·ª©c , h∆∞·ªõng d·∫´...","gi√°o vi√™n kh√¥ng gi·∫£ng d·∫°y ki·∫øn th·ª©c , h∆∞·ªõng d·∫´..."
5,th·∫ßy d·∫°y nhi·ªát t√¨nh v√† t√¢m huy·∫øt .,th·∫ßy d·∫°y nhi·ªát t√¨nh v√† t√¢m huy·∫øt .
6,t√≠nh ƒëi·ªÉm thi ƒëua c√°c nh√≥m .,t√≠nh ƒëi·ªÉm thi ƒëua c√°c nh√≥m .
7,th·∫ßy nhi·ªát t√¨nh gi·∫£ng l·∫°i cho h·ªçc sinh .,th·∫ßy nhi·ªát t√¨nh gi·∫£ng l·∫°i cho h·ªçc sinh .
8,c√≥ ƒë√¥i l√∫c n√≥i h∆°i nhanh l√†m sinh vi√™n kh√¥ng t...,c√≥ ƒë√¥i l√∫c n√≥i h∆°i nhanh l√†m sinh vi√™n kh√¥ng t...
9,"gi·∫£ng d·∫°y nhi·ªát t√¨nh , li√™n h·ªá th·ª±c t·∫ø kh√° nhi...","gi·∫£ng d·∫°y nhi·ªát t√¨nh , li√™n h·ªá th·ª±c t·∫ø kh√° nhi..."


In [18]:
df_train['content_clean'] = df_train['content'].apply(lambda x: preprocess_text(x, emoticon_dict, misspell_dict))
df_train[['content', 'content_clean']].head(10)

Unnamed: 0,content,content_clean
0,slide gi√°o tr√¨nh ƒë·∫ßy ƒë·ªß .,slide gi√°o tr√¨nh ƒë·∫ßy ƒë·ªß .
1,"nhi·ªát t√¨nh gi·∫£ng d·∫°y , g·∫ßn g≈©i v·ªõi sinh vi√™n .","nhi·ªát t√¨nh gi·∫£ng d·∫°y , g·∫ßn g≈©i v·ªõi sinh vi√™n ."
2,ƒëi h·ªçc ƒë·∫ßy ƒë·ªß full ƒëi·ªÉm chuy√™n c·∫ßn .,ƒëi h·ªçc ƒë·∫ßy ƒë·ªß full ƒëi·ªÉm chuy√™n c·∫ßn .
3,ch∆∞a √°p d·ª•ng c√¥ng ngh·ªá th√¥ng tin v√† c√°c thi·∫øt ...,ch∆∞a √°p d·ª•ng c√¥ng ngh·ªá th√¥ng tin v√† c√°c thi·∫øt ...
4,"th·∫ßy gi·∫£ng b√†i hay , c√≥ nhi·ªÅu b√†i t·∫≠p v√≠ d·ª• ng...","th·∫ßy gi·∫£ng b√†i hay , c√≥ nhi·ªÅu b√†i t·∫≠p v√≠ d·ª• ng..."
5,"gi·∫£ng vi√™n ƒë·∫£m b·∫£o th·ªùi gian l√™n l·ªõp , t√≠ch c·ª±...","gi·∫£ng vi√™n ƒë·∫£m b·∫£o th·ªùi gian l√™n l·ªõp , t√≠ch c·ª±..."
6,"em s·∫Ω n·ª£ m√¥n n√†y , nh∆∞ng em s·∫Ω h·ªçc l·∫°i ·ªü c√°c h...","em s·∫Ω n·ª£ m√¥n n√†y , nh∆∞ng em s·∫Ω h·ªçc l·∫°i ·ªü c√°c h..."
7,"th·ªùi l∆∞·ª£ng h·ªçc qu√° d√†i , kh√¥ng ƒë·∫£m b·∫£o ti·∫øp th...","th·ªùi l∆∞·ª£ng h·ªçc qu√° d√†i , kh√¥ng ƒë·∫£m b·∫£o ti·∫øp th..."
8,"n·ªôi dung m√¥n h·ªçc c√≥ ph·∫ßn thi·∫øu tr·ªçng t√¢m , h·∫ßu...","n·ªôi dung m√¥n h·ªçc c√≥ ph·∫ßn thi·∫øu tr·ªçng t√¢m , h·∫ßu..."
9,c·∫ßn n√≥i r√µ h∆°n b·∫±ng c√°ch tr√¨nh b√†y l√™n b·∫£ng th...,c·∫ßn n√≥i r√µ h∆°n b·∫±ng c√°ch tr√¨nh b√†y l√™n b·∫£ng th...


In [19]:
df_dev['content_clean'] = df_dev['content'].apply(lambda x: preprocess_text(x, emoticon_dict, misspell_dict))
df_dev[['content', 'content_clean']].head(10)

Unnamed: 0,content,content_clean
0,gi√°o tr√¨nh ch∆∞a c·ª• th·ªÉ .,gi√°o tr√¨nh ch∆∞a c·ª• th·ªÉ .
1,gi·∫£ng bu·ªìn ng·ªß .,gi·∫£ng bu·ªìn ng·ªß .
2,"gi√°o vi√™n vui t√≠nh , t·∫≠n t√¢m .","gi√°o vi√™n vui t√≠nh , t·∫≠n t√¢m ."
3,"gi·∫£ng vi√™n n√™n giao b√†i t·∫≠p nhi·ªÅu h∆°n , chia n...","gi·∫£ng vi√™n n√™n giao b√†i t·∫≠p nhi·ªÅu h∆°n , chia n..."
4,"gi·∫£ng vi√™n c·∫ßn gi·∫£ng b√†i chi ti·∫øt h∆°n , ƒëi s√¢u...","gi·∫£ng vi√™n c·∫ßn gi·∫£ng b√†i chi ti·∫øt h∆°n , ƒëi s√¢u..."
5,n√™n c√≥ gi·∫£ng vi√™n n∆∞·ªõc ngo√†i d·∫°y ƒë·ªÉ sinh vi√™n ...,n√™n c√≥ gi·∫£ng vi√™n n∆∞·ªõc ngo√†i d·∫°y ƒë·ªÉ sinh vi√™n ...
6,n√™n c√≥ b√†i t·∫≠p l·ªõn ƒë·ªì √°n m√¥n h·ªçc .,n√™n c√≥ b√†i t·∫≠p l·ªõn ƒë·ªì √°n m√¥n h·ªçc .
7,"gi·∫£ng vi√™n ƒë·∫£m b·∫£o n·ªôi dung h·ªçc , ph√¢n t√≠ch gi...","gi·∫£ng vi√™n ƒë·∫£m b·∫£o n·ªôi dung h·ªçc , ph√¢n t√≠ch gi..."
8,"n√™u r√µ m·ª•c ti√™u , m·ª•c ƒë√≠ch m√¥n h·ªçc ƒë·ªÉ sinh vi√™...","n√™u r√µ m·ª•c ti√™u , m·ª•c ƒë√≠ch m√¥n h·ªçc ƒë·ªÉ sinh vi√™..."
9,c√≥ m·ªôt s·ªë v·∫•n ƒë·ªÅ n√≥i ch∆∞a r√µ .,c√≥ m·ªôt s·ªë v·∫•n ƒë·ªÅ n√≥i ch∆∞a r√µ .


In [20]:
df_train.to_csv("./df_train_clean.csv", index=False)
df_dev  .to_csv("./df_dev_clean.csv",   index=False)
df_test .to_csv("./df_test_clean.csv",  index=False)
print("Cleaned files written to /content/")

Cleaned files written to /content/
