# üáªüá≥ VeryGoodMail - PhoBERT Email Classification Training

Notebook n√†y gi√∫p b·∫°n train c√°c model PhoBERT cho:
- **Spam Detection**: Ph√°t hi·ªán email spam
- **Sentiment Analysis**: Ph√¢n t√≠ch c·∫£m x√∫c
- **Category Classification**: Ph√¢n lo·∫°i email

¬© 2025 VeryGoodMail by Ho√†n

## 1. Setup Environment

In [None]:
# Install the packages this notebook needs (-q keeps pip's output quiet)
!pip install transformers torch datasets scikit-learn pandas underthesea -q

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
# `device` is read later by test_model() to place the model and its inputs.
if torch.cuda.is_available():
    device = 'cuda'
    print(f'Using device: {device}')
    print(f'GPU: {torch.cuda.get_device_name(0)}')
else:
    device = 'cpu'
    print(f'Using device: {device}')

## 2. Load PhoBERT Tokenizer

In [None]:
# Load the PhoBERT tokenizer.
# `model_name` is the Hugging Face model id; train_model() reuses it to load
# the classification model, and `tokenizer` is shared by every later cell.
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f'Loaded tokenizer: {model_name}')

In [None]:
# Upload datasets from your local machine (run this cell to upload the CSV/rar files you downloaded)
from google.colab import files
uploaded = files.upload()  # Pick the files from your computer: vietnamese_spam_post.csv, data.csv, viet_text_class.csv, Train_Full.rar, Test_Full.rar (the .rar files only if you use VNTC)

# Extract the VNTC archives. These commands run unconditionally, so comment
# them out if the .rar files were not uploaded.
!unrar x Train_Full.rar  # Adjust the file name if yours differs
!unrar x Test_Full.rar

## 3. Prepare Dataset

Upload dataset của bạn hoặc sử dụng sample data

In [None]:
# Build the spam, sentiment and category DataFrames used for training.
# Every resulting frame has exactly two columns: 'text' and an integer 'label'.
# The sources below are examples - replace them with your own data if needed.
import os

from datasets import load_dataset


def _read_vntc_folder(root_dir, categories):
    """Read `<root_dir>/<category>/*.txt` documents into a DataFrame.

    Args:
        root_dir: Folder that contains one sub-folder per category.
        categories: Ordered category folder names; a document's label is the
            index of its category in this list.

    Returns:
        DataFrame with columns 'text' and 'label' (empty if no .txt files).
    """
    rows = []
    for idx, cat in enumerate(categories):
        cat_dir = os.path.join(root_dir, cat)
        for file_name in sorted(os.listdir(cat_dir)):
            if not file_name.endswith('.txt'):
                continue
            # The VNTC text files are UTF-16 encoded
            with open(os.path.join(cat_dir, file_name), 'r', encoding='utf-16') as f:
                rows.append({'text': f.read().strip(), 'label': idx})
    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(rows, columns=['text', 'label'])


# ---------------------------------------------------------------------------
# Spam (0 = non-spam, 1 = spam)
# ---------------------------------------------------------------------------
# Spam dataset 1: ViSpamReviews from the Hugging Face Hub (no file upload needed)
ds_spam1 = load_dataset("SEACrowd/vispamreviews")
df_spam1 = pd.DataFrame(ds_spam1['train'])  # or concat the train/test splits if available
df_spam1 = df_spam1[['review', 'label']]  # text and label columns (adjust if they differ)
df_spam1.columns = ['text', 'label']
df_spam1['label'] = df_spam1['label'].astype(int)  # 0 non-spam, 1 spam

# Spam dataset 2: Vietnamese Spam Post from Kaggle (uploaded CSV)
df_spam2 = pd.read_csv('vietnamese_spam_post.csv')  # adjust the file name if it differs
df_spam2 = df_spam2[['post', 'label']]  # adjust the columns if they differ (e.g. 'text' instead of 'post')
df_spam2.columns = ['text', 'label']
df_spam2['label'] = df_spam2['label'].astype(int)

# Merge the two spam datasets
df_spam = pd.concat([df_spam1, df_spam2], ignore_index=True)
print(f'Spam dataset merged: {len(df_spam)} samples')

# ---------------------------------------------------------------------------
# Sentiment (0 = negative, 1 = neutral, 2 = positive)
# ---------------------------------------------------------------------------
# Sentiment dataset 1: UIT-VSFC from the Hugging Face Hub
ds_sent1 = load_dataset("uitnlp/vietnamese_students_feedback")
df_sent1_train = pd.DataFrame(ds_sent1['train'])[['sentence', 'sentiment']]
df_sent1_val = pd.DataFrame(ds_sent1['validation'])[['sentence', 'sentiment']]
df_sent1_test = pd.DataFrame(ds_sent1['test'])[['sentence', 'sentiment']]
df_sent1 = pd.concat([df_sent1_train, df_sent1_val, df_sent1_test], ignore_index=True)
df_sent1.columns = ['text', 'label']
df_sent1['label'] = df_sent1['label'].astype(int)  # 0 negative, 1 neutral, 2 positive

# Sentiment dataset 2: Vietnamese Sentiment Analyst from Kaggle (uploaded CSV)
df_sent2 = pd.read_csv('data.csv')  # adjust the file name if it differs
df_sent2 = df_sent2[['comment', 'label']]  # adjust the columns if they differ
df_sent2.columns = ['text', 'label']
# If the labels are strings, uncomment the next line to map them to integers:
# df_sent2['label'] = df_sent2['label'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Merge the two sentiment datasets
df_sentiment = pd.concat([df_sent1, df_sent2], ignore_index=True)
df_sentiment['label'] = df_sentiment['label'].astype(int)
print(f'Sentiment dataset merged: {len(df_sentiment)} samples')

# ---------------------------------------------------------------------------
# Category
# ---------------------------------------------------------------------------
# Category dataset 1: Vietnamese Text Classification from Kaggle (uploaded CSV)
df_cat1 = pd.read_csv('viet_text_class.csv')  # adjust the file name if it differs
df_cat1 = df_cat1[['text', 'label']]  # assumes these columns already exist
df_cat1['label'] = df_cat1['label'].astype(int)  # labels 0..n (about 10 classes)

# Category dataset 2: VNTC (optional - only used when the extracted folder exists)
train_dir = 'Train_Full/'  # path to the extracted training folder
test_dir = 'Test_Full/'  # extracted test folder; load it the same way to add it
categories = []
df_cat2 = pd.DataFrame(columns=['text', 'label'])
if os.path.isdir(train_dir):
    # Sorted so every run assigns the same label id to the same category;
    # plain files next to the category folders are ignored.
    categories = sorted(
        name for name in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, name))
    )
    df_cat2 = _read_vntc_folder(train_dir, categories)
else:
    print(f'VNTC folder {train_dir!r} not found - using the Kaggle category dataset only')

# NOTE(review): VNTC label ids are folder indices and are merged as-is with the
# Kaggle label ids. Confirm that both datasets use the same id -> category
# mapping, otherwise remap one of them before merging.
df_category = pd.concat([df_cat1, df_cat2], ignore_index=True) if not df_cat2.empty else df_cat1
df_category['label'] = df_category['label'].astype(int)
# To keep 5 classes as in the original setup, labels can be folded with:
#   df_category['label'] = df_category['label'] % 5
# num_labels for the category model must match the number of classes,
# i.e. len(df_category['label'].unique()).

print(f'Category dataset merged: {len(df_category)} samples')

# Final summary. The merged frames above are used directly; they are no longer
# rebuilt from the undefined sample lists spam_data/sentiment_data/category_data.
print(f'Spam dataset: {len(df_spam)} samples')
print(f'Sentiment dataset: {len(df_sentiment)} samples')
print(f'Category dataset: {len(df_category)} samples')

In [None]:
# Upload your own dataset (optional)
# from google.colab import files
# uploaded = files.upload()
# df_spam = pd.read_csv('your_spam_data.csv')

## 4. Tokenize Data

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the shared PhoBERT tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 256,
    }
    return tokenizer(examples['text'], **encode_options)

def prepare_dataset(df):
    """Turn a pandas DataFrame into a tokenized HuggingFace Dataset.

    The frame is converted with `Dataset.from_pandas` and then tokenized in
    batches with `tokenize_function`.
    """
    return Dataset.from_pandas(df).map(tokenize_function, batched=True)

# Tokenize the three DataFrames (spam, sentiment, category) in that order
spam_dataset, sentiment_dataset, category_dataset = (
    prepare_dataset(frame) for frame in (df_spam, df_sentiment, df_category)
)

print('Datasets prepared!')

## 5. Train Spam Detection Model

In [None]:
def train_model(dataset, num_labels, output_dir, epochs=3, seed=42):
    """Fine-tune PhoBERT for sequence classification and save the result.

    Args:
        dataset: Tokenized HuggingFace ``Dataset`` with a ``label`` column
            (as produced by ``prepare_dataset``).
        num_labels: Number of target classes; every label id must be in
            ``[0, num_labels - 1]``.
        output_dir: Directory for checkpoints, logs and the final model.
        epochs: Number of training epochs.
        seed: Seed for the train/eval split and for training, so that
            repeated runs are comparable.

    Returns:
        Tuple ``(model, trainer)`` with the fine-tuned model and its Trainer.

    Raises:
        ValueError: If the dataset is empty or contains a label id outside
            ``[0, num_labels - 1]``.
    """
    # Fail early with a readable message: an out-of-range label otherwise only
    # surfaces as an opaque error inside the loss computation.
    label_ids = list(dataset['label'])
    if not label_ids:
        raise ValueError('Cannot train on an empty dataset')
    lowest, highest = min(label_ids), max(label_ids)
    if lowest < 0 or highest >= num_labels:
        raise ValueError(
            f'Labels must be in [0, {num_labels - 1}] but found values from '
            f'{lowest} to {highest}; adjust num_labels or remap the labels'
        )

    # Load the pretrained encoder with a fresh classification head
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Hold out 20% for evaluation; seeded so the split is reproducible
    split = dataset.train_test_split(test_size=0.2, seed=seed)

    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        seed=seed,
    )

    def compute_metrics(eval_pred):
        """Report accuracy next to the evaluation loss."""
        predictions = np.argmax(eval_pred.predictions, axis=-1)
        return {'accuracy': accuracy_score(eval_pred.label_ids, predictions)}

    # Pads each batch to a common length
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split['train'],
        eval_dataset=split['test'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Save the final weights and config at the top level of output_dir
    model.save_pretrained(output_dir)

    return model, trainer

# Train the spam model (2 classes: ham=0, spam=1).
# Checkpoints, logs and the final model are written under ./spam_model.
print('Training Spam Detection Model...')
spam_model, spam_trainer = train_model(
    spam_dataset,
    num_labels=2,
    output_dir='./spam_model',
    epochs=3
)
print('Spam model trained!')

## 6. Train Sentiment Analysis Model

In [None]:
# Train the sentiment model (3 classes: negative=0, neutral=1, positive=2).
# Checkpoints, logs and the final model are written under ./sentiment_model.
print('Training Sentiment Analysis Model...')
sentiment_model, sentiment_trainer = train_model(
    sentiment_dataset,
    num_labels=3,
    output_dir='./sentiment_model',
    epochs=3
)
print('Sentiment model trained!')

## 7. Train Category Classification Model

In [None]:
# Train the category model.
# The number of classes is derived from the data instead of being hard-coded
# to 5: the classification head must have one output per label id, so it has
# to be larger than the highest label id present in the dataset.
num_category_labels = int(max(category_dataset['label'])) + 1
print('Training Category Classification Model...')
category_model, category_trainer = train_model(
    category_dataset,
    num_labels=num_category_labels,
    output_dir='./category_model',
    epochs=3
)
print('Category model trained!')

## 8. Save Tokenizer

In [None]:
# Save the tokenizer to ./tokenizer so it can be packaged together with the models
tokenizer.save_pretrained('./tokenizer')
print('Tokenizer saved!')

## 9. Test Models

In [None]:
def test_model(model, text, label_map):
    """Classify one text and return its label name and confidence.

    Args:
        model: Sequence-classification model to evaluate.
        text: The text to classify.
        label_map: Mapping from predicted class index to a display label.

    Returns:
        Tuple ``(label, confidence)`` where ``confidence`` is the softmax
        probability of the predicted class.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}
    model.to(device)
    model.eval()

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**batch).logits
        probabilities = torch.softmax(logits, dim=-1)
        best_index = torch.argmax(probabilities, dim=-1).item()
        score = probabilities[0][best_index].item()

    return label_map[best_index], score

# Smoke-test the trained models on a few hand-written Vietnamese sentences.
# The sentences are stored as proper UTF-8 text; mis-encoded characters would
# be tokenized as garbage and make the predictions meaningless.

# Spam detection
spam_labels = {0: 'Ham', 1: 'Spam'}
test_texts = [
    "Cuộc họp vào 3 giờ chiều",  # "The meeting is at 3 pm"
    "Bạn đã trúng thưởng 1 tỷ đồng!",  # "You have won 1 billion dong!"
]
print('\n=== Spam Detection Test ===')
for text in test_texts:
    label, conf = test_model(spam_model, text, spam_labels)
    print(f'Text: "{text}"')
    print(f'Prediction: {label} (confidence: {conf:.2%})\n')

# Sentiment analysis
sentiment_labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
test_texts = [
    "Cảm ơn bạn rất nhiều!",  # "Thank you very much!"
    "Dịch vụ rất tệ",  # "The service is very bad"
]
print('=== Sentiment Analysis Test ===')
for text in test_texts:
    label, conf = test_model(sentiment_model, text, sentiment_labels)
    print(f'Text: "{text}"')
    print(f'Prediction: {label} (confidence: {conf:.2%})\n')

## 10. Download Models

In [None]:
# Zip and download all models
!zip -r models.zip spam_model sentiment_model category_model tokenizer

from google.colab import files
files.download('models.zip')

print('\n‚úÖ Download complete!')
print('Extract models.zip and copy to PhoBERT-Service/models/ directory')

## 📝 Next Steps

1. Download file `models.zip`
2. Extract vào thư mục `PhoBERT-Service/models/`
3. Cấu trúc thư mục:
   ```
   PhoBERT-Service/models/
   ├── spam_model/
   ├── sentiment_model/
   ├── category_model/
   └── tokenizer/
   ```
4. Chạy PhoBERT service:
   ```bash
   cd PhoBERT-Service
   pip install -r requirements.txt
   uvicorn app.main:app --host 0.0.0.0 --port 8000
   ```
5. Cập nhật Email-System-Server `.env`:
   ```
   PHOBERT_URL=http://localhost:8000
   ```