In [None]:
# Install required libraries
!pip install -q -U transformers datasets accelerate evaluate

from datasets import load_dataset

# Load the dataset
ds = load_dataset('thainq107/ntc-scv')

In [None]:
# Import tokenizer
from transformers import AutoTokenizer

# Pretrained checkpoint to fine-tune (swap in "bert-base-uncased" if needed)
model_name = "distilbert-base-uncased"

# Fast tokenizer matching the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Sequence length used for padding/truncation: 100 tokens, but never more
# than the tokenizer's own maximum
max_seq_length = min(100, tokenizer.model_max_length)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize the ``preprocessed_sentence`` column and carry the labels along.

    Args:
        examples: Mapping with at least the keys ``"preprocessed_sentence"``
            and ``"label"``.

    Returns:
        The tokenizer output, padded and truncated to ``max_seq_length``,
        with an added ``"label"`` entry copied from ``examples``.
    """
    encoded = tokenizer(
        examples["preprocessed_sentence"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    # Keep the targets next to the encoded inputs
    encoded["label"] = examples["label"]
    return encoded

# Tokenize the dataset in batches; `desc` labels the progress bar
processed_dataset = ds.map(preprocess_function, batched=True, desc="Running tokenizer on dataset")

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Number of target classes for the classification head
num_labels = 2

# Configuration from the pretrained checkpoint, overriding the label count
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, finetuning_task="text-classification")

# Sequence-classification model built from the same checkpoint and that configuration
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# below calls metric.compute(...) on this object.
metric = evaluate.load("accuracy")

# Compute metrics function
def compute_metrics(eval_pred):
    """Turn raw model outputs into the metric computed on the evaluation set.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            an array of per-class scores, or a tuple whose first element is
            that array (as happens when the model returns extra outputs
            alongside the logits). ``labels`` holds the reference class ids.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids and
        the reference labels.
    """
    predictions, labels = eval_pred
    # When several outputs are returned, the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Pick the highest-scoring class along the last (class) axis.
    predicted_ids = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predicted_ids, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters and checkpointing policy for the fine-tuning run
training_args = TrainingArguments(
    output_dir="save_model",             # where checkpoints are written
    num_train_epochs=10,                 # total passes over the training split
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    eval_strategy="epoch",               # evaluate once per epoch ...
    save_strategy="epoch",               # ... and checkpoint on the same schedule
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
)

# Initialize the Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases (this notebook installs the
# latest version with `-U`).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# Train the model; evaluation and checkpointing follow `training_args`
trainer.train()