In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
def load_sentiment_frame(csv_path):
    """Load one split of the sentiment CSV and prepare it for training.

    Rows missing either ``text`` or ``sentiment`` are dropped, ``text`` is
    coerced to ``str``, and ``sentiment`` is renamed to ``label`` and encoded
    as an integer class id: 2 for 'positive', 1 for 'neutral', 0 for any
    other value.

    Args:
        csv_path: Path of an ISO-8859-1 encoded CSV that has at least the
            ``text`` and ``sentiment`` columns.

    Returns:
        A new ``pandas.DataFrame``; no frame is mutated in place.
    """
    label_ids = {'positive': 2, 'neutral': 1}
    frame = (
        pd.read_csv(csv_path, encoding='ISO-8859-1')
        .dropna(subset=['text', 'sentiment'])
        .rename(columns={'sentiment': 'label'})
    )
    return frame.assign(
        text=frame['text'].astype(str),
        # NOTE(review): every value other than 'positive'/'neutral' silently
        # becomes class 0, including misspelled or differently-cased labels.
        label=frame['label'].map(lambda name: label_ids.get(name, 0)).astype(int),
    )


# Both splits go through the exact same preparation.
df_train = load_sentiment_frame("data/train.csv")
df_test = load_sentiment_frame("data/test.csv")

In [4]:
# Hugging Face Hub id of the checkpoint to fine-tune. The BERT id is kept as a
# commented-out alternative to the DistilBERT checkpoint that is active below.
# model_name = 'google-bert/bert-base-uncased'
model_name = 'distilbert/distilbert-base-uncased'
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
import spacy

# Load the small English pipeline without the named-entity recognizer and the
# dependency parser; the cleaning below only reads lemma / stop-word /
# punctuation attributes, so those two components are skipped for speed.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])


def _clean_doc(doc):
    """Join the lower-cased lemmas of all tokens that are neither stop words nor punctuation."""
    return " ".join(
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    )


def spacy_clean(text):
    """Return `text` lemmatized and lower-cased, with stop words and punctuation removed.

    Args:
        text: A single input string.

    Returns:
        The kept lemmas joined by single spaces (an empty string if nothing is kept).
    """
    return _clean_doc(nlp(text))


def spacy_clean_batch(batch):
    """Batched `Dataset.map` function that overwrites "text" with its cleaned form.

    The batch is streamed through `nlp.pipe` so spaCy processes the texts
    together instead of paying the per-call overhead of `nlp(text)` for each one.

    Args:
        batch: Mapping with a "text" key holding a list of strings.

    Returns:
        ``{"text": [...]}`` with one cleaned string per input, in the same order.
    """
    return {"text": [_clean_doc(doc) for doc in nlp.pipe(batch["text"])]}


def clean_dataset(frame):
    """Build a Dataset from the text/label columns of `frame` and clean its text once."""
    return Dataset.from_pandas(frame[['text', 'label']]).map(
        spacy_clean_batch,
        batched=True,
        batch_size=500,
        num_proc=4,  # adjust or remove if you don't want multiprocessing
    )


# Each split is cleaned exactly once, and both splits end up with the same
# columns. Do not map the cleaner over an already cleaned column: a second pass
# can remove more words, because a lemma can itself be a stop word. To keep the
# raw text as well, return it under an extra key from `spacy_clean_batch`.
train_dataset = clean_dataset(df_train)
test_dataset = clean_dataset(df_test)

Map (num_proc=4):   0%|          | 0/27480 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3534 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/27480 [00:00<?, ? examples/s]

In [6]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch to a fixed length of 64.

    Inputs are truncated and padded to `max_length`; the short length is
    chosen to lower training time.
    """
    return tokenizer(
        examples['text'],
        max_length=64,
        truncation=True,
        padding='max_length',
    )


# Tokenize both splits in batches (train first, then test).
tokenized_train_dataset, tokenized_test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
]

Map:   0%|          | 0/27480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [7]:
import torch


def pick_device():
    """Return the preferred available torch device: CUDA, then Apple MPS, then CPU.

    Returns:
        A ``torch.device`` for the first backend that reports itself available.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # getattr guard: tolerate torch builds that do not expose an MPS backend
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


device = pick_device()
print("Using device:", device)

Using device: mps


In [13]:
# Sequence-classification head with 3 output classes on top of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# LoRA has to target the attention projections by their actual module names,
# which differ per architecture: DistilBERT names them q_lin / v_lin, while
# BERT names them query / value.
lora_target_modules = (
    ["q_lin", "v_lin"] if "distilbert" in model_name.lower() else ["query", "value"]
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0.1,
    bias="all",
    task_type=TaskType.SEQ_CLS
)

# NOTE(review): learning_rate=2e-3 is far above the ~2e-5..5e-5 range usually
# used when every weight of a BERT-style model is trained; it looks tuned for
# the LoRA path, which is currently disabled below -- confirm before a long run.
# NOTE(review): no evaluation schedule is set here, so eval metrics are
# presumably only produced by an explicit trainer.evaluate() call -- confirm.
training_args = TrainingArguments(
    output_dir="./results/" + model_name,
    learning_rate=2e-3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=3,
    #gradient_checkpointing=True,
    num_train_epochs=30,
    dataloader_num_workers=5,
    logging_steps=400,
    weight_decay=0.01,
)

# Uncomment to train LoRA adapters instead of the full model.
# model = get_peft_model(model, lora_config)
# Move the (optionally PEFT-wrapped) model to the selected device once.
model = model.to(device)

# Make sure the classification head is trainable. The substring test matches
# both "classifier.*" and "pre_classifier.*" parameters. While LoRA is disabled
# this does not change anything a freshly loaded model would not already have.
for name, param in model.named_parameters():
    if "classifier" in name:
        param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def compute_metrics(p):
    """Compute accuracy for an evaluation pass.

    Args:
        p: Object exposing `predictions` (per-class scores, reduced with
            argmax over the last axis) and `label_ids` (reference labels).

    Returns:
        A dict with a single 'accuracy' entry.
    """
    predicted_classes = p.predictions.argmax(-1)
    return {'accuracy': accuracy_score(p.label_ids, predicted_classes)}

In [15]:
from transformers import TrainerCallback

class LossAccuracyLogger(TrainerCallback):
    """Trainer callback that records logged loss and evaluation accuracy.

    Each record is an ``(epoch, value)`` tuple, where the epoch is read from
    the trainer state at the moment the log entry is emitted.
    """

    def __init__(self):
        # (epoch, loss) pairs, one per log entry that contains "loss"
        self.train_loss = []
        # (epoch, accuracy) pairs, one per log entry that contains "eval_accuracy"
        self.eval_accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the tracked metrics found in `logs`; do nothing when `logs` is None."""
        if logs is None:
            return
        tracked = (
            ("loss", self.train_loss),
            ("eval_accuracy", self.eval_accuracy),
        )
        for metric_key, history in tracked:
            if metric_key in logs:
                history.append((state.epoch, logs[metric_key]))


In [16]:
# Keep a handle on the callback so its recorded history can be read after training.
logger_callback = LossAccuracyLogger()

# Wire the model, hyper-parameters, tokenized splits, metric function and the
# logging callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[logger_callback],
    train_dataset=tokenized_train_dataset,
    # the tokenized test split is used as the evaluation set
    eval_dataset=tokenized_test_dataset,
)

In [None]:
# --- Add this new cell after Cell 2 and Cell 3 have been run ---

import matplotlib.pyplot as plt
import numpy as np


def _token_lengths(texts):
    """Return, for each text, the number of token ids `tokenizer.encode` produces."""
    return [len(tokenizer.encode(text)) for text in texts]


# NOTE(review): lengths are measured on the df_train / df_test text, not on the
# spaCy-cleaned text in train_dataset / test_dataset that is actually tokenized
# for training -- confirm this is the intended input.
# Token length of every text in the training set and in the test set
train_token_lengths = _token_lengths(df_train['text'])
test_token_lengths = _token_lengths(df_test['text'])

# Merge all lengths and find the longest one
all_token_lengths = train_token_lengths + test_token_lengths
max_len = max(all_token_lengths)

print(f"資料集中最長的 Token 長度是: {max_len}")

# (Optional) plot the length distribution to help choose max_length
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_token_lengths, bins=50)  # bins can be adjusted for a finer-grained view
ax.set_title('Token 長度分佈圖')
ax.set_xlabel('Token 長度')
ax.set_ylabel('文本數量')
# Mark a commonly used percentile on the plot, e.g. 95% or 99%
percentile_95 = np.percentile(all_token_lengths, 95)
ax.axvline(percentile_95, color='red', linestyle='dashed', linewidth=1)
ax.text(
    percentile_95 * 1.05,
    ax.get_ylim()[1] * 0.9,
    f'95th percentile: {int(percentile_95)}',
)
plt.show()

print(f"資料集中 95% 的文本 Token 長度小於或等於: {int(percentile_95)}")

In [None]:
# Train the model: run the training loop of the `trainer` built above with its
# configured arguments, datasets and logging callback.
trainer.train()

Step,Training Loss


In [None]:
# Save the trained model under ./results/<model_name>, i.e. the same path
# string that was passed as output_dir to TrainingArguments.
trainer.save_model("./results/" + model_name)

In [None]:
# Evaluate the model on the evaluation set and show the resulting metrics
results = trainer.evaluate()
print(results)

import matplotlib.pyplot as plt


def _split_history(history):
    """Split a list of (epoch, value) pairs into an (epochs, values) pair.

    Args:
        history: List of 2-tuples as recorded by the logging callback.

    Returns:
        Two tuples, epochs and values. Both are empty when `history` is empty,
        because unpacking ``zip(*[])`` into two names would raise ValueError.
    """
    if not history:
        return (), ()
    epochs, values = zip(*history)
    return epochs, values


# Unpack the epoch and values
train_epochs, train_losses = _split_history(logger_callback.train_loss)
eval_epochs, eval_accuracies = _split_history(logger_callback.eval_accuracy)

fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: training loss at every logged point
loss_ax.plot(train_epochs, train_losses, marker='o')
loss_ax.set_title("Training Loss per Epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")

# Right panel: evaluation accuracy.
# NOTE(review): unless evaluation is scheduled during training, this panel
# presumably shows only the point logged by trainer.evaluate() above -- confirm.
accuracy_ax.plot(eval_epochs, eval_accuracies, marker='o', color='green')
accuracy_ax.set_title("Evaluation Accuracy per Epoch")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")

fig.tight_layout()
plt.show()
