### load 資料集

[jpwahle/machine-paraphrase-dataset](https://huggingface.co/datasets/jpwahle/machine-paraphrase-dataset)

In [1]:
from datasets import load_dataset
ds = load_dataset("jpwahle/machine-paraphrase-dataset")
print(ds['train'])

Dataset({
    features: ['text', 'label', 'dataset', 'method'],
    num_rows: 200767
})


In [2]:
from datasets import DatasetDict

train_test = ds['train'].train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test'],
    'test': ds['test']
})

### load Longformer model

In [3]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification

# 載入 Tokenizer 和模型
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=2  # 根據分類任務的類別數調整
)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### tokenize

In [4]:
def preprocess_function(examples):
    return tokenizer(
        examples['text'],  # 假設文字欄位是 'text'
        truncation=True,
        padding='max_length',
        max_length=512
    )

In [5]:
# 對 train 和 test 資料進行預處理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/160613 [00:00<?, ? examples/s]

Map:   0%|          | 0/40154 [00:00<?, ? examples/s]

### 先使用小部分資料測試訓練

In [6]:
tokenized_datasets['train'] = tokenized_datasets['train'].shuffle(seed=42).select(range(int(0.1 * len(tokenized_datasets['train']))))

In [7]:
tokenized_datasets['train'].shape

(16061, 6)

### device check

In [8]:
import os, torch
print(torch.cuda.is_available())  # True 表示 GPU 可用
print(torch.cuda.device_count())  # 返回可用 GPU 的數量
print(torch.cuda.get_device_name(0))  # 確認 GPU 的名稱
print(torch.cuda.get_device_name(1))  # 確認 GPU 的名稱

True
2
NVIDIA GeForce RTX 4090
NVIDIA GeForce RTX 3060


### 訓練

In [9]:
from transformers import TrainingArguments, Trainer, AdamW

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=2e-5,
    save_total_limit=1,
    weight_decay=0.01,
    fp16=True,
    # Enable multi-GPU training
    dataloader_num_workers=4,
    report_to="none"
)

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),  # β1=0.9, β2=0.999
    eps=1e-8  # ǫ=1e-8
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None)
)

trainer.train()


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss
1,0.0094,0.078352


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


TrainOutput(global_step=251, training_loss=0.05987613882676539, metrics={'train_runtime': 3225.0029, 'train_samples_per_second': 4.98, 'train_steps_per_second': 0.078, 'total_flos': 5274846867339264.0, 'train_loss': 0.05987613882676539, 'epoch': 1.0})

### evaluate

In [10]:
results = trainer.evaluate()
print(results)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


{'eval_loss': 0.07835180312395096, 'eval_runtime': 1660.6663, 'eval_samples_per_second': 24.179, 'eval_steps_per_second': 3.023, 'epoch': 1.0}


### 儲存模型 (有路徑)

In [11]:
model.save_pretrained('./test2/finetuned-longformer')
tokenizer.save_pretrained('./test2/finetuned-longformer')

('./test2/finetuned-longformer/tokenizer_config.json',
 './test2/finetuned-longformer/special_tokens_map.json',
 './test2/finetuned-longformer/vocab.json',
 './test2/finetuned-longformer/merges.txt',
 './test2/finetuned-longformer/added_tokens.json')

### load 模型做測試 (有路徑)

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 加載模型
model = AutoModelForSequenceClassification.from_pretrained('./test2/finetuned-longformer')

# 加載分詞器
tokenizer = AutoTokenizer.from_pretrained('./test2/finetuned-longformer')

In [13]:
from datasets import load_dataset
ds = load_dataset("jpwahle/machine-paraphrase-dataset")

In [14]:
def preprocess_function(examples):
    return tokenizer(
        examples['text'],  # 假設文字欄位是 'text'
        truncation=True,
        padding='max_length',
        max_length=512
    )

### tokenize test data

In [15]:
tokenized_datasets = ds['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/384291 [00:00<?, ? examples/s]

### 裁切想測試的內容

In [16]:
wiki_data = tokenized_datasets.filter(lambda x: x['dataset'] == 'arxiv' and x['method'] == 'spinnerchief')

Filter:   0%|          | 0/384291 [00:00<?, ? examples/s]

In [17]:
wiki_data.shape

(85322, 6)

In [18]:
sampled_wiki_data = wiki_data.shuffle(seed=42).select(range(int(0.001 * len(wiki_data))))

In [19]:
sampled_wiki_data.shape

(85, 6)

### 處理成 torch dataloader 做測試

In [20]:
from torch.utils.data import DataLoader
import torch

# 移除不需要的列（如 'text', 'dataset', 'method' 等）
filtered_dataset = sampled_wiki_data.remove_columns(['text', 'dataset', 'method'])

# 創建 DataLoader
test_dataloader = DataLoader(filtered_dataset, batch_size=8, collate_fn=lambda x: {k: torch.tensor([d[k] for d in x]) for k in x[0]})

In [21]:
from tqdm import tqdm 

model.eval()
all_preds = []
all_labels = []

for batch in tqdm(test_dataloader, desc="Testing", unit="batch"):
    with torch.no_grad():
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = outputs.logits.argmax(dim=-1)
    
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(batch['label'].cpu().numpy())

Testing: 100%|██████████| 11/11 [01:45<00:00,  9.61s/batch]


### 測試結果

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")
print(f"f1 score: {f1}")

Test Accuracy: 0.788235294117647
f1 score: 0.7631578947368421


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

class_labels = ['paraphased', 'original']


cm = confusion_matrix(all_labels, all_preds)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues')

plt.xticks(ticks=[0.5, 1.5], labels=class_labels)
plt.yticks(ticks=[0.5, 1.5], labels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")

plt.title("Confusion Matrix")

plt.show()

ModuleNotFoundError: No module named 'matplotlib'