# Installing Packages

In [None]:
# Mount Google Drive at /content/drive; the cells below read their CSV/JSON
# inputs from, and write their results to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
Co

# Training and Evaluating RoBERTa Model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Load the data: training and development splits stored on the mounted Drive.
# Both frames are read through their 'sentences' and 'classes' columns below.
train_data = pd.read_csv('/content/drive/MyDrive/train1.csv')
dev_data = pd.read_csv('/content/drive/MyDrive/dev1.csv')

# Create a PyTorch Dataset
class ArgumentDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        texts: Sequence of raw sentences (a list or a pandas Series).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Fixed length every encoded sentence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise both sequences as lists so that __getitem__ indexes by
        # position. A pandas Series would be indexed by *label*, which fails
        # (or silently picks the wrong row) when its index is not 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label of one example.

        ``input_ids`` and ``attention_mask`` are flattened to 1-D tensors and
        ``label`` is a ``torch.long`` scalar tensor.
        """
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Without truncation a sentence longer than max_len produces a
            # longer tensor and the DataLoader can no longer stack the batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set some parameters
max_len = 256     # every sentence is padded/truncated to this many tokens
batch_size = 8
epochs = 3

# Seed PyTorch's global generator before the model is created so that the
# randomly initialised classification head and the DataLoader shuffling
# order are repeatable from one run to the next.
torch.manual_seed(42)

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Create the PyTorch DataLoaders (only the training split is shuffled)
train_dataset = ArgumentDataset(train_data['sentences'], train_data['classes'], tokenizer, max_len)
dev_dataset = ArgumentDataset(dev_data['sentences'], dev_data['classes'], tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# Use a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Start training
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream.
# eps and weight_decay are passed explicitly to mirror the defaults of the
# transformers implementation (1e-6 and 0.0) rather than PyTorch's defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)


def _predict_dev_set():
    """Run the model over ``dev_loader`` and return ``(true, preds)``.

    Both values are flat lists of class ids: the gold labels read from each
    batch's ``'label'`` entry and the arg-max of the model's logits. The model
    is switched to eval mode and gradients are disabled for the whole pass.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            # Labels are only compared on the host, so they never need to
            # be moved to the device.
            gold.extend(batch['label'].numpy())
    return gold, predicted


for epoch in range(epochs):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print('Average training loss for epoch {}: '.format(epoch+1), avg_train_loss)

    # Evaluate on the dev set after each epoch
    true, preds = _predict_dev_set()

    accuracy = accuracy_score(true, preds)
    precision = precision_score(true, preds)
    recall = recall_score(true, preds)
    f1 = f1_score(true, preds)
    micro_f1 = f1_score(true, preds, average='micro')
    macro_f1 = f1_score(true, preds, average='macro')

    print('Accuracy on the dev set for epoch {}: {:.2%}'.format(epoch+1, accuracy))
    print('Precision on the dev set for epoch {}: {:.2%}'.format(epoch+1, precision))
    print('Recall on the dev set for epoch {}: {:.2%}'.format(epoch+1, recall))
    print('F1 Score on the dev set for epoch {}: {:.2%}'.format(epoch+1, f1))
    print('Micro-F1 Score on the dev set for epoch {}: {:.2%}'.format(epoch+1, micro_f1))
    print('Macro-F1 Score on the dev set for epoch {}: {:.2%}'.format(epoch+1, macro_f1))


# Evaluate on the dev set after all epochs
true, preds = _predict_dev_set()

accuracy = accuracy_score(true, preds)
print('Final accuracy on the dev set: ', accuracy)

# After all epochs
precision = precision_score(true, preds)
recall = recall_score(true, preds)
f1 = f1_score(true, preds)
micro_f1 = f1_score(true, preds, average='micro')
macro_f1 = f1_score(true, preds, average='macro')
print('Final precision on the dev set: {:.2%}'.format(precision))
print('Final recall on the dev set: {:.2%}'.format(recall))
print('Final F1 Score on the dev set: {:.2%}'.format(f1))
print('Final Micro-F1 Score on the dev set: {:.2%}'.format(micro_f1))
print('Final Macro-F1 Score on the dev set: {:.2%}'.format(macro_f1))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average training loss for epoch 1:  0.5649734784416931
Accuracy on the dev set for epoch 1: 75.44%
Precision on the dev set for epoch 1: 74.51%
Recall on the dev set for epoch 1: 73.54%
F1 Score on the dev set for epoch 1: 74.02%
Micro-F1 Score on the dev set for epoch 1: 75.44%
Macro-F1 Score on the dev set for epoch 1: 75.36%
Average training loss for epoch 2:  0.46982276494970027
Accuracy on the dev set for epoch 2: 74.51%
Precision on the dev set for epoch 2: 68.58%
Recall on the dev set for epoch 2: 85.68%
F1 Score on the dev set for epoch 2: 76.18%
Micro-F1 Score on the dev set for epoch 2: 74.51%
Macro-F1 Score on the dev set for epoch 2: 74.38%
Average training loss for epoch 3:  0.3971750112231245
Accuracy on the dev set for epoch 3: 75.95%
Precision on the dev set for epoch 3: 73.75%
Recall on the dev set for epoch 3: 76.79%
F1 Score on the dev set for epoch 3: 75.24%
Micro-F1 Score on the dev set for epoch 3: 75.95%
Macro-F1 Score on the dev set for epoch 3: 75.93%
Average t

# Saving the Trained RoBERTa Model

In [None]:
# Destination of the pickled model on the mounted Drive.
output_model_file = "/content/drive/MyDrive/RoBERTa_saved_model.pth"
# Directory that receives the tokenizer's vocabulary files.
# NOTE(review): './' is the runtime's working directory, not Drive - the
# vocabulary files presumably disappear with the Colab session; confirm this
# is intended.
output_vocab_file = './'

model_to_save = model
# torch.save is given the model object itself, so the whole module is pickled
# (not just its state_dict).
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


# Testing and Saving the Results

In [None]:
import torch.nn.functional as F

def predict(sentence):
    """Classify a single sentence with the fine-tuned model.

    Args:
        sentence: Raw text to classify.

    Returns:
        The string "1" when the model predicts class 1 (claim), otherwise
        "0" (premise).
    """
    model.eval()
    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=max_len,
        # padding= replaces the deprecated pad_to_max_length flag, and
        # truncation keeps over-long sentences within max_len tokens.
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
        # Softmax is monotonic, so the arg-max of the logits already is the
        # most probable class.
        predicted = torch.argmax(outputs[0], dim=-1)

    sentiment = "1" if predicted.item() == 1 else "0"
    return sentiment

# Hand-picked example sentences for a quick sanity check of predict().
sentences_premise = [
    "And in that context, of course, they're lifting and shifting some of the older workloads, but they're modernizing the entire business process flow.",
    "It's a number that is incredibly competitive in our industry, and we want to continue to keep it that way.",
    "I mean, sometimes it's not that you came up with some brilliant strategy, it's just like really good work consistently over a long period of time.",
    "On the international, on an FX neutral basis, the growth was 15% in Q3 and 19% in Q4.",
]

# Print the predicted class of every sentence, each followed by an empty line.
for sentence in sentences_premise:
    sentiment = predict(sentence)
    print(f"Sentiment: {sentiment}", end="\n\n")


Sentiment: 0

Sentiment: 0

Sentiment: 0

Sentiment: 0



In [None]:
import torch.nn.functional as F

def predict(sentence):
    """Return the predicted class of one sentence as the string "1" or "0".

    The sentence is tokenized to exactly ``max_len`` tokens, run through the
    fine-tuned model in eval mode without gradients, and the class with the
    highest logit is reported: "1" for class 1 (claim), "0" otherwise
    (premise).

    Args:
        sentence: Raw text to classify.
    """
    model.eval()
    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=max_len,
        # The deprecated pad_to_max_length flag is replaced by padding=, and
        # truncation guards against sentences longer than max_len tokens.
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
        # No softmax needed: it does not change which class scores highest.
        predicted = torch.argmax(outputs[0], dim=-1)

    sentiment = "1" if predicted.item() == 1 else "0"
    return sentiment

# Second batch of example sentences for a quick sanity check of predict().
sentences_claim = [
    "See, first of all, I'd say the opportunity for our shareholders when they think about Microsoft has never been better.",
    "APAC benefited from some of the product optimizations we did in the quarter.",
]

# Print the predicted class of every sentence, each followed by an empty line.
for sentence_claim in sentences_claim:
    sentiment1 = predict(sentence_claim)
    print(f"Sentiment: {sentiment1}", end="\n\n")


Sentiment: 1

Sentiment: 1



In [None]:
import json

# Load the test set. The encoding is stated explicitly so reading does not
# depend on the platform's default locale.
with open('/content/drive/MyDrive/ECC_Argument_Classification_Test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Add a 'Prediction' field holding the predicted class to every record
for data in test_data:
    sentence = data['sentence']  # text to classify
    sentiment = predict(sentence)  # "1" or "0"
    data['Prediction'] = sentiment  # store the prediction on the record

# Save the annotated test set
with open('/content/drive/MyDrive/RoBERTa_test_predicted.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f)

# K-Fold Cross Validation

In [None]:
# k-fold cross validation
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Load the data: training and development splits stored on the mounted Drive
train_data = pd.read_csv('/content/drive/MyDrive/train1.csv')
dev_data = pd.read_csv('/content/drive/MyDrive/dev1.csv')

# 創建一個 PyTorch Dataset
class ArgumentDataset(Dataset):
    """Sentence-classification dataset that tokenizes lazily, one item at a time.

    Args:
        texts: Sequence of raw sentences (a list or a pandas Series).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length each encoded sentence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Stored as lists so items are looked up by position, independent of
        # whatever index a pandas Series carries (e.g. after a fold split).
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode example ``item`` and return it as a dict of text and tensors."""
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Cut sentences longer than max_len; otherwise their tensors are
            # longer than the rest and the batch cannot be collated.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set some parameters
max_len = 256     # every sentence is padded/truncated to this many tokens
batch_size = 8
epochs = 3

# Seed PyTorch's global generator so weight initialisation and DataLoader
# shuffling are repeatable between runs.
torch.manual_seed(42)

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Create the dev-set DataLoader (kept in file order)
dev_dataset = ArgumentDataset(dev_data['sentences'], dev_data['classes'], tokenizer, max_len)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# Use a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialise the stratified 5-fold splitter
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop over the folds
for fold, (train_idx, val_idx) in enumerate(skf.split(train_data['sentences'], train_data['classes'])):
    print(f"訓練折叠 {fold+1}...")

    # Split the training data into this fold's training part and its
    # held-out validation part.
    train_fold_data = train_data.iloc[train_idx].reset_index(drop=True)
    val_fold_data = train_data.iloc[val_idx].reset_index(drop=True)

    # Create the PyTorch DataLoaders for this fold
    train_dataset = ArgumentDataset(train_fold_data['sentences'], train_fold_data['classes'], tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = ArgumentDataset(val_fold_data['sentences'], val_fold_data['classes'], tokenizer, max_len)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Start every fold from the pretrained checkpoint. Re-using the model of
    # the previous fold would mean it has already been trained on this fold's
    # held-out sentences, which invalidates the cross-validation scores.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    model.to(device)

    # Start training
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay mirror the defaults of the transformers implementation.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1} 的平均訓練損失: ', avg_train_loss)

        # After each epoch, evaluate the model on this fold's held-out split
        model.eval()
        preds = []
        true_labels = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(true_labels, preds)
        precision = precision_score(true_labels, preds)
        recall = recall_score(true_labels, preds)
        f1 = f1_score(true_labels, preds)

        print(f'Epoch {epoch+1} 的验证集精度: {accuracy:.2f}')
        print(f'Epoch {epoch+1} 的验证集精确度: {precision:.2f}')
        print(f'Epoch {epoch+1} 的验证集召回率: {recall:.2f}')
        print(f'Epoch {epoch+1} 的验证集 F1 分数: {f1:.2f}')

    print(f"折叠 {fold+1} 完成！")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


訓練折叠 1...




Epoch 1 的平均訓練損失:  0.5728474644963274
Epoch 1 的验证集精度: 0.74
Epoch 1 的验证集精确度: 0.70
Epoch 1 的验证集召回率: 0.77
Epoch 1 的验证集 F1 分数: 0.74
Epoch 2 的平均訓練損失:  0.4713335047888848
Epoch 2 的验证集精度: 0.72
Epoch 2 的验证集精确度: 0.64
Epoch 2 的验证集召回率: 0.90
Epoch 2 的验证集 F1 分数: 0.75
Epoch 3 的平均訓練損失:  0.3906515409123421
Epoch 3 的验证集精度: 0.74
Epoch 3 的验证集精确度: 0.68
Epoch 3 的验证集召回率: 0.83
Epoch 3 的验证集 F1 分数: 0.75
折叠 1 完成！
訓練折叠 2...




Epoch 1 的平均訓練損失:  0.363246938540149
Epoch 1 的验证集精度: 0.75
Epoch 1 的验证集精确度: 0.73
Epoch 1 的验证集召回率: 0.74
Epoch 1 的验证集 F1 分数: 0.73
Epoch 2 的平均訓練損失:  0.27223175263377963
Epoch 2 的验证集精度: 0.75
Epoch 2 的验证集精确度: 0.74
Epoch 2 的验证集召回率: 0.73
Epoch 2 的验证集 F1 分数: 0.73
Epoch 3 的平均訓練損失:  0.18891896272243777
Epoch 3 的验证集精度: 0.74
Epoch 3 的验证集精确度: 0.70
Epoch 3 的验证集召回率: 0.80
Epoch 3 的验证集 F1 分数: 0.75
折叠 2 完成！
訓練折叠 3...




Epoch 1 的平均訓練損失:  0.1855989531247838
Epoch 1 的验证集精度: 0.73
Epoch 1 的验证集精确度: 0.69
Epoch 1 的验证集召回率: 0.77
Epoch 1 的验证集 F1 分数: 0.73
Epoch 2 的平均訓練損失:  0.12145207114859734
Epoch 2 的验证集精度: 0.73
Epoch 2 的验证集精确度: 0.70
Epoch 2 的验证集召回率: 0.75
Epoch 2 的验证集 F1 分数: 0.72
Epoch 3 的平均訓練損失:  0.09215547784397225
Epoch 3 的验证集精度: 0.72
Epoch 3 的验证集精确度: 0.73
Epoch 3 的验证集召回率: 0.64
Epoch 3 的验证集 F1 分数: 0.68
折叠 3 完成！
訓練折叠 4...




Epoch 1 的平均訓練損失:  0.10318459412752118
Epoch 1 的验证集精度: 0.74
Epoch 1 的验证集精确度: 0.70
Epoch 1 的验证集召回率: 0.79
Epoch 1 的验证集 F1 分数: 0.74
Epoch 2 的平均訓練損失:  0.06750280207839732
Epoch 2 的验证集精度: 0.72
Epoch 2 的验证集精确度: 0.67
Epoch 2 的验证集召回率: 0.80
Epoch 2 的验证集 F1 分数: 0.73
Epoch 3 的平均訓練損失:  0.06265518906264933
Epoch 3 的验证集精度: 0.72
Epoch 3 的验证集精确度: 0.69
Epoch 3 的验证集召回率: 0.76
Epoch 3 的验证集 F1 分数: 0.72
折叠 4 完成！
訓練折叠 5...




Epoch 1 的平均訓練損失:  0.06473695933430267
Epoch 1 的验证集精度: 0.74
Epoch 1 的验证集精确度: 0.74
Epoch 1 的验证集召回率: 0.69
Epoch 1 的验证集 F1 分数: 0.72
Epoch 2 的平均訓練損失:  0.04313480087358519
Epoch 2 的验证集精度: 0.73
Epoch 2 的验证集精确度: 0.69
Epoch 2 的验证集召回率: 0.78
Epoch 2 的验证集 F1 分数: 0.73
Epoch 3 的平均訓練損失:  0.04182003367262126
Epoch 3 的验证集精度: 0.73
Epoch 3 的验证集精确度: 0.70
Epoch 3 的验证集召回率: 0.75
Epoch 3 的验证集 F1 分数: 0.72
折叠 5 完成！


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Load the data: training and development splits stored on the mounted Drive
train_data = pd.read_csv('/content/drive/MyDrive/train1.csv')
dev_data = pd.read_csv('/content/drive/MyDrive/dev1.csv')

# 創建一個 PyTorch Dataset
class ArgumentDataset(Dataset):
    """Dataset yielding tokenized sentences with their class labels.

    Args:
        texts: Sequence of raw sentences (a list or a pandas Series).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Number of tokens every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Convert to lists so that lookups in __getitem__ are positional even
        # when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return how many examples the dataset holds."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return text, 1-D ``input_ids``/``attention_mask`` and a long label tensor."""
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Required for fixed-size batches: longer sentences would
            # otherwise yield tensors that cannot be stacked.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set the parameters
max_len = 256     # every sentence is padded/truncated to this many tokens
batch_size = 8
epochs = 3

# Fix PyTorch's global seed so model initialisation and DataLoader shuffling
# can be reproduced on a re-run.
torch.manual_seed(42)

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Create the dev-set DataLoader (kept in file order)
dev_dataset = ArgumentDataset(dev_data['sentences'], dev_data['classes'], tokenizer, max_len)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# Use a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialise the stratified 5-fold splitter
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop over the folds
for fold, (train_idx, val_idx) in enumerate(skf.split(train_data['sentences'], train_data['classes'])):
    print(f"訓練折叠 {fold+1}...")

    # Rows used for training in this fold and the rows held out for validation
    train_fold_data = train_data.iloc[train_idx].reset_index(drop=True)
    val_fold_data = train_data.iloc[val_idx].reset_index(drop=True)

    # Create the PyTorch DataLoaders for this fold
    train_dataset = ArgumentDataset(train_fold_data['sentences'], train_fold_data['classes'], tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = ArgumentDataset(val_fold_data['sentences'], val_fold_data['classes'], tokenizer, max_len)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Reload the pretrained weights for every fold: a model carried over from
    # the previous fold has already seen this fold's held-out rows during
    # training, so its validation scores would not be a fair estimate.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    model.to(device)

    # Start training
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay mirror the defaults of the transformers implementation.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f'折叠 {fold+1}, Epoch {epoch+1} 的平均訓練損失: {avg_train_loss:.2f}')

        # After each epoch, evaluate the model on this fold's held-out split
        model.eval()
        preds = []
        true_labels = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(true_labels, preds)
        precision = precision_score(true_labels, preds)
        recall = recall_score(true_labels, preds)
        f1 = f1_score(true_labels, preds)

        print(f'折叠 {fold+1}, Epoch {epoch+1} - 驗證集accuracy: {accuracy:.2f}, 驗證集precision: {precision:.2f}, 驗證集recall: {recall:.2f}, 驗證集 F1-score: {f1:.2f}')

    print(f"折叠 {fold+1} 完成！")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


訓練折叠 1...
折叠 1, Epoch 1 的平均訓練損失: 0.57
折叠 1, Epoch 1 - 驗證集accuracy: 0.75, 驗證集precision: 0.73, 驗證集recall: 0.73, 驗證集 F1-score: 0.73
折叠 1, Epoch 2 的平均訓練損失: 0.46
折叠 1, Epoch 2 - 驗證集accuracy: 0.75, 驗證集precision: 0.70, 驗證集recall: 0.84, 驗證集 F1-score: 0.76
折叠 1, Epoch 3 的平均訓練損失: 0.39
折叠 1, Epoch 3 - 驗證集accuracy: 0.75, 驗證集precision: 0.70, 驗證集recall: 0.83, 驗證集 F1-score: 0.76
折叠 1 完成！
訓練折叠 2...




折叠 2, Epoch 1 的平均訓練損失: 0.36
折叠 2, Epoch 1 - 驗證集accuracy: 0.75, 驗證集precision: 0.73, 驗證集recall: 0.74, 驗證集 F1-score: 0.73
折叠 2, Epoch 2 的平均訓練損失: 0.26
折叠 2, Epoch 2 - 驗證集accuracy: 0.72, 驗證集precision: 0.65, 驗證集recall: 0.86, 驗證集 F1-score: 0.74
折叠 2, Epoch 3 的平均訓練損失: 0.18
折叠 2, Epoch 3 - 驗證集accuracy: 0.73, 驗證集precision: 0.68, 驗證集recall: 0.83, 驗證集 F1-score: 0.74
折叠 2 完成！
訓練折叠 3...




折叠 3, Epoch 1 的平均訓練損失: 0.19
折叠 3, Epoch 1 - 驗證集accuracy: 0.72, 驗證集precision: 0.66, 驗證集recall: 0.88, 驗證集 F1-score: 0.75
折叠 3, Epoch 2 的平均訓練損失: 0.13
折叠 3, Epoch 2 - 驗證集accuracy: 0.75, 驗證集precision: 0.78, 驗證集recall: 0.65, 驗證集 F1-score: 0.71
折叠 3, Epoch 3 的平均訓練損失: 0.11
折叠 3, Epoch 3 - 驗證集accuracy: 0.73, 驗證集precision: 0.67, 驗證集recall: 0.85, 驗證集 F1-score: 0.75
折叠 3 完成！
訓練折叠 4...




折叠 4, Epoch 1 的平均訓練損失: 0.10
折叠 4, Epoch 1 - 驗證集accuracy: 0.74, 驗證集precision: 0.76, 驗證集recall: 0.66, 驗證集 F1-score: 0.71
折叠 4, Epoch 2 的平均訓練損失: 0.07
折叠 4, Epoch 2 - 驗證集accuracy: 0.74, 驗證集precision: 0.72, 驗證集recall: 0.75, 驗證集 F1-score: 0.74
折叠 4, Epoch 3 的平均訓練損失: 0.06
折叠 4, Epoch 3 - 驗證集accuracy: 0.72, 驗證集precision: 0.68, 驗證集recall: 0.81, 驗證集 F1-score: 0.74
折叠 4 完成！
訓練折叠 5...




折叠 5, Epoch 1 的平均訓練損失: 0.06
折叠 5, Epoch 1 - 驗證集accuracy: 0.75, 驗證集precision: 0.77, 驗證集recall: 0.66, 驗證集 F1-score: 0.71
折叠 5, Epoch 2 的平均訓練損失: 0.05
折叠 5, Epoch 2 - 驗證集accuracy: 0.74, 驗證集precision: 0.70, 驗證集recall: 0.77, 驗證集 F1-score: 0.74
折叠 5, Epoch 3 的平均訓練損失: 0.04
折叠 5, Epoch 3 - 驗證集accuracy: 0.75, 驗證集precision: 0.74, 驗證集recall: 0.74, 驗證集 F1-score: 0.74
折叠 5 完成！
