參考資料

    https://www.kaggle.com/code/salmarashwan/llm-prompt-recovery-fine-tuning-t5-for-success
    https://www.kaggle.com/code/richolson/t5-prompt-scoring-playground

In [None]:
import pandas as pd # import packages
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
train_df = pd.read_csv('/kaggle/input/nlp-49/01.csv') # load the train and test data
test_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv')
train_df.head() # preview the first rows of the training frame

In [None]:
def preprocess_data(original_text, rewritten_text):
    """Return the question string given to the model for one text pair.

    Both arguments are interpolated unchanged into a fixed question
    template; no cleaning or truncation happens here.
    """
    question = f"What was the base for rewriting the prompt with {original_text} and {rewritten_text} ?"
    return question

In [None]:
class PromptData(Dataset):
    """Dataset that tokenizes (original_text, rewritten_text) rows for a seq2seq model.

    Each item is built from one row of ``dataframe``: the two text columns are
    combined by ``preprocess_data`` into a single input string, which is
    tokenized to a fixed length. In training mode the ``rewrite_prompt``
    column is tokenized as well and returned under ``labels``.

    Args:
        dataframe: pandas DataFrame with ``original_text`` and
            ``rewritten_text`` columns (plus ``rewrite_prompt`` when
            ``is_training`` is true).
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors and exposing ``pad_token_id``.
        max_input_len: length the input text is truncated/padded to.
        max_label_len: length the label text is truncated/padded to.
        is_training: when true, each item also carries ``labels``.
    """

    # Label value skipped by the loss; padding positions in ``labels`` are set
    # to this so the model is not trained to predict padding.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_input_len=128, max_label_len=128, is_training=True):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_label_len = max_label_len
        self.is_training = is_training

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenize ``text``, truncated and padded to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed). In training mode it also contains ``labels``, where
            every padding position has been replaced by ``IGNORE_INDEX``.
        """
        row = self.dataframe.iloc[idx]
        input_text = preprocess_data(row['original_text'], row['rewritten_text'])

        input_encodings = self._encode(input_text, self.max_input_len)
        item = {
            'input_ids': input_encodings['input_ids'].squeeze(0),  # drop the batch dimension
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
        }

        if self.is_training:
            label_encodings = self._encode(row['rewrite_prompt'], self.max_label_len)
            label_ids = label_encodings['input_ids'].squeeze(0)
            # Labels are padded to a fixed length; without masking, the loss
            # would also be computed on those padding tokens.
            pad_id = self.tokenizer.pad_token_id
            if pad_id is not None:
                label_ids = label_ids.masked_fill(label_ids == pad_id, self.IGNORE_INDEX)
            item['labels'] = label_ids

        return item

In [None]:
# Local snapshot of the pretrained checkpoint, shared by the tokenizer and the
# model (the original author's note identifies it as T5-base).
t5_checkpoint_dir = '/kaggle/input/t5/transformers/default/1/snapshots/a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1'
tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint_dir)

# Fix the RNG seed so repeated training runs are consistent (the value itself is arbitrary).
torch.manual_seed(2)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [None]:
# Wrap the training frame (labels included) and serve it in shuffled batches of 8.
train_data = PromptData(dataframe=train_df, tokenizer=tokenizer, is_training=True)
train_loader = DataLoader(dataset=train_data, batch_size=8, shuffle=True)

In [None]:
# Fine-tune every model parameter with AdamW for a fixed number of epochs.
epochs = 2
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing labels makes the model return the training loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # One optimisation step, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # The reported value is the sum of the per-batch losses, not their mean.
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}')

In [None]:
# Wrap the test frame (no labels) and serve it in order, in batches of 8,
# so predictions line up with the rows of test_df.
test_data = PromptData(dataframe=test_df, tokenizer=tokenizer, is_training=False)
test_loader = DataLoader(dataset=test_data, batch_size=8, shuffle=False)

In [None]:
# Generate one prompt per test row with gradient tracking disabled.
model.eval()
predicted_prompts = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask')
        )

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        # Decode the whole batch of generated sequences, dropping special tokens.
        predicted_prompts += tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Attach the predictions to the test frame in the same row order.
test_df['rewrite_prompt'] = predicted_prompts

In [None]:
# Write only the id and predicted-prompt columns to the submission file, without the index.
test_df.to_csv('submission.csv', columns=['id', 'rewrite_prompt'], index=False)

In [None]:
# Evaluate the predicted prompts as a plain Python list (for inspection).
test_df['rewrite_prompt'].tolist()