## Finetune Model

In [1]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import pandas as pd
import random
import torch

**Read data:**

In [2]:
# Read the pre-split training and validation CSVs in one pass, then preview
# the first training rows as the cell's output.
train_data, val_data = map(pd.read_csv, ("./data/train_set.csv", "./data/val_set.csv"))
train_data.head()

Unnamed: 0.1,Unnamed: 0,#1 String,#2 String,label
0,3272,"At first blush, then, the distinction drawn by...","At first blush, then, the creators of the regi...",1
1,3671,"This is unacceptable,"" said Patrick Pelloux, t...",Two CIA operatives have been killed in an ambu...,0
2,5553,Yunos allegedly prepared the bombs' wiring whi...,"Yunos prepared the bombs' wiring, and Ghozi ad...",1
3,2929,But this could happen with any document format...,Mrs. Clinton said she was incredulous that he ...,0
4,2614,As they wound through police barricades to the...,"""It is highly conditional, faces significant r...",0


In [3]:
# Keep only the first 100 training rows; everything downstream trains on this
# subset. NOTE(review): presumably a quick-run limit — remove to train on the full set.
train_data = train_data.iloc[:100]

**Convert to dataset:**

In [4]:
class ParaphraseDataset(Dataset):
    """Dataset that tokenizes sentence pairs from a DataFrame on demand.

    Each item comes from one row of ``data``: the ``'#1 String'`` and
    ``'#2 String'`` columns are tokenized together as a pair, and the
    ``'label'`` column becomes the classification target.

    Args:
        data: DataFrame with ``'#1 String'``, ``'#2 String'`` and ``'label'`` columns.
        tokenizer: Callable invoked as ``tokenizer(sentence1, sentence2, ...)`` that
            returns a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading batch dimension of size 1.
        max_len: Length every encoded pair is truncated or padded to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the pair at positional index ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (each with the
            leading batch dimension removed) and ``'labels'`` (a ``torch.long``
            scalar tensor).
        """
        # Look the row up once instead of once per column.
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            str(row['#1 String']),
            str(row['#2 String']),
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors='pt'
        )

        # NOTE(review): any 'token_type_ids' produced by the tokenizer are not
        # returned here, so the model never sees explicit segment ids — confirm
        # that this is intended for sentence-pair input.
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(row['label']), dtype=torch.long)
        }

In [5]:
# Fixed length that every encoded sentence pair is truncated or padded to.
max_len = 128

# Tokenizer loaded from the 'bert-base-uncased' pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [6]:
# Training split: wrap the frame as a dataset and serve it in shuffled batches of 16.
train_dataset = ParaphraseDataset(data=train_data, tokenizer=tokenizer, max_len=max_len)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Validation split: same batch size, no shuffling requested.
val_dataset = ParaphraseDataset(data=val_data, tokenizer=tokenizer, max_len=max_len)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

**Load Model:**

In [7]:
# Load the pretrained encoder with a 2-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Pick the compute device. CUDA and MPS are probed separately: CUDA being
# available says nothing about MPS, so each backend is selected only by its
# own availability check, with CPU as the fallback.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: older torch builds have no torch.backends.mps module.
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

# Create the optimizer once the model sits on its final device.
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Train model:**

In [8]:
# Fine-tune for a fixed number of epochs and report the mean batch loss per epoch.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        # Move the batch onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are passed in so the loss can be read straight off the outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Save the fine-tuned model and tokenizer once, after the final epoch.
save_path = "./models/bert_paraphrase_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved at {save_path}")

Epoch 1/3: 100%|██████████| 7/7 [00:16<00:00,  2.33s/it]


Epoch 1/3, Loss: 0.6590963346617562


Epoch 2/3: 100%|██████████| 7/7 [00:15<00:00,  2.18s/it]


Epoch 2/3, Loss: 0.542468249797821


Epoch 3/3: 100%|██████████| 7/7 [00:15<00:00,  2.16s/it]


Epoch 3/3, Loss: 0.3848019071987697
Model saved at ./models/bert_paraphrase_model
