# M2177.003100 Deep Learning Assignment #1<br> Part 1-4. Fine-tune BERT (PyTorch)

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. Written by JunYong Ahn, September 2023

**To understand this assignment, please read the provided PDF file carefully.**

In this notebook, you will learn how to fine-tune a lightweight BERT variant for text classification using Hugging Face's transformers library. BERT (Bidirectional Encoder Representations from Transformers) is a groundbreaking model in the NLP domain.  <br>
There are **2 sections**; in each section, follow the instructions to complete the skeleton code.

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge of the related materials. However, if you really feel that something essential is missing and you cannot proceed to the next step, contact the teaching staff with a clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the final outputs**</font> so that TAs can grade both your code and results.

### Some helpful tutorials and references for assignment #1-4:
- [1] BERT original paper (Devlin et al., 2018). [[link]](https://arxiv.org/abs/1810.04805)
- [2] Tutorials about BERT [[link]](https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b)

### Check virtual env and import packages

In [None]:
import os
assert os.environ["CONDA_DEFAULT_ENV"] == "deep-learning-23", "current environment is not deep-learning-23"
!python3 -m pip install pandas
!python3 -m pip install transformers

import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# IPython magic: set CUDA_VISIBLE_DEVICES to 0 for this kernel process, so
# only the GPU with index 0 is exposed to CUDA.
%env CUDA_VISIBLE_DEVICES = 0

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Finetune BERT

### Prepare dataset

link : https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

1. Download the dataset from the link above.
2. Move the downloaded zip file into the "data" directory and unzip it there.
3. Run the following cell.

In [None]:
def load_imdb_data(data_file_path):
    """Read the IMDB review CSV and return its texts and binary labels.

    Args:
        data_file_path: Path to a CSV file with a 'review' column and a
            'sentiment' column.

    Returns:
        A ``(texts, labels)`` tuple of two lists of equal length. ``texts``
        holds the values of the 'review' column; ``labels`` holds 1 where the
        sentiment is exactly "positive" and 0 for every other value.

    Raises:
        FileNotFoundError: If ``data_file_path`` does not exist.
    """
    # Guard clause: report a missing file before attempting to parse it.
    if not os.path.exists(data_file_path):
        raise FileNotFoundError(f"The file '{data_file_path}' does not exist.")

    frame = pd.read_csv(data_file_path)
    reviews = frame['review'].tolist()
    sentiments = frame['sentiment'].tolist()
    # Anything other than the exact string "positive" is treated as class 0.
    targets = [int(value == "positive") for value in sentiments]
    return reviews, targets

# Location of the unzipped dataset CSV, relative to the notebook's working directory.
data_file_path = './data/IMDB Dataset.csv'
# Parallel lists: review strings and their 0/1 sentiment labels.
texts, labels = load_imdb_data(data_file_path)

### Define Dataset class

In [None]:
class CustomTextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw text strings.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            whose result can be indexed with 'input_ids' and
            'attention_mask' to obtain tensors.
        max_seq_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_seq_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for item ``idx``."""
        raw_text, target = self.texts[idx], self.labels[idx]

        tokenizer_options = {
            'max_length': self.max_seq_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # flatten() turns each tokenizer output into a 1-D tensor
        # (presumably dropping a leading batch dimension of size 1 -- confirm
        # against the tokenizer in use).
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

### Classifier head for BERT (design your model's prediction head)

In [None]:
class CustomBERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by a classification head (skeleton).

    The head is intentionally left unimplemented: the two TO-DO regions must
    define the head's layers in ``__init__`` and apply them in ``forward``.

    Args:
        bert_model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``.
        num_classes: Number of target classes; not used by the skeleton
            itself, so it is available for sizing the head in the TO-DO.
    """

    def __init__(self, bert_model_name, num_classes):
        super(CustomBERTClassifier, self).__init__()
        # Load the pretrained encoder weights.
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Define the prediction-head layers between the TO-DO markers below.
        ######################## TO-DO ########################

        
        
        ######################## TO-DO ########################

    def forward(self, input_ids, attention_mask):
        """Encode a batch with BERT and return the head's output.

        NOTE: ``logits`` is not assigned anywhere in this skeleton. Until the
        TO-DO region computes it (from ``pooled_output``), calling this
        method raises ``NameError``. The training and evaluation code in this
        notebook passes the return value to ``nn.CrossEntropyLoss`` and to
        ``torch.max(..., dim=1)``, so it is expected to hold one score per
        class for each example.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Pooled, per-sequence representation exposed by the BERT model output.
        pooled_output = outputs.pooler_output
        ######################## TO-DO ########################

        
        
        ######################## TO-DO ########################
        return logits

### Training and evaluation function

In [None]:
def train_model(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is fed to cross-entropy together with the labels.
        data_loader: Iterable of batches; each batch is indexed with
            'input_ids', 'attention_mask' and 'label' to obtain tensors.
        optimizer: Optimizer updating the model's parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss over the batches of this pass, or NaN
        when ``data_loader`` yields no batches. Existing callers that ignore
        the return value are unaffected.
    """
    model.train()
    # The criterion holds no per-batch state, so build it once rather than
    # once per iteration.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    num_batches = 0
    for batch in tqdm(data_loader, desc="Train"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate a detached tensor so the running total neither keeps the
        # autograd graph alive nor forces a host sync on every step.
        loss_sum = loss_sum + loss.detach()
        num_batches += 1
    if num_batches == 0:
        return float("nan")
    return float(loss_sum) / num_batches
        
def evaluate_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            returning one score per class for each example.
        data_loader: Iterable of batches; each batch is indexed with
            'input_ids', 'attention_mask' and 'label' to obtain tensors.
        device: Device the model inputs are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as returned by scikit-learn's
        ``accuracy_score`` and ``classification_report`` for the collected
        labels and predictions.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the highest score per row.
            preds = outputs.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so read them directly
            # instead of copying them to the device and back.
            actual_labels.extend(batch['label'].tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

## 2. Train your model

In [None]:
# Set up parameters
# Hint: generally, less than 10 epochs will be enough.
# Checkpoint name used for both the tokenizer and the encoder.
bert_model_name = 'bert-base-uncased'
# Two classes: label 1 for "positive" reviews, label 0 for everything else.
num_classes = 2
# The four values below are intentionally left blank; this cell does not run
# until each one is assigned.
#   max_seq_length: padding/truncation length used by the dataset's tokenizer call
#   batch_size:     batch size of the train/validation/test DataLoaders
#   num_epochs:     number of passes of the training loop (also sizes the LR schedule)
#   learning_rate:  `lr` passed to the optimizer
######################## TO-DO ########################
max_seq_length = 
batch_size = 
num_epochs = 
learning_rate = 
######################## TO-DO ########################

### Define data utils

In [None]:
######################## DO NOT CHANGE ########################
# Two-stage split with a fixed seed: 60% train first, then the held-out 40%
# is halved into validation and test (20% each).
train_texts, val_texts, train_labels, val_labels = \
train_test_split(texts, labels, test_size=0.4, random_state=42)
val_texts, test_texts, val_labels, test_labels = \
train_test_split(val_texts, val_labels, test_size=0.5, random_state=42)
######################## DO NOT CHANGE ########################

# One tokenizer instance is shared by all three datasets.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = CustomTextClassificationDataset(train_texts, train_labels, tokenizer, max_seq_length)
val_dataset = CustomTextClassificationDataset(val_texts, val_labels, tokenizer, max_seq_length)
test_dataset = CustomTextClassificationDataset(test_texts, test_labels, tokenizer, max_seq_length)

# Only the training loader requests shuffling; the validation and test
# loaders do not pass `shuffle`.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

### Define model, optimizer, and scheduler

In [None]:
# Build the classifier and move its parameters to the selected device.
model = CustomBERTClassifier(bert_model_name, num_classes).to(device)
# NOTE(review): `AdamW` here is the one imported from `transformers`; that
# class has been deprecated upstream in favour of `torch.optim.AdamW` --
# confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train_model steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warmup.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train for num_epochs passes, reporting validation metrics after each pass.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_model(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate_model(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

# evaluation
# Final metrics on the held-out test split, computed once after training.
accuracy, report = evaluate_model(model, test_dataloader, device)
print(f"Test Accuracy: {accuracy:.4f}")
print(report)