In [17]:
import pandas as pd
import torch
from torch import optim, nn
from torch.utils.data import Dataset, random_split, DataLoader
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
import timeit
import pathlib
from tqdm import tqdm 
import timeit

In [5]:
# Only let error-level messages through from the transformers logger.
from transformers.utils.logging import set_verbosity_error
set_verbosity_error()

# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Placeholder: point this at the directory that holds the CSV files;
# the notebook's outputs are written to the same place.
BASE_PATH = "YOUR_PATH_HERE"

# Excerpts are truncated to at most this many tokens by the tokenizer.
MAX_LENGTH = 256

# Local checkpoint location, not used: the pretrained weights are fetched by name instead.
# MODEL_PATH = pathlib.Path(BASE_PATH, "input/transformers/bert-base-uncased")

# Samples per batch for every dataloader.
BATCH_SIZE = 32

# Learning rate handed to the optimizer (the usual choice).
LEARNING_RATE = 2e-5

# Kept small on purpose: the final score is not the point here and each epoch is slow.
EPOCHS = 2

In [7]:
# Read the labelled training excerpts and preview the first row.
train_df = pd.read_csv(pathlib.Path(BASE_PATH) / "train.csv")
train_df.head(n=1)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009


In [8]:
# Read the unlabelled test excerpts and preview the first row.
test_df = pd.read_csv(pathlib.Path(BASE_PATH) / "test.csv")
test_df.head(n=1)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...


In [10]:
# Read the sample submission to see the expected output columns (id, target).
submission_df = pd.read_csv(pathlib.Path(BASE_PATH) / "sample_submission.csv")
submission_df.head(n=1)

Unnamed: 0,id,target
0,c0f722661,0.0


In [12]:
# Fetch the pretrained uncased BERT tokenizer and a sequence-classification model
# built on the same checkpoint. num_labels=1 gives the head a single output.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
# Place the weights on the selected device before any batch is fed through.
model = model.to(device)

Downloading (…)/main/tokenizer.json: 100%|████| 466k/466k [00:00<00:00, 116MB/s]
Downloading model.safetensors: 100%|█████████| 440M/440M [00:38<00:00, 11.4MB/s]


In [13]:
class ComplexityDataset(Dataset):
    """
    Custom dataset class for complexity prediction.

    All sentences are tokenized once, at construction time, and padded to a
    common length; individual samples are converted to tensors on access.

    Args:
        sentences (list of str): List of input sentences.
        targets (list of float): List of target complexity scores.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer for encoding sentences.
        max_length (int, optional): Maximum number of tokens kept per sentence.
            When omitted, the notebook-level ``MAX_LENGTH`` constant is used.

    Attributes:
        encodings (dict): Encoded representations of sentences.
        targets (list of float): List of target complexity scores.

    Methods:
        __getitem__(self, idx): Returns a dictionary containing encoded input and target.
        __len__(self): Returns the number of samples in the dataset.
    """

    def __init__(self, sentences, targets, tokenizer, max_length=None):
        """
        Initialize the ComplexityDataset.

        Args:
            sentences (list of str): List of input sentences.
            targets (list of float): List of target complexity scores.
            tokenizer (transformers.PreTrainedTokenizer): Tokenizer for encoding sentences.
            max_length (int, optional): Truncation length passed to the tokenizer.
                Defaults to the notebook-level ``MAX_LENGTH``.
        """
        if max_length is None:
            # Looked up at call time so the notebook constant stays the default
            # without being baked into the signature.
            max_length = MAX_LENGTH
        self.encodings = tokenizer(sentences, padding=True, truncation=True, max_length=max_length)
        self.targets = targets

    def __getitem__(self, idx):
        """
        Get the encoded input and target at the specified index.

        Args:
            idx (int): Index of the sample.

        Returns:
            dict: One tensor per tokenizer output key, plus a float tensor
            under the key "targets".
        """
        out_dict = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        out_dict["targets"] = torch.tensor(self.targets[idx], dtype=torch.float)
        return out_dict

    def __len__(self):
        """
        Get the total number of samples in the dataset.

        Returns:
            int: Number of samples in the dataset (the number of targets).
        """
        return len(self.targets)

In [14]:
class ComplexitySubmitDataset(Dataset):
    """
    Custom dataset class for complexity prediction on UNSEEN data (SUB).

    All sentences are tokenized once, at construction time, and padded to a
    common length; each sample carries its identifier instead of a target.

    Args:
        sentences (list of str): List of input sentences.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer for encoding sentences.
        ids (list of str): List of identifiers for the data samples.
        max_length (int, optional): Maximum number of tokens kept per sentence.
            When omitted, the notebook-level ``MAX_LENGTH`` constant is used.

    Attributes:
        ids (list of str): List of identifiers for the data samples.
        encodings (dict): Encoded representations of sentences.

    Methods:
        __getitem__(self, idx): Returns a dictionary containing encoded input and identifiers.
        __len__(self): Returns the number of samples in the dataset.
    """

    def __init__(self, sentences, tokenizer, ids, max_length=None):
        """
        Initialize the ComplexitySubmitDataset.

        Args:
            sentences (list of str): List of input sentences.
            tokenizer (transformers.PreTrainedTokenizer): Tokenizer for encoding sentences.
            ids (list of str): List of identifiers for the data samples.
            max_length (int, optional): Truncation length passed to the tokenizer.
                Defaults to the notebook-level ``MAX_LENGTH``.
        """
        if max_length is None:
            # Looked up at call time so the notebook constant stays the default
            # without being baked into the signature.
            max_length = MAX_LENGTH
        self.ids = ids
        self.encodings = tokenizer(sentences, padding=True, truncation=True, max_length=max_length)

    def __getitem__(self, idx):
        """
        Get the encoded input and identifier at the specified index.

        Args:
            idx (int): Index of the sample.

        Returns:
            dict: One tensor per tokenizer output key, plus the raw identifier
            under the key "ids".
        """
        out_dict = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        out_dict["ids"] = self.ids[idx]
        return out_dict

    def __len__(self):
        """
        Get the total number of samples in the dataset.

        Returns:
            int: Number of samples in the dataset (the number of identifiers).
        """
        return len(self.ids)

In [15]:
# Tokenize the labelled excerpts (with their targets) and the unlabelled
# test excerpts (with their ids) up front.
dataset = ComplexityDataset(
    sentences=train_df["excerpt"].to_list(),
    targets=train_df["target"].to_list(),
    tokenizer=tokenizer,
)
test_dataset = ComplexitySubmitDataset(
    sentences=test_df["excerpt"].to_list(),
    tokenizer=tokenizer,
    ids=test_df["id"].to_list(),
)

# 90% of the labelled samples (rounded down) go to training; the remainder
# is held out for validation.
num_samples = len(dataset)
train_size = int(0.9 * num_samples)
val_size = num_samples - train_size

# A seeded generator keeps the split identical from run to run.
generator = torch.Generator().manual_seed(42)

train_dataset, val_dataset = random_split(
    dataset=dataset,
    lengths=[train_size, val_size],
    generator=generator,
)

In [16]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

# Validation only measures the loss, so the order is fixed: shuffling here
# would change which samples fall into the (smaller) last batch on every pass
# and draw from the global RNG for no benefit.
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

# Test order is fixed as well, so predictions line up with the ids as read.
test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

In [18]:
# Initialize the optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Per-epoch history, so the values printed below are also available as data
train_losses = []
val_losses = []

# Measure training time
start = timeit.default_timer()

# Training loop
for epoch in tqdm(range(EPOCHS), position=0, leave=True):
    model.train()  # Set the model to training mode
    train_running_loss = 0.0
    train_seen = 0

    # Iterate through training batches
    for sample in tqdm(train_dataloader, position=0, leave=True):
        input_ids = sample['input_ids'].to(device)
        attention_mask = sample['attention_mask'].to(device)
        targets = sample["targets"].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return its loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss

        # Backpropagation and optimizer step
        loss.backward()
        optimizer.step()

        # outputs.loss is a per-batch mean, so weight it by the batch size:
        # the last batch is usually smaller and would otherwise be over-weighted.
        n_batch = targets.size(0)
        train_running_loss += loss.item() * n_batch
        train_seen += n_batch

    # Per-sample average training loss for the epoch (NaN if the loader was empty)
    train_loss = train_running_loss / train_seen if train_seen else float("nan")

    model.eval()  # Set the model to evaluation mode
    val_running_loss = 0.0
    val_seen = 0

    # Evaluate on validation data without tracking gradients
    with torch.no_grad():
        for sample in tqdm(val_dataloader, position=0, leave=True):
            input_ids = sample['input_ids'].to(device)
            attention_mask = sample['attention_mask'].to(device)
            targets = sample["targets"].to(device)

            # Forward pass and compute loss for validation
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

            n_batch = targets.size(0)
            val_running_loss += outputs.loss.item() * n_batch
            val_seen += n_batch

    # Per-sample average validation loss for the epoch (NaN if the loader was empty)
    val_loss = val_running_loss / val_seen if val_seen else float("nan")

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Print epoch results
    print("-"*30)
    print(f"Train Loss EPOCH {epoch+1}: {train_loss:.4f}")
    print(f"Valid Loss EPOCH {epoch+1}: {val_loss:.4f}")
    print("-"*30)

# Calculate total training time
stop = timeit.default_timer()
print(f"Training Time: {stop-start:.2f}s")

100%|███████████████████████████████████████████| 80/80 [10:23<00:00,  7.80s/it]
100%|█████████████████████████████████████████████| 9/9 [00:19<00:00,  2.22s/it]
 50%|██████████████████████                      | 1/2 [10:43<10:43, 643.63s/it]

------------------------------
Train Loss EPOCH 1: 0.6184
Valid Loss EPOCH 1: 0.3126
------------------------------


100%|███████████████████████████████████████████| 80/80 [10:20<00:00,  7.76s/it]
100%|█████████████████████████████████████████████| 9/9 [00:20<00:00,  2.26s/it]
100%|████████████████████████████████████████████| 2/2 [21:24<00:00, 642.26s/it]

------------------------------
Train Loss EPOCH 2: 0.3031
Valid Loss EPOCH 2: 0.3905
------------------------------
Training Time: 1284.46s





In [19]:
# torch.cuda.empty_cache()

In [20]:
# Initialize empty lists to store predictions and IDs
preds = []
ids = []

# Set the model to evaluation mode
model.eval()

# Perform inference on the test dataset without tracking gradients
with torch.no_grad():
    # Iterate through batches in the test data loader
    for sample in tqdm(test_dataloader, position=0, leave=True):
        # Move input data to the appropriate device (GPU or CPU)
        input_ids = sample['input_ids'].to(device)
        attention_mask = sample['attention_mask'].to(device)

        # Extend the 'ids' list with the IDs from the current batch
        ids.extend(sample["ids"])

        # Forward pass through the model to obtain predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Flatten the logits to one value per sample and convert to Python floats.
        # reshape(-1) is used instead of squeeze(): squeeze() turns a batch of
        # size 1 into a 0-d tensor, which cannot be iterated.
        preds.extend(outputs["logits"].reshape(-1).tolist())

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.84it/s]


In [22]:
# Pair every test id with its prediction and write the result as a CSV
# with the columns id and target (no index column).
SUB_PATH = pathlib.Path(BASE_PATH) / "submission.csv"
submission_df = pd.DataFrame(
    data=list(zip(ids, preds)),
    columns=["id", "target"],
)
submission_df.to_csv(path_or_buf=SUB_PATH, index=False)

In [23]:
# Persist the fine-tuned model into BASE_PATH
model.save_pretrained(save_directory=BASE_PATH)

In [24]:
import json

# Per-epoch losses, copied by hand from the epoch summaries printed during
# training; they have to be updated manually whenever training is rerun.
metrics = dict(
    train_loss=[0.6184, 0.3031],  # one entry per epoch
    valid_loss=[0.3126, 0.3905],  # one entry per epoch
)

# Destination of the metrics file, next to the other outputs
metrics_save_path = pathlib.Path(BASE_PATH) / "metrics.json"

# Serialize the metrics as JSON
with metrics_save_path.open('w') as metrics_file:
    json.dump(metrics, metrics_file)