In [1]:
#this file trains the model on the dataset that has filtered genres
import os

# Set the environment variable for PyTorch CUDA allocator
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
import datasets
import transformers
import pandas as pd
import numpy as np
from torch.nn import BCEWithLogitsLoss
from transformers import BigBirdTokenizer 
import random

In [2]:
# Load your dataset
df = pd.read_csv('/home/km947/ddp/filtered_scripts.csv')

df=df[['script', 'imdb user rating']]
df['imdb user rating'] = df['imdb user rating']-5
df.columns = ['text', 'labels']
df.head()

Unnamed: 0,text,labels
0,A NIGHT AT THE ROXBURY written by Steve Ko...,1
1,AT FIRST SIGHTEXT. VALLEY - DUSK Gold light da...,1
2,BamboozledbySpike LeeBLACK SCREENWe HEAR the v...,1
3,THE BIG LEBOWSKIWe are floating up a steep scr...,3
4,Boys on the SideSCENE 1JANEThank you. I'm Jan...,1


In [4]:
dataset = datasets.Dataset.from_pandas(df)

# Split the dataset into train (80%) and temporary (20%)
train_testvalid = dataset.train_test_split(test_size=0.2)

# Split the temporary dataset into validation and test datasets (50% each)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Prepare the final DatasetDict
final_datasets = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

In [5]:
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=4096)
    #return tokenizer(examples['script'], padding="max_length")

# Apply the tokenize function to all splits
final_datasets = final_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1180 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

In [6]:
#save datasets the first time they're created
final_datasets.save_to_disk('/home/km947/ddp/data_train_test_val_filtered')

Saving the dataset (0/1 shards):   0%|          | 0/1180 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/147 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/148 [00:00<?, ? examples/s]

In [2]:
from datasets import load_from_disk

# Load datasets every time after the first
final_datasets = load_from_disk('/home/km947/ddp/data_train_test_val_filtered')

In [3]:
final_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1180
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 147
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 148
    })
})

In [4]:
from datasets import load_metric

accuracy_metric = load_metric('accuracy')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')
f1_metric = load_metric('f1')

#function to compute the metrics of the model
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    prec = precision_metric.compute(predictions=predictions, references=labels, average='macro')
    rec = recall_metric.compute(predictions=predictions, references=labels, average='macro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
        'f1': f1['f1']
    }

  accuracy_metric = load_metric('accuracy')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [7]:
from transformers import BigBirdForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base', num_labels=4)  # Adjust num_labels accordingly

# Set up training arguments
training_args = TrainingArguments(
    output_dir='/home/km947/ddp/results',
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=3
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_datasets['train'],
    eval_dataset=final_datasets['validation'],
    compute_metrics=compute_metrics  # Assuming compute_metrics function is defined as shown earlier
)

# Train the model
trainer.train()


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.231783,0.44898,0.112245,0.25,0.15493
2,1.255400,1.248046,0.44898,0.112245,0.25,0.15493
3,1.255400,1.294898,0.319728,0.079932,0.25,0.121134
4,1.228700,1.233604,0.44898,0.112245,0.25,0.15493
5,1.228700,1.22807,0.44898,0.112245,0.25,0.15493
6,1.219500,1.271441,0.44898,0.112245,0.25,0.15493
7,1.218000,1.234598,0.44898,0.112245,0.25,0.15493
8,1.218000,1.225661,0.44898,0.112245,0.25,0.15493
9,1.203700,1.223657,0.44898,0.112245,0.25,0.15493
10,1.203700,1.224218,0.44898,0.112245,0.25,0.15493


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2950, training_loss=1.222094685182733, metrics={'train_runtime': 7027.2885, 'train_samples_per_second': 1.679, 'train_steps_per_second': 0.42, 'total_flos': 2.5009399431168e+16, 'train_loss': 1.222094685182733, 'epoch': 10.0})

In [8]:
results = trainer.evaluate(final_datasets['test'])
print(results)

{'eval_loss': 1.1200023889541626, 'eval_accuracy': 0.5472972972972973, 'eval_precision': 0.13682432432432431, 'eval_recall': 0.25, 'eval_f1': 0.17685589519650652, 'eval_runtime': 120.3429, 'eval_samples_per_second': 1.23, 'eval_steps_per_second': 0.307, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
torch.cuda.empty_cache()