In [1]:
#this file trains the model on the original dataset (all genres)
import torch
import datasets
import transformers
import pandas as pd
import numpy as np
from torch.nn import BCEWithLogitsLoss
from transformers import BigBirdTokenizer, \
BigBirdForSequenceClassification, Trainer, TrainingArguments,EvalPrediction, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import random

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load your dataset
df = pd.read_csv('scripts.csv')

df=df[['script', 'imdb user rating']]
df.head()

Unnamed: 0,script,imdb user rating
0,A NIGHT AT THE ROXBURY written by Steve Ko...,6
1,AT FIRST SIGHTEXT. VALLEY - DUSK Gold light da...,6
2,BamboozledbySpike LeeBLACK SCREENWe HEAR the v...,6
3,THE BIG LEBOWSKIWe are floating up a steep scr...,8
4,Boys on the SideSCENE 1JANEThank you. I'm Jan...,6


In [6]:
dataset = datasets.Dataset.from_pandas(df)

# Split the dataset into train (80%) and temporary (20%)
train_testvalid = dataset.train_test_split(test_size=0.2)

# Split the temporary dataset into validation and test datasets (50% each)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Prepare the final DatasetDict
final_datasets = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

In [7]:
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['script'], padding="max_length", truncation=True, max_length=4096)

# Apply the tokenize function to all splits
final_datasets = final_datasets.map(tokenize_function, batched=True)

Map: 100%|██████████| 1907/1907 [03:09<00:00, 10.08 examples/s]
Map: 100%|██████████| 238/238 [00:22<00:00, 10.79 examples/s]
Map: 100%|██████████| 239/239 [00:23<00:00, 10.14 examples/s]


In [8]:
final_datasets.save_to_disk('data_train_test_val')

Saving the dataset (1/1 shards): 100%|██████████| 1907/1907 [00:00<00:00, 3028.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 238/238 [00:00<00:00, 3058.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 239/239 [00:00<00:00, 3245.77 examples/s]


In [2]:
from datasets import load_from_disk

# Load datasets from disk
final_datasets = load_from_disk('data_train_test_val')

In [9]:
from datasets import load_metric

accuracy_metric = load_metric('accuracy')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')
f1_metric = load_metric('f1')

#function to compute the metrics of the model
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    prec = precision_metric.compute(predictions=predictions, references=labels, average='macro')
    rec = recall_metric.compute(predictions=predictions, references=labels, average='macro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
        'f1': f1['f1']
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [10]:
from transformers import BigBirdForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base', num_labels=4)  # Adjust num_labels accordingly

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_datasets['train'],
    eval_dataset=final_datasets['validation'],
    compute_metrics=compute_metrics  # Assuming compute_metrics function is defined as shown earlier
)

# Train the model
trainer.train()


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


: 

In [None]:
results = trainer.evaluate(final_datasets['test'])
print(results)