In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
import datasets
from transformers import AutoModel, AutoTokenizer
import wandb

In [None]:
# Show the installed NumPy version as the cell output (recorded for reproducibility).
np.__version__

In [None]:
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; needed later because training is configured
# with push_to_hub=True and the notebook calls trainer.push_to_hub().
notebook_login()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a machine without CUDA, so only query it when a
# GPU is present. The last expression is what the cell displays.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
# Central run configuration: W&B project/entity, an identity mapping over the
# six class ids 0..5, and the names of the raw / processed dataset artifacts.
params = dict(
    WANDB_PROJECT='review_classifier',
    ENTITY='lilouuch',
    CLASSES=dict(enumerate(range(6))),
    RAW_DATA_AT='Goodreads_Books_Review_Rating',
    PROCESSED_DATA_AT='Goodreads_Books_Review_Rating_load',
)

In [None]:
# Start a W&B run of type "training" in the configured project/entity. `run` is
# used below to fetch the dataset artifact and is closed with run.finish().
run =  wandb.init(project=params['WANDB_PROJECT'], entity=params['ENTITY'], job_type="training")

In [None]:
# Declare the latest version of the train/validation split artifact as an input of this run.
raw_data_at = run.use_artifact("Goodreads_Books_Review_Rating_VAL:latest")

In [None]:
# Fetch the artifact's files to local disk; the returned download location is
# shown as the cell output.
raw_data_at.download()

In [None]:
# Read the split file from the directory that download() reports rather than a
# hardcoded '/kaggle/working/artifacts/...:v2' path: the artifact is requested
# as ':latest', so a pinned version directory goes stale as soon as a new
# version is logged, and the absolute path only exists on Kaggle.
# NOTE(review): calling download() again is expected to reuse files that are
# already present locally — confirm against the wandb version in use.
train_df = pd.read_csv(f"{raw_data_at.download()}/train_val_split.csv")

In [None]:
# The tokenized dataset is later formatted with a 'label' column, so rename the
# rating column accordingly.
train_df = train_df.rename(columns={'rating': 'label'})

In [None]:
# Fold 1 is held out for validation; every other fold is used for training.
# NOTE: train_df is overwritten here, so re-running this cell on its own yields
# an empty validation frame.
in_validation_fold = train_df['fold'] == 1
valid_df = train_df[in_validation_fold]
train_df = train_df[~in_validation_fold]

In [None]:
# Wrap each pandas split as a Hugging Face Dataset, then bundle both under the
# split names "train" and "val".
train_ds, valid_ds = (Dataset.from_pandas(frame) for frame in (train_df, valid_df))
data_all_splits = datasets.DatasetDict({"train": train_ds, "val": valid_ds})

In [None]:
# Load the prajjwal1/bert-medium checkpoint with a 6-way classification head
# (num_labels=6 matches the six entries in params['CLASSES']).
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-medium',num_labels=6)

In [None]:
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, which raises on a machine without a GPU.
model.to(device)

In [None]:
# Tokenizer from the same checkpoint as the model so vocabulary and ids match.
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-medium')

In [None]:
import re

def clean_text_row(text):
    """Strip URLs, e-mail addresses, phone numbers and stray symbols from one review.

    Args:
        text: Raw review text. ``None`` (a missing review) is treated as empty.

    Returns:
        The cleaned string. Matched spans are deleted rather than replaced, so
        the whitespace that surrounded them is left as it was.
    """
    if text is None:
        return ''

    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Remove emails. The TLD class is [A-Za-z]; the earlier [A-Z|a-z] also
    # accepted a literal '|' as part of the address.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove phone numbers (formats: xxx-xxxx, xxx-xxx-xxxx, (xxx)xxx-xxxx, xxxxxxxxxx).
    # The parenthesised area code needs its own alternative because '(' is not
    # preceded by a word boundary.
    text = re.sub(r'(?:\(\d{3}\)\s*|\b(?:\d{3}[-.]?)?)\d{3}[-.]?\d{4}\b', '', text)

    # Keep only word characters, whitespace and basic punctuation (! . , ; ? ' " -).
    text = re.sub(r'[^\w\s!.,;?\'"\-]', '', text)

    return text

In [None]:
def clean_text(example):
    """Add a 'review_text_clean' field holding the cleaned 'review_text'.

    The row is updated in place and returned, which is the shape expected by
    ``Dataset.map`` when it is called without ``batched=True``.
    """
    cleaned = clean_text_row(example['review_text'])
    example['review_text_clean'] = cleaned
    return example

In [None]:
# Apply clean_text row by row to every split, adding the 'review_text_clean' column.
data_all_splits=data_all_splits.map(clean_text)

In [None]:
def preprocess(data):
    """Tokenize a batch of reviews for the BERT classifier.

    Reads the 'review_text_clean' column produced by ``clean_text`` so the
    cleaning step actually reaches the model (tokenizing 'review_text' would
    silently discard it).

    Every sequence is truncated/padded to exactly 512 tokens. ``padding=True``
    would pad only to the longest row of each ``map`` batch, so rows coming from
    different batches could have different lengths and fail to stack at
    training time.

    Args:
        data: A batch of rows (column name -> list of values), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        data['review_text_clean'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )

In [None]:
# Tokenize both splits; batched=True hands preprocess a batch of rows per call.
tokenized_datasets = data_all_splits.map(preprocess, batched=True)

In [None]:
# Display the splits with their columns and row counts as a sanity check.
tokenized_datasets

In [None]:
# Return only the model inputs and the label, as torch tensors, when rows are accessed.
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Per-device batch size (used for both training and evaluation) and the number
# of training epochs; both are passed to TrainingArguments below.
batch_size = 32
epochs = 2

In [None]:
# Optimizer schedule settings forwarded to TrainingArguments.
warmup_steps = 1000 # number of initial steps over which the learning rate is warmed up
weight_decay = 0.0001# weight-decay (regularization) coefficient

In [None]:
%env WANDB_LOG_MODEL=true

In [None]:
import os
# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably to avoid the tokenizers fork/parallelism warning
# during training — confirm intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from datasets import load_metric
# Metric objects used by compute_metrics.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer
# `datasets` releases in favour of the separate `evaluate` package — confirm
# against the installed version before upgrading.
f1_score_metric = load_metric('f1')
accuracy_metric= load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute macro-F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            taken as the argmax over axis 1 of ``predictions``.

    Returns:
        ``{"f1": <macro F1>, "accuracy": <accuracy>}``.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    f1_result = f1_score_metric.compute(
        predictions=predicted_ids, references=labels, average="macro"
    )
    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids, references=labels
    )
    return {"f1": f1_result["f1"], "accuracy": accuracy_result["accuracy"]}


In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Outputs, logging and publishing
    output_dir='Goodreads_Books_Reviews_med2_50',
    logging_dir='./logs',
    logging_steps=100,
    report_to='wandb',
    push_to_hub=True,
    # Optimisation
    num_train_epochs=epochs,
    learning_rate=3e-5,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best "f1" as reported by compute_metrics.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# Passing the tokenizer makes the Trainer save it alongside each checkpoint, so
# the repository created via push_to_hub contains the tokenizer files as well as
# the weights and can be loaded on its own. It also makes the Trainer's default
# collator pad batches with this tokenizer.
# NOTE(review): newer transformers releases rename this argument to
# `processing_class`; `tokenizer=` matches the API generation this notebook
# already targets (`evaluation_strategy`).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the arguments above (per-epoch evaluation and checkpointing).
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login performed earlier).
trainer.push_to_hub()


In [None]:
# Close the W&B run started with wandb.init().
run.finish()