In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
import datasets

### Note
To use a different model, import the matching tokenizer and model classes instead:
* RoBERTa: `RobertaTokenizer`, `RobertaForSequenceClassification`
* ALBERT: `AlbertTokenizer`, `AlbertForSequenceClassification`
* `AutoModelForSequenceClassification`: for more details, see the [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForSequenceClassification)

In [3]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name() raises on a CPU-only runtime, so only query it when at
# least one GPU is present; otherwise display a plain "cpu" marker.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'Tesla T4'

In [4]:
# Labelled Goodreads reviews, read from the Kaggle input directory.
_train_csv_path = "/kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv"
train_set = pd.read_csv(_train_csv_path)

# Number of reviews per rating value, to inspect the class balance.
train_set["rating"].value_counts()

4    313688
5    265007
3    188972
2     72627
0     30988
1     28718
Name: rating, dtype: int64

Due to resource constraints, I wasn't able to use all the data. Consequently, I selected a subset from each class: the first rows of each rating in file order, capped at 48,000 reviews per rating.

In [5]:
# Maximum number of rows kept for each rating value (rating -> row cap).
# Rows are taken in file order, i.e. the first `cap` matches for each rating.
_rows_per_rating = {0: 30987, 1: 28717, 2: 48000, 3: 48000, 4: 48000, 5: 48000}

_subsets = []
for _rating, _cap in _rows_per_rating.items():
    _subset = train_set[train_set['rating'] == _rating][:_cap]
    print(_subset.shape)
    _subsets.append(_subset)

# Keep the individual per-rating names; the concat cell below refers to them.
train_rate0, train_rate1, train_rate2, train_rate3, train_rate4, train_rate5 = _subsets


(30987, 11)
(28717, 11)
(48000, 11)
(48000, 11)
(48000, 11)
(48000, 11)


In [6]:
# Stack the per-rating subsets row-wise into a single training frame.
# The original row labels are kept (no index reset).
_rating_subsets = [
    train_rate0,
    train_rate1,
    train_rate2,
    train_rate3,
    train_rate4,
    train_rate5,
]
new_train = pd.concat(_rating_subsets, axis=0)

In [7]:
# Display the combined frame (rows appear grouped by rating, in concat order).
new_train

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
7,8842281e1d1347389f2ab93d60773d4d,24189224,dbc01e2438df7a87ee3dc16ee23a53e5,0,Numerous people in publishing have told me thi...,Fri May 29 17:48:57 -0700 2015,Fri May 29 17:49:40 -0700 2015,,,11,5
13,8842281e1d1347389f2ab93d60773d4d,16158596,6ff8bbc4856aa403bbd8990407c9c77a,0,Recommended by David Risher,Mon Jul 07 10:56:15 -0700 2014,Mon Jul 07 10:56:39 -0700 2014,,,0,0
54,8842281e1d1347389f2ab93d60773d4d,151,daab5f2752243787e471e2ac01bf12fc,0,"Well if Melanie says its her BBE, I gotta chec...",Mon May 14 12:55:56 -0700 2007,Sat Jan 07 11:40:38 -0800 2017,,,1,2
58,8842281e1d1347389f2ab93d60773d4d,259028,fb4acc8a30bac6bf1414a03303d43c26,0,"If steve recommends it, it must be good!",Thu Jan 18 11:09:48 -0800 2007,Mon Mar 09 00:38:30 -0700 2015,,,2,2
...,...,...,...,...,...,...,...,...,...,...,...
164051,50c82fffc560a19ca82ab87ee3a95b92,30258320,092f3faaf9db761193c6726a8fc82bf8,5,"Alternative history/fantasy. Very interesting,...",Sat Apr 15 16:56:30 -0700 2017,Sat Apr 15 16:57:51 -0700 2017,,,0,0
164052,50c82fffc560a19ca82ab87ee3a95b92,18222716,4237dbb90067669edbc7978df8f5a31c,5,** spoiler alert ** \n I greatly enjoyed this ...,Thu Jan 19 17:33:23 -0800 2017,Thu Jan 19 17:35:58 -0800 2017,Fri Jan 06 00:00:00 -0800 2017,Mon Jan 02 00:00:00 -0800 2017,0,0
164054,50c82fffc560a19ca82ab87ee3a95b92,22299763,7208191258727d84a429cd83a1dc4910,5,I dreamed this book--went to sleep and woke up...,Mon Dec 05 10:47:40 -0800 2016,Sat Apr 15 17:05:41 -0700 2017,,,0,0
164056,50c82fffc560a19ca82ab87ee3a95b92,2731276,0f2b6e84833089d7507bb611aa1053be,5,It's obvious that a great deal of care and tim...,Wed Nov 23 16:43:49 -0800 2016,Wed Nov 23 16:45:01 -0800 2016,Thu Sep 01 00:00:00 -0700 2016,,0,0


In [8]:
# Rename the target column to 'label' (in place); the Trainer cell's log below
# shows that 'label' is the one non-token column not ignored by the model.
new_train.rename(columns={'rating': 'label'}, inplace=True)

# Convert the pandas frame into a Hugging Face Dataset for tokenization/training.
train_ds = Dataset.from_pandas(new_train)

In [9]:
# Sanity check: number of distinct classes; must match num_labels passed to the model.
new_train['label'].nunique()

6

In [10]:
# Hold out 10% of the rows for validation. The split shuffles the data, so the
# seed is pinned: without it every kernel restart produces a different
# train/val partition and the reported metrics cannot be reproduced.
_split = train_ds.train_test_split(test_size=0.1, seed=42)

# Select the splits by name instead of relying on the order of dict values.
train_dataset, validation_dataset = _split["train"], _split["test"]
data_all_splits = datasets.DatasetDict({"train": train_dataset, "val": validation_dataset})

In [11]:
# Show the split sizes and the columns carried by each split.
data_all_splits

DatasetDict({
    train: Dataset({
        features: ['user_id', 'book_id', 'review_id', 'label', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments', '__index_level_0__'],
        num_rows: 226533
    })
    val: Dataset({
        features: ['user_id', 'book_id', 'review_id', 'label', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments', '__index_level_0__'],
        num_rows: 25171
    })
})

# Load the pre-trained model and its tokenizer

In [12]:
# Checkpoint identifier shared by the model and (in a later cell) the tokenizer.
model_id = 'bert-base-uncased'

# Pre-trained BERT encoder with a new 6-way classification head; 6 matches the
# number of distinct labels reported by the nunique() cell above.
model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=6,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Move the model to the device selected earlier (`device` is "cuda" when a GPU
# is available, else "cpu"). Unlike model.cuda(), this does not raise on a
# CPU-only runtime and keeps the placement consistent with `device`.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
# Load the tokenizer from the same checkpoint id as the model so both use the same vocabulary.
tokenizer = BertTokenizerFast.from_pretrained(model_id)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [15]:
def preprocess(data):
    """Tokenize the ``review_text`` field of ``data`` with the notebook's tokenizer.

    Args:
        data: Mapping with a ``'review_text'`` entry. It is applied through
            ``map(..., batched=True)`` in the next cell, so the entry is
            presumably a list of review strings.

    Returns:
        Whatever the tokenizer returns for these texts, requested with
        padding enabled and truncation to at most 512 tokens.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    reviews = data['review_text']
    return tokenizer(reviews, **tokenizer_options)

In [16]:
# Tokenize both splits; batched=True passes batches of rows to preprocess() rather than single rows.
tokenized_datasets = data_all_splits.map(preprocess, batched=True)


  0%|          | 0/227 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [17]:
# Confirm the tokenizer outputs were added as new columns to each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['user_id', 'book_id', 'review_id', 'label', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 226533
    })
    val: Dataset({
        features: ['user_id', 'book_id', 'review_id', 'label', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25171
    })
})

In [18]:
# Expose only the model inputs and the target as torch tensors; the remaining
# columns stay in the dataset but are not returned when indexing.
_model_columns = ['input_ids', 'attention_mask', 'label']
tokenized_datasets.set_format('torch', columns=_model_columns)

# Connect the notebook to a Hugging Face account

In [21]:
# Install git-lfs via a shell escape; presumably required for pushing the model weights to the Hub repo — TODO confirm.
!apt install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 115 not upgraded.


In [22]:
# Authenticate this session with the Hugging Face Hub. The call renders an
# interactive widget (see the VBox output) and therefore needs manual input;
# the training arguments below set push_to_hub=True, which presumably relies
# on this login — TODO confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
import gc
import os

# Free cached GPU memory and run the Python garbage collector before training.
torch.cuda.empty_cache()
gc.collect()

# Environment switches read by the training stack:
#   WANDB_DISABLED         - turn off Weights & Biases logging
#   TOKENIZERS_PARALLELISM - silence the tokenizers fork/parallelism warning
# NOTE(review): the tokenizer has already been used by earlier cells, so the
# second switch may take effect too late to prevent that warning - confirm.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

# Load the metrics

In [24]:
# Metric objects used by compute_metrics() below: F1 and accuracy.
# NOTE(review): newer `datasets` releases are reported to deprecate
# `load_metric` in favour of the separate `evaluate` package - confirm against
# the installed version before upgrading.
from datasets import load_metric
f1_score_metric = load_metric('f1')
accuracy_metric= load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with ``argmax`` over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"f1"`` (macro average) and ``"accuracy"``.
    """
    logits, references = eval_pred
    # Predicted class = index of the highest score in each row.
    predicted_classes = np.argmax(logits, axis=1)

    scores = {}
    scores["f1"] = f1_score_metric.compute(
        predictions=predicted_classes, references=references, average="macro"
    )["f1"]
    scores["accuracy"] = accuracy_metric.compute(
        predictions=predicted_classes, references=references
    )["accuracy"]
    return scores


Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

# Fine-tuning

In [None]:
# Training schedule: number of passes over the training split and the
# per-device batch size (used for both training and evaluation below).
epochs = 2
batch_size = 16

In [None]:
# Optimizer schedule settings passed to TrainingArguments below.
weight_decay = 0.01
warmup_steps = 500

In [25]:
# Local output directory for checkpoints; the Trainer log below shows a Hub
# repository of the same name being cloned into it.
output_dir='Goodreads_Books_Reviews_BERT_51'
training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch; both strategies are kept equal
    # so the best checkpoint can be restored at the end of training.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./logs',
    push_to_hub=True,
    # Keep the checkpoint with the best validation macro-F1
    # ("f1" is the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Disable external logging integrations explicitly. The WANDB_DISABLED
    # environment variable set earlier is deprecated (see the warning printed
    # under this cell), and report_to is the replacement it recommends.
    report_to="none",
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [26]:
# Build the Trainer from the model, arguments, tokenized splits and metrics.
# The tokenizer is passed as well so that it is saved and pushed together with
# the model (otherwise the Hub repository holds weights without tokenizer
# files), and so that batches are padded by the tokenizer-aware collator
# rather than requiring every example to already have the same length.
# NOTE(review): recent transformers releases rename this argument to
# `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/lilouuch/Goodreads_Books_Reviews_BERT_51 into local empty directory.


In [27]:
# Run fine-tuning. With the arguments above, evaluation and checkpointing
# happen once per epoch; the recorded run reports a train_runtime of ~25,000 s.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id. If date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 226533
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14160


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.9474,0.941511,0.616549,0.617894
2,0.8295,0.907856,0.636594,0.635533


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id. If date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25171
  Batch size = 32
Saving model checkpoint to Goodreads_Books_Reviews_BERT_51/checkpoint-7080
Configuration saved in Goodreads_Books_Reviews_BERT_51/checkpoint-7080/config.json
Model weights saved in Goodreads_Books_Reviews_BERT_51/checkpoint-7080/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date_added, started_at, read_at

TrainOutput(global_step=14160, training_loss=0.9454089148569915, metrics={'train_runtime': 25082.1595, 'train_samples_per_second': 18.063, 'train_steps_per_second': 0.565, 'total_flos': 1.1921095464221491e+17, 'train_loss': 0.9454089148569915, 'epoch': 2.0})

In [28]:
# Evaluate on the validation split. load_best_model_at_end=True is set, so this
# presumably scores the checkpoint with the best validation F1 - TODO confirm.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id. If date_added, started_at, read_at, review_text, date_updated, review_id, user_id, n_votes, n_comments, __index_level_0__, book_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25171
  Batch size = 32


{'eval_loss': 0.9078561067581177,
 'eval_f1': 0.636594234520338,
 'eval_accuracy': 0.6355329545906003,
 'eval_runtime': 520.1101,
 'eval_samples_per_second': 48.396,
 'eval_steps_per_second': 1.513,
 'epoch': 2.0}

In [29]:
# Save the final model and upload it to the Hub repository (see the push log below).
trainer.push_to_hub()


Saving model checkpoint to Goodreads_Books_Reviews_BERT_51
Configuration saved in Goodreads_Books_Reviews_BERT_51/config.json
Model weights saved in Goodreads_Books_Reviews_BERT_51/pytorch_model.bin
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'F1', 'type': 'f1', 'value': 0.636594234520338}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.6355329545906003}]}
To https://huggingface.co/lilouuch/Goodreads_Books_Reviews_BERT_51
   5f9cee5..ccffc0f  main -> main

