<a href="https://colab.research.google.com/github/jskaza/deep-learning-projects/blob/main/goodreads_book_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://www.kaggle.com/code/jonskaza/goodreads-book-reviews?scriptVersionId=119917271" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [5]:
# !pip install datasets transformers optuna
# ! pip -q install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download goodreads-books-reviews-290312

In [6]:
from datasets import load_dataset, ClassLabel, Features
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import os
from tqdm.auto import tqdm
import torch
import zipfile
from google.colab import drive
drive.mount("/content/drive")
# from huggingface_hub import HfApi, HfFolder
# from kaggle_secrets import UserSecretsClient

Mounted at /content/drive


In [18]:
# config
token = os.environ.get("HUGGINGFACE") # huggingface token
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true" # no wandb

In [8]:
# extract training data
with zipfile.ZipFile("/content/goodreads-books-reviews-290312.zip") as z:
   with z.open("goodreads_train.csv") as f:
     pd.read_csv(f).to_csv("goodreads_train.csv", index = False)
dataset = load_dataset("csv", data_files="goodreads_train.csv", usecols = ["review_text","rating"]).cast_column("rating", ClassLabel(num_classes = 6))
dataset = dataset.rename_column("rating", "label")
dataset = dataset.rename_column("review_text", "text")
dataset["train"].features

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6a58fe8045bac472/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6a58fe8045bac472/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/900000 [00:00<?, ? examples/s]

{'label': ClassLabel(names=['0', '1', '2', '3', '4', '5'], id=None),
 'text': Value(dtype='string', id=None)}

In [9]:
# train/val split
train_dataset = dataset["train"].train_test_split(test_size=0.15, seed=99)["train"]
val_dataset = dataset["train"].train_test_split(test_size=0.15, seed=99)["test"]



In [10]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast = True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [12]:
# tokenization
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val =  val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/765000 [00:00<?, ? examples/s]

Map:   0%|          | 0/135000 [00:00<?, ? examples/s]

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# f1 as the metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"f1": f1_score(y_true=labels, y_pred=predictions, average="macro")}

In [15]:
id2label = {}
label2id = {}
for i in range(6):
    id2label[i] = i
    label2id[i] = i
    
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

In [19]:
training_args = TrainingArguments(
    output_dir="goodreads-text-classification",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_token=token
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# hyperparameter search for learning rate using 1/20 of training data
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
    }


trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train.shard(index=1, num_shards=20),
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=3
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}



Download file pytorch_model.bin:   0%|          | 8.00k/255M [00:00<?, ?B/s]

Download file runs/Feb22_23-34-07_d17c62766b7a/events.out.tfevents.1677108853.d17c62766b7a.20134.0: 100%|#####…

Download file runs/Feb22_23-02-25_d17c62766b7a/1677107285.1832297/events.out.tfevents.1677107285.d17c62766b7a.…

Clean file runs/Feb22_23-34-07_d17c62766b7a/events.out.tfevents.1677108853.d17c62766b7a.20134.0:  16%|#6      …

Download file runs/Feb22_23-02-25_d17c62766b7a/1677107468.7801063/events.out.tfevents.1677107468.d17c62766b7a.…

Clean file runs/Feb22_23-02-25_d17c62766b7a/1677107285.1832297/events.out.tfevents.1677107285.d17c62766b7a.658…

Clean file runs/Feb22_23-02-25_d17c62766b7a/1677107468.7801063/events.out.tfevents.1677107468.d17c62766b7a.658…

Download file runs/Feb23_16-26-09_802d74a17cd1/1677169574.9800794/events.out.tfevents.1677169574.802d74a17cd1.…

Download file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677175303.802d74a17cd1.4418.4: 100%|######…

Clean file runs/Feb23_16-26-09_802d74a17cd1/1677169574.9800794/events.out.tfevents.1677169574.802d74a17cd1.441…

Download file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677169574.802d74a17cd1.4418.0: 100%|######…

Clean file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677175303.802d74a17cd1.4418.4:  16%|#5       …

Download file runs/Feb22_23-34-07_d17c62766b7a/1677108853.2327404/events.out.tfevents.1677108853.d17c62766b7a.…

Download file runs/Feb22_23-34-07_d17c62766b7a/1677113979.4587486/events.out.tfevents.1677113979.d17c62766b7a.…

Clean file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677169574.802d74a17cd1.4418.0:  16%|#5       …

Download file runs/Feb22_23-02-25_d17c62766b7a/1677107663.2827442/events.out.tfevents.1677107663.d17c62766b7a.…

Clean file runs/Feb22_23-34-07_d17c62766b7a/1677108853.2327404/events.out.tfevents.1677108853.d17c62766b7a.201…

Download file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677172430.802d74a17cd1.4418.2: 100%|######…

Download file runs/Feb23_16-26-09_802d74a17cd1/1677178185.16607/events.out.tfevents.1677178185.802d74a17cd1.44…

Clean file runs/Feb22_23-34-07_d17c62766b7a/1677113979.4587486/events.out.tfevents.1677113979.d17c62766b7a.201…

Download file runs/Feb22_23-02-25_d17c62766b7a/1677106966.4319966/events.out.tfevents.1677106966.d17c62766b7a.…

Download file runs/Feb22_23-02-25_d17c62766b7a/1677107112.141582/events.out.tfevents.1677107112.d17c62766b7a.6…

Clean file runs/Feb22_23-02-25_d17c62766b7a/1677107663.2827442/events.out.tfevents.1677107663.d17c62766b7a.658…

Download file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107468.d17c62766b7a.658.20: 100%|######…

Download file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677178185.802d74a17cd1.4418.6: 100%|######…

Download file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107112.d17c62766b7a.658.16: 100%|######…

Clean file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677172430.802d74a17cd1.4418.2:  16%|#5       …

Download file runs/Feb23_16-26-09_802d74a17cd1/1677175303.296535/events.out.tfevents.1677175303.802d74a17cd1.4…

Download file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107663.d17c62766b7a.658.22: 100%|######…

Clean file runs/Feb23_16-26-09_802d74a17cd1/1677178185.16607/events.out.tfevents.1677178185.802d74a17cd1.4418.…

Download file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107285.d17c62766b7a.658.18: 100%|######…

Download file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677106966.d17c62766b7a.658.14: 100%|######…

Download file runs/Feb22_23-34-07_d17c62766b7a/events.out.tfevents.1677113979.d17c62766b7a.20134.2: 100%|#####…

Clean file runs/Feb22_23-02-25_d17c62766b7a/1677106966.4319966/events.out.tfevents.1677106966.d17c62766b7a.658…

Clean file runs/Feb22_23-02-25_d17c62766b7a/1677107112.141582/events.out.tfevents.1677107112.d17c62766b7a.658.…

Clean file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107468.d17c62766b7a.658.20:  18%|#8       …

Download file runs/Feb23_15-51-30_802d74a17cd1/1677167564.3062172/events.out.tfevents.1677167564.802d74a17cd1.…

Clean file runs/Feb23_16-26-09_802d74a17cd1/events.out.tfevents.1677178185.802d74a17cd1.4418.6:   9%|8        …

Download file runs/Feb23_16-26-09_802d74a17cd1/1677172430.4120102/events.out.tfevents.1677172430.802d74a17cd1.…

Clean file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107112.d17c62766b7a.658.16:  18%|#8       …

Clean file runs/Feb23_16-26-09_802d74a17cd1/1677175303.296535/events.out.tfevents.1677175303.802d74a17cd1.4418…

Clean file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107663.d17c62766b7a.658.22:  18%|#8       …

Clean file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677107285.d17c62766b7a.658.18:  18%|#8       …

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file runs/Feb22_23-02-25_d17c62766b7a/events.out.tfevents.1677106966.d17c62766b7a.658.14:  18%|#8       …

Download file runs/Feb23_15-51-30_802d74a17cd1/events.out.tfevents.1677167564.802d74a17cd1.435.0: 100%|#######…

Clean file runs/Feb22_23-34-07_d17c62766b7a/events.out.tfevents.1677113979.d17c62766b7a.20134.2:  19%|#8      …

Clean file runs/Feb23_15-51-30_802d74a17cd1/1677167564.3062172/events.out.tfevents.1677167564.802d74a17cd1.435…

Clean file runs/Feb23_16-26-09_802d74a17cd1/1677172430.4120102/events.out.tfevents.1677172430.802d74a17cd1.441…

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file runs/Feb23_15-51-30_802d74a17cd1/events.out.tfevents.1677167564.802d74a17cd1.435.0:  26%|##5       …

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [30]:
best_trial

BestRun(run_id='2', objective=0.519019249545317, hyperparameters={'learning_rate': 1.2027342302971699e-05})

In [21]:
# use best learning rate on full data
for n, v in best_trial.hyperparameters.items():
    setattr(trainer.args, n, v)
trainer.train_dataset = tokenized_train
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}



ValueError: ignored

In [None]:
# inference on test data
with zipfile.ZipFile("/content/goodreads-books-reviews-290312.zip") as z:
   with z.open("goodreads_test.csv") as f:
     df = pd.read_csv(f)
tokenizer = AutoTokenizer.from_pretrained("jskaza/my_awesome_model")
model = AutoModelForSequenceClassification.from_pretrained("jskaza/my_awesome_model").to("cuda")
texts = df["review_text"].tolist()
batchsize = 32
preds = []
for i in tqdm(range(0, len(texts), batchsize)):
    batch = texts[i:i+batchsize]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    preds.extend(logits.argmax(dim=1).tolist())

In [None]:
pd.DataFrame({
    "review_id": df["review_id"],
    "rating": preds
}).to_csv("/content/drive/MyDrive/submission.csv", index=False)
    