<a href="https://colab.research.google.com/github/jskaza/deep-learning-projects/blob/main/goodreads_book_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://www.kaggle.com/code/jonskaza/goodreads-book-reviews?scriptVersionId=119917271" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [15]:
# !pip install datasets transformers optuna
# ! pip -q install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download goodreads-books-reviews-290312

In [16]:
from datasets import load_dataset, ClassLabel, Features
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import os
from tqdm.auto import tqdm
import torch
import zipfile
from google.colab import drive
drive.mount("/content/drive")
# from huggingface_hub import HfApi, HfFolder
# from kaggle_secrets import UserSecretsClient

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# config
token = os.environ.get("HUGGINGFACE") # huggingface token
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true" # no wandb

In [18]:
# extract training data
with zipfile.ZipFile("/content/goodreads-books-reviews-290312.zip") as z:
   with z.open("goodreads_train.csv") as f:
     # 200k random examples
     pd.read_csv(f).sample(200000).to_csv("goodreads_train.csv", index = False)
dataset = load_dataset("csv", data_files="goodreads_train.csv", usecols = ["review_text","rating"]).cast_column("rating", ClassLabel(num_classes = 6))
dataset = dataset.rename_column("rating", "label")
dataset = dataset.rename_column("review_text", "text")
dataset["train"].features

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-8d3670b944d0c2e2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-8d3670b944d0c2e2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/200000 [00:00<?, ? examples/s]

{'label': ClassLabel(names=['0', '1', '2', '3', '4', '5'], id=None),
 'text': Value(dtype='string', id=None)}

In [19]:
# train/val split
train_dataset = dataset["train"].train_test_split(test_size=0.15, seed=99)["train"]
val_dataset = dataset["train"].train_test_split(test_size=0.15, seed=99)["test"]



In [20]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast = True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [21]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [22]:
# tokenization
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val =  val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/170000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
# f1 as the metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"f1": f1_score(y_true=labels, y_pred=predictions, average="macro")}

In [25]:
id2label = {}
label2id = {}
for i in range(6):
    id2label[i] = i
    label2id[i] = i
    
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5
  },
  "initializer_range": 0.02,
  "label2id": {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/mo

In [26]:
training_args = TrainingArguments(
    output_dir="goodreads-text-classification",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_token=token
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# hyperparameter search for learning rate using 1/10 of training data
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
    }


trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train.shard(index=1, num_shards=10),
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}



Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 32


In [None]:
best_trial

In [None]:
# use best learning rate on full data
for n, v in best_trial.hyperparameters.items():
    setattr(trainer.args, n, v)
trainer.train_dataset = tokenized_train
trainer.train()

In [None]:
# inference on test data
with zipfile.ZipFile("/content/goodreads-books-reviews-290312.zip") as z:
   with z.open("goodreads_test.csv") as f:
     df = pd.read_csv(f)
tokenizer = AutoTokenizer.from_pretrained("jskaza/my_awesome_model")
model = AutoModelForSequenceClassification.from_pretrained("jskaza/my_awesome_model").to("cuda")
texts = df["review_text"].tolist()
batchsize = 32
preds = []
for i in tqdm(range(0, len(texts), batchsize)):
    batch = texts[i:i+batchsize]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    preds.extend(logits.argmax(dim=1).tolist())

In [None]:
pd.DataFrame({
    "review_id": df["review_id"],
    "rating": preds
}).to_csv("/content/drive/MyDrive/submission.csv", index=False)
    