<a href="https://www.kaggle.com/code/jonskaza/goodreads-book-reviews?scriptVersionId=119917271" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [40]:
from datasets import load_dataset, ClassLabel, Features
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import os
from huggingface_hub import HfApi, HfFolder
from kaggle_secrets import UserSecretsClient
from tqdm.auto import tqdm
import torch

In [41]:
# config
token = UserSecretsClient().get_secret("huggingface")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

In [42]:
pd.read_csv("../input/goodreads-books-reviews-290312/goodreads_train.csv").sample(50000).to_csv("sample.csv", index = False)
dataset = load_dataset("csv", data_files="sample.csv", usecols = ["review_text","rating"]).cast_column("rating", ClassLabel(num_classes = 6))
# dataset = load_dataset("csv", data_files="../input/goodreads-books-reviews-290312/goodreads_train.csv", usecols = ["review_text","rating"]).cast_column("rating", ClassLabel(num_classes = 6))
dataset = dataset.rename_column("rating", "label")
dataset = dataset.rename_column("review_text", "text")
dataset["train"].features

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-00e143b2201835b0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-00e143b2201835b0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/5 [00:00<?, ?ba/s]

{'label': ClassLabel(num_classes=6, names=['0', '1', '2', '3', '4', '5'], id=None),
 'text': Value(dtype='string', id=None)}

In [43]:
train_dataset = dataset["train"].train_test_split(test_size=0.1, seed=99)["train"]
val_dataset = dataset["train"].train_test_split(test_size=0.1, seed=99)["test"]

In [44]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast = True)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

In [45]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [46]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val =  val_dataset.map(preprocess_function, batched=True)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [47]:
tokenized_train

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 45000
})

In [48]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"f1": f1_score(y_true=labels, y_pred=predictions, average="macro")}

In [50]:
id2label = {}
label2id = {}
for i in range(6):
    id2label[i] = i
    label2id[i] = i
    
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5
  },
  "initializer_range": 0.02,
  "label2id": {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vo

In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_token=token
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
/kaggle/working/my_awesome_model is already a clone of https://huggingface.co/jskaza/my_awesome_model. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 45000
  Num Epochs = 1
  In

Epoch,Training Loss,Validation Loss


In [None]:
df = pd.read_csv("../input/goodreads-books-reviews-290312/goodreads_test.csv").head(1000)
tokenizer = AutoTokenizer.from_pretrained("jskaza/my_awesome_model")
model = AutoModelForSequenceClassification.from_pretrained("jskaza/my_awesome_model").to("cuda")
texts = df["review_text"].tolist()
batchsize = 32
preds = []
for i in tqdm(range(0, len(texts), batchsize)):
    batch = texts[i:i+batchsize]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    preds.extend(logits.argmax(dim=1).tolist())

In [17]:
for i in tqdm(range(0, len(texts), batchsize)):
    batch = texts[i:i+batchsize]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    print(logits)

  0%|          | 0/32 [00:00<?, ?it/s]

tensor([[-0.3737, -0.3665, -0.1387,  0.1193,  0.4873,  0.3005],
        [-0.3405, -0.2933, -0.1827,  0.1522,  0.4885,  0.3114],
        [-0.3115, -0.2627, -0.1168,  0.0734,  0.4207,  0.2242],
        [-0.2956, -0.2955, -0.1939,  0.1479,  0.5383,  0.2410],
        [-0.4131, -0.3369, -0.1880,  0.1987,  0.4657,  0.3585],
        [-0.3432, -0.2479, -0.1741,  0.1362,  0.5026,  0.3145],
        [-0.3918, -0.3101, -0.1532,  0.1509,  0.5451,  0.3137],
        [-0.4396, -0.3497, -0.1430,  0.1644,  0.5313,  0.3518],
        [-0.3065, -0.2844, -0.2148,  0.1207,  0.4260,  0.2948],
        [-0.3259, -0.3013, -0.1758,  0.1550,  0.5328,  0.3020],
        [-0.4172, -0.3254, -0.1709,  0.1257,  0.4958,  0.3005],
        [-0.3651, -0.2885, -0.1992,  0.1463,  0.4953,  0.3095],
        [-0.3460, -0.2441, -0.1728,  0.1033,  0.4902,  0.2829],
        [-0.4122, -0.3370, -0.1320,  0.1409,  0.4818,  0.3339],
        [-0.3184, -0.2610, -0.1520,  0.0838,  0.4801,  0.2673],
        [-0.3827, -0.3096, -0.1722,  0.1

In [15]:
pd.DataFrame({
    "review_id": df["review_id"],
    "rating": preds
})
    

Unnamed: 0,review_id,rating
0,5c4df7e70e9b438c761f07a4620ccb7c,4
1,8eaeaf13213eeb16ad879a2a2591bbe5,4
2,dce649b733c153ba5363a0413cac988f,4
3,8a46df0bb997269d6834f9437a4b0a77,4
4,d11d3091e22f1cf3cb865598de197599,4
...,...,...
995,5162e3acc7b885da745e9e1936f6d7fc,4
996,c8b097ade0962c0a2ee035797ae4dcf5,4
997,bf8c2c80bbcf2550aba8ff75a9e8162d,4
998,60a622207b3b41d18769d87341c22019,4
