In [1]:
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
from glob import glob

In [3]:
# CSV file backing each split; the dict keys become the split names.
split_files = {
    "train": "/home/karysoares/Documents/book-reviews/data/sentiment_train.csv",
    "test": "/home/karysoares/Documents/book-reviews/data/sentiment_test.csv",
}
dataset = load_dataset("csv", data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Values that the "label" column is expected to contain.
labels = [1, 2, 3, 4, 5]

# Two-way lookup between a label value and its zero-based position in `labels`.
id2label = dict(enumerate(labels))
label2id = {rating: index for index, rating in id2label.items()}

In [5]:
# Checkpoint whose tokenizer is used to encode the review text.
MODEL_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of examples and convert their labels to class ids.

    Padding is intentionally not applied here. This function runs inside
    ``dataset.map``, so padding would stretch every example to the longest
    text in its map chunk and store those pad tokens in the tokenized
    dataset. The ``DataCollatorWithPadding`` passed to the ``Trainer`` pads
    each training/eval batch instead, so batches are only as long as their
    own longest example. Tensor conversion is likewise left to the collator.

    Args:
        examples: Batch mapping with a "text" column (list of strings) and a
            "label" column whose values are keys of ``label2id``.

    Returns:
        The tokenizer output for the batch, with "label" set to the list of
        class ids looked up through ``label2id``.

    Raises:
        KeyError: If a label value is not present in ``label2id``.
    """
    # Truncate over-long texts to the tokenizer's maximum length; no padding.
    inputs = tokenizer(examples["text"], truncation=True)
    inputs["label"] = [label2id[label] for label in examples["label"]]
    return inputs

# Apply the preprocessing to the dataset, passing 100 rows to the function per call.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    batch_size=100,
)

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [6]:
# Batch collator built from the tokenizer; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics calls its `.compute`.
accuracy = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    """Score a set of model outputs with the loaded accuracy metric.

    Args:
        eval_pred: Pair that unpacks into (model outputs, reference labels).
            The outputs are reduced with argmax over axis 1 to get one
            predicted class id per row.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted ids and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [9]:
# Checkpoint to load the sequence-classification model from.
MODEL_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"

# Label settings forwarded to the model: number of classes plus both id/label lookups.
label_settings = {
    "num_labels": len(labels),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, **label_settings)

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [10]:
training_args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir="/home/karysoares/Documents/book-reviews/notebooks/genre_classification",
    # Schedule: evaluate and save once per epoch, and reload the best
    # checkpoint when training finishes.
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    # Model and run configuration.
    model=model,
    args=training_args,
    # Data: tokenized splits plus the tokenizer and the collator that forms batches.
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Metric callback used during evaluation.
    compute_metrics=compute_metrics,
)



In [11]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

  0%|          | 0/9000 [00:00<?, ?it/s]

: 