In [1]:
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
from glob import glob

In [3]:
# Read the train and test CSV files into one dataset object keyed by split name.
# NOTE(review): these are absolute paths on one user's machine; consider a
# configurable data directory so the notebook runs elsewhere.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "/home/karysoares/Documents/book-reviews/data/category_train.csv",
        "test": "/home/karysoares/Documents/book-reviews/data/category_test.csv",
    },
)

In [4]:
# Genre names in a fixed order; a label's position in this list is its class id.
labels = [
    "fiction",
    "history",
    "religion",
    "juvenile fiction",
    "biography & autobiography",
    "business & economics",
    "computers",
    "social science",
    "juvenile nonfiction",
    "science",
    "education",
    "cooking",
    "sports & recreation",
    "family & relationships",
    "literary criticism",
    "music",
    "medical",
    "health & fitness",
    "body, mind & spirit",
    "language arts & disciplines",
    "political science",
    "art",
    "psychology",
    "philosophy",
    "travel",
    "technology & engineering",
    "self-help",
    "poetry",
    "foreign language study",
    "crafts & hobbies",
    "performing arts",
    "reference",
    "comics & graphic novels",
    "mathematics",
    "nature",
    "architecture",
    "transportation",
    "law",
    "humor",
    "photography",
    "antiques & collectibles",
    "drama",
    "young adult fiction",
    "pets",
    "literary collections",
    "gardening",
    "games",
    "study aids",
    "games & activities",
    "house & home",
    "bibles",
    "true crime",
    "design",
    "children's stories",
    "english language",
    "animals",
    "bible",
    "detective and mystery stories",
]

# Lookup tables in both directions: integer class id <-> genre name.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in enumerate(labels)}

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize a batch of examples and convert their string labels to class ids.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; reads the
            ``"text"`` and ``"label"`` columns.

    Returns:
        The tokenizer output for the batch with an added ``"label"`` entry
        holding the integer id of each example's label.

    Raises:
        KeyError: If a label in the batch is not a key of ``label2id``.
    """
    # Truncate only. Padding is intentionally left to the DataCollatorWithPadding
    # given to the Trainer, so each training batch is padded to its own longest
    # sequence rather than to the longest of a 100-row map chunk. Plain Python
    # lists are returned because the mapped dataset stores the values itself.
    encoded = tokenizer(examples["text"], truncation=True)
    encoded["label"] = [label2id[label] for label in examples["label"]]
    return encoded


tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=100)



Map:   0%|          | 0/4976 [00:00<?, ? examples/s]

In [6]:
# Batch collator built from the same tokenizer used in preprocessing; it is
# passed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the "accuracy" metric via the `evaluate` library; compute_metrics calls
# its `compute` method.
accuracy = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced with an argmax over axis 1 to get one
            predicted class id per row.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted ids compared
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [9]:
# Load the pretrained DistilBERT checkpoint for sequence classification, with
# one output class per entry in `labels` and both label mappings passed along.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    # Where checkpoints and other training outputs are written.
    output_dir="genre_classification",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the tokenized splits, the collator and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)



In [13]:
# Start training
trainer.train()

  0%|          | 0/2488 [00:00<?, ?it/s]