In [None]:
# This Jupyter file should be ran on https://colab.research.google.com
# don't forget to upload `train.jsonl` and `test.jsonl`

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# supress output
%%capture

!pip install transformers
!pip install evaluate
!pip install -U datasets

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import os

In [22]:
# load train and validation files
dataset = load_dataset(
    "json",
    data_files={"train": "/content/train.jsonl", "test": "/content/test.jsonl" },
    storage_options={})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

In [24]:
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(example['text'], padding=True, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

output_dir = "./bert-customer-message-classification"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=200,
    save_total_limit=2,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

In [26]:
from transformers import Trainer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
# Disabling HuggingFace remote lookups
os.environ['WANDB_DISABLE'] = "true"
os.environ['WANDB_MODE'] = "offline"

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Create new Trainer, and train model
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()
trainer.evaluate()
trainer.save_model()

Step,Training Loss,Validation Loss


In [30]:
# push trained model to huggingface
trainer.push_to_hub("jimmyarfs/modelcustmsgclassification")

Uploading...:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/jimmyarfs/bert-customer-message-classification/commit/f28e8a018a3f1f42c1fd8e1e65871b700f649eaa', commit_message='jimmyarfs/modelcustmsgclassification', commit_description='', oid='f28e8a018a3f1f42c1fd8e1e65871b700f649eaa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jimmyarfs/bert-customer-message-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='jimmyarfs/bert-customer-message-classification'), pr_revision=None, pr_num=None)

In [32]:
# instance model for testingtest model
from transformers import pipeline

pipe = pipeline("text-classification", model="jimmyarfs/bert-customer-message-classification")


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [33]:
# deve retornar LABEL_1 (venda)
pipe("Olá, gostaria de fazer a aquisição do novo produto")

[{'label': 'LABEL_1', 'score': 0.9990980625152588}]

In [34]:
# deve retornar LABEL_0 (suporte)
pipe("tudo bom, queria verificar como funciona a TV Smart x0912")

[{'label': 'LABEL_0', 'score': 0.9993481040000916}]