In [1]:
from transformers import AutoTokenizer  
# Pretrained checkpoint shared by the tokenizer and, further down, the classifier.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# GLUE task name; it is passed to the metric loader later in the notebook.
task = "cola"

# Per-device batch size reused for both training and evaluation.
batch_size = 16

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
import json

from datasets import Dataset

# Load the raw records. Each record's "text" field is parsed below as
# "<integer label>: <phrase>".
with open("../input/parser_training/training.json", "r", encoding="utf-8") as f:
    data = json.load(f)["data"]

phrases = []
labels = []

for record in data:
    # Split on the FIRST ": " only, so that phrases which themselves contain
    # ": " are kept instead of being silently dropped from the training set.
    label_text, separator, sentence = record["text"].partition(": ")
    if not separator:
        # No "label: phrase" structure at all -> skip the record.
        continue
    try:
        label = int(label_text)
    except ValueError:
        if ": " in sentence:
            # The text has several ": " but does not start with an integer
            # label; such records were skipped before and are still skipped.
            continue
        # A plain "<prefix>: <phrase>" record with a non-integer prefix is a
        # data error; surface it rather than hiding it.
        raise
    phrases.append(sentence)
    labels.append(label)

examples = {
    "sentence": phrases,
    "label": labels,
}

# Fixed seed so the 80/20 train/test split is identical on every
# Restart & Run All.
dataset = Dataset.from_dict(examples).train_test_split(test_size=0.2, seed=42)

In [34]:
from datasets import load_metric

# Evaluation metric for the GLUE configuration named by `task`.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package -- confirm against the installed version.
metric = load_metric(
    "glue",
    task,
)

In [35]:
# Re-create the tokenizer for the same checkpoint, explicitly requesting the
# fast implementation. This rebinds the `tokenizer` created in the first cell.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /Users/maribelrb/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /Users/maribelrb/.cache/huggingface/transformers/0e1bbfd

In [36]:
def preprocess_function(input):
    """Tokenize the ``'sentence'`` entry of a dataset example or batch.

    Args:
        input: Mapping with a ``'sentence'`` key. It is passed to
            ``dataset.map(..., batched=True)`` elsewhere in this notebook.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called on the
        sentences with truncation enabled.
    """
    sentences = input['sentence']
    return tokenizer(sentences, truncation=True)

In [37]:
# Tokenize both splits; batched=True hands preprocess_function a batch of
# examples per call rather than a single example.
encoded_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

100%|██████████| 26/26 [00:01<00:00, 25.18ba/s]
100%|██████████| 7/7 [00:00<00:00, 39.24ba/s]


In [39]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size of the classification head.
# NOTE(review): hard-coded; presumably matches the distinct integer labels in
# the training file -- confirm against the data.
num_labels = 3

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /Users/maribelrb/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}


In [40]:
# Training configuration. Evaluation and checkpointing both run once per epoch,
# and the best checkpoint (by "matthews_correlation") is reloaded at the end.
# NOTE(review): the log emitted by this cell says the default of `report_to`
# will change in v5; consider setting it explicitly.
args = TrainingArguments(
    output_dir="TextClassification",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (shared constant from the configuration cell).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation / checkpoint cadence and best-model selection.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="matthews_correlation",
    # Keep everything local.
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [41]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model scores into class predictions and score them.

    Args:
        eval_pred: A pair ``(scores, labels)``. ``scores`` is indexed along
            axis 1 to pick the highest-scoring class for each row.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [42]:
# Wire the model, training configuration, tokenized splits and metric callback
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
# NOTE(review): this cell has no execution count or recorded output, so the
# notebook as saved does not show a completed training run.
trainer.train()

In [14]:
# Persist the current model through the Trainer (no explicit target directory
# is passed here).
# NOTE(review): the recorded output of this cell reports saving to
# "PPIBot model", which does not match the "TextClassification" directory given
# to TrainingArguments -- the output looks stale from an earlier run; confirm.
trainer.save_model()

Saving model checkpoint to PPIBot model
Configuration saved in PPIBot model/config.json
Model weights saved in PPIBot model/pytorch_model.bin
tokenizer config file saved in PPIBot model/tokenizer_config.json
Special tokens file saved in PPIBot model/special_tokens_map.json


In [5]:
from transformers import AutoModelForSequenceClassification

# Reload a fine-tuned classifier from disk, replacing the in-memory `model`.
# NOTE(review): this path points one directory above the working directory,
# while training was configured with the relative directory
# "TextClassification" -- confirm which location holds the intended weights.
model = AutoModelForSequenceClassification.from_pretrained(
    "../TextClassification"
)


In [17]:
import torch

phrase = "The ratio of reimbursements rejected by employees"

# Tokenize the raw sentence the same way preprocess_function does for the
# training data (plain string, truncation on) instead of pre-splitting on
# spaces, so inference and training share one preprocessing path.
tokens = tokenizer(phrase, return_tensors="pt", truncation=True)
# Keep the inputs on the same device as the model weights.
tokens = tokens.to(model.device)

# Inference only: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    logits = model(**tokens)["logits"]

# Index of the highest-scoring class for the single input phrase.
predictions = logits.argmax(-1).tolist()[0]
predictions


1