Install the required packages

In [None]:
!pip3 install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
!pip3 install torchvision
!pip3 install pickle
!pip3 install datasets
!pip3 install transformers

Unzip the uploaded data archive (intent.zip)

In [67]:
from zipfile import ZipFile
file_name = "intent.zip"

# Bind the archive to a name other than `zip`: at cell top level the `as`
# target becomes a kernel-wide global and would hide the builtin zip() from
# every cell executed afterwards.
with ZipFile(file_name, 'r') as archive:
    # Unpack everything into the current working directory.
    archive.extractall()
    print('Done')

Done


Load the train / eval splits as a dataset

In [None]:
from datasets import load_dataset

# Map each split name to its file inside the extracted archive.
data_files = {
    "train": "intent/train.json",
    "eval": "intent/eval.json",
}
# The file suffix ("json") is passed to load_dataset as the loader name.
extension = data_files["train"].split(".")[-1]
# NOTE: `datasets` here is the loaded splits object, not the `datasets` package.
datasets = load_dataset(
    extension,
    data_files=data_files
)
# Only the last expression of a cell is echoed, so the split overview is
# printed explicitly; the feature schema is shown as the cell result.
print(datasets)
datasets["train"].features

In [None]:
# Peek at the first five utterances, then at the first five intent labels.
for column in ("text", "intent"):
    for item in datasets["train"][column][:5]:
        print(item)

Extract texts and labels, and build the label vocabulary

In [70]:
# Pull the utterances and their intent labels out of each split, column-wise
# (avoids materialising every row as a dict twice per split).
train_texts = list(datasets["train"]["text"])
train_labels = list(datasets["train"]["intent"])
dev_texts = list(datasets["eval"]["text"])
dev_labels = list(datasets["eval"]["intent"])

# Sort the label set so the label <-> index mapping is reproducible: the
# iteration order of a set of strings can change between interpreter runs
# (hash randomisation), which would silently reshuffle class indices after a
# kernel restart and invalidate previously saved checkpoints.
labels = sorted(set(train_labels))
idx2label = labels
label2idx = {label: idx for idx, label in enumerate(labels)}

# Fail early with a readable message instead of a KeyError in the middle of
# training when the eval split contains an intent never seen in train.
unseen_dev_labels = set(dev_labels) - set(labels)
if unseen_dev_labels:
    raise ValueError(
        f"eval split contains intents missing from train: {sorted(unseen_dev_labels)}"
    )

Utilities: dataset wrapper and metric function

In [71]:
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer-encoded labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container (a tensor or a list of lists).
        labels: Sequence of raw labels, aligned index-for-index with the
            containers in ``encodings``.
        label2idx: Optional mapping from raw label to class index. When left
            as ``None`` the module-level ``label2idx`` is looked up each time
            an item is fetched, which is the original behaviour.
    """

    def __init__(self, encodings, labels, label2idx=None):
        self.encodings = encodings
        self.labels = labels
        self.label2idx = label2idx

    def _label_index(self, raw_label):
        """Translate a raw label into its class index."""
        if self.label2idx is not None:
            return self.label2idx[raw_label]
        # Fall back to the notebook-level mapping (resolved at call time).
        return label2idx[raw_label]

    def __getitem__(self, idx):
        """Return one example as a dict of feature tensors plus a ``label`` entry."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning;
                # clone().detach() is the equivalent warning-free copy.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        item['label'] = self._label_index(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

In [72]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores; the highest-scoring class along
            the last axis is taken as the predicted class).

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    return {
        'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1)),
    }

train

In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification

# Checkpoints to fine-tune; every entry is trained and scored in turn.
model_ids = ["prajjwal1/bert-tiny"]

# Batch sizes are named once so the warm-up length below cannot drift out of
# sync with the batch size actually used for training.
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 64

# transformers renamed `evaluation_strategy` to `eval_strategy`; pass whichever
# keyword the installed TrainingArguments dataclass declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Dev-set accuracy of each fine-tuned model, in the same order as model_ids.
accuracies = []
for model_id in model_ids:

    print(f"*** {model_id} ***")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(labels))

    # Tokenise each split once up front, padded to the longest example in the split.
    train_texts_encoded = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
    dev_texts_encoded = tokenizer(dev_texts, padding=True, truncation=True, return_tensors="pt")

    train_dataset = ClassificationDataset(train_texts_encoded, train_labels)
    dev_dataset = ClassificationDataset(dev_texts_encoded, dev_labels)

    training_args = TrainingArguments(
        # NOTE(review): every model writes to the same directory; give each
        # model its own output_dir before adding more entries to model_ids.
        output_dir='./results',
        num_train_epochs=8,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        # Warm up for as many steps as there are full training batches.
        warmup_steps=len(train_dataset) // TRAIN_BATCH_SIZE,
        weight_decay=0.01,
        logging_dir='./logs',
        eval_steps=50,
        save_steps=50,
        save_total_limit=10,
        load_best_model_at_end=True,
        **{eval_strategy_kwarg: "steps"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
    )

    trainer.train()

    # Score the final model on the dev split (this notebook defines no
    # separate test split) and record the result for this model id.
    dev_results = trainer.evaluate()
    accuracies.append(dev_results["eval_accuracy"])
    print(f"{model_id}: dev accuracy = {dev_results['eval_accuracy']:.4f}")