# Install deps

In [None]:
# Install the third-party packages used by this notebook (quiet output).
# NOTE(review): versions are unpinned, and later cells also import `sklearn`
# and `torch`, which are not installed here — presumably they are preinstalled
# in the target environment; confirm before running on a fresh machine.
!pip install --quiet transformers
!pip install --quiet pandas
!pip install --quiet numpy
!pip install --quiet datasets

In [2]:
!python --version
!pip --version

Python 3.7.11
pip 21.1.3 from /usr/local/lib/python3.7/dist-packages/pip (python 3.7)


# Import deps

In [31]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

# Global variables

In [3]:
# variables
# Names of the GLUE tasks; presumably the valid values for `task` (the list
# itself is not referenced by the cells below — verify before relying on it).
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
# Task selected for this run; drives dataset, column, label-count and metric choices below.
task = "sst2"
# Checkpoint name passed to both the tokenizer and the model `from_pretrained` calls.
model_checkpoint = "bert-base-uncased"
# Per-device batch size used for both training and evaluation in TrainingArguments.
batch_size = 16

In [None]:
# Instantiate the tokenizer that matches `model_checkpoint` via AutoTokenizer.
# This downloads the vocabulary that was used when the checkpoint was pretrained;
# `use_fast=True` requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Download data

In [None]:
# "mnli-mm" is not a GLUE configuration of its own: it is loaded as "mnli"
# and only differs later in which validation split is selected.
actual_task = task if task != "mnli-mm" else "mnli"

# Fetch the dataset splits and the matching GLUE metric for the chosen task.
dataset = load_dataset("glue", actual_task)
metric = load_metric("glue", actual_task)

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [6]:
dataset["train"][0]

{'idx': 0,
 'label': 0,
 'sentence': 'hide new secretions from the parental units '}

# Preprocessing

In [10]:
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:

# Maps each GLUE task name to the dataset column(s) holding its text input(s).
# The second entry is None for tasks that have a single sentence per example.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

# Column names for the currently selected task; read by preprocess_function.
sentence1_key, sentence2_key = task_to_keys[task]
print(sentence1_key, sentence2_key)

sentence None


In [13]:
def preprocess_function(examples):
    """Tokenize the text column(s) of ``examples`` for the selected task.

    Reads the module-level ``sentence1_key`` / ``sentence2_key`` to decide
    which columns to tokenize. When ``sentence2_key`` is None only the first
    column is passed to the tokenizer; otherwise both columns are passed as a
    pair. Truncation is always enabled.

    Returns whatever the module-level ``tokenizer`` returns for those inputs.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        # Two-input tasks: supply the second column as the paired text.
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)

In [None]:
# Apply preprocess_function to the whole `dataset` object; `batched=True`
# makes `map` hand the function batches of examples instead of one at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [18]:
encoded_dataset["train"][0]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'idx': 0,
 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
 'label': 0,
 'sentence': 'hide new secretions from the parental units ',
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

# Fine tuning

In [None]:
# Size of the classification head: 3 outputs for the MNLI variants,
# a single output for STS-B, and 2 outputs for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Load the pretrained checkpoint with a sequence-classification head of that size.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

In [23]:
# Name of the metric used to pick the best checkpoint: STS-B and CoLA have
# dedicated entries, every other task falls back to accuracy.
_TASK_SPECIFIC_METRICS = {
    "stsb": "pearson",
    "cola": "matthews_correlation",
}
metric_name = _TASK_SPECIFIC_METRICS.get(task, "accuracy")
print(metric_name)

accuracy


In [27]:
# Training configuration for the GLUE fine-tuning run.
args = TrainingArguments(
    "test-glue",  # first positional argument: the output directory
    evaluation_strategy = "steps",  # evaluate on a step schedule ...
    eval_steps = 10,  # ... every 10 steps
    save_total_limit = 5,  # cap on the number of checkpoints kept on disk
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    metric_for_best_model=metric_name,  # "best" is judged by the task's metric chosen above
)

In [28]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into the metric values for the selected task.

    ``eval_pred`` unpacks into ``(predictions, labels)``. For "stsb" the
    first output column is used directly as the prediction; for every other
    task the prediction is the argmax over axis 1. The result of the
    module-level ``metric.compute`` is returned unchanged.
    """
    raw_outputs, labels = eval_pred
    if task == "stsb":
        # The model is built with a single output for this task: take that column.
        predictions = raw_outputs[:, 0]
    else:
        # Pick the highest-scoring class index per example.
        predictions = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [29]:
# MNLI ships two validation splits (matched / mismatched); every other task
# uses the plain "validation" split.
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

# Wire model, arguments, data, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

# Custom Dataset

## Download the dataset
[Amazon Reviews](http://jmcauley.ucsd.edu/data/amazon/links.html)

## Load dataset

In [None]:
import json

# The reviews file is read as JSON Lines: one JSON document per line.
# A context manager guarantees the file handle is closed (the previous bare
# open() inside the comprehension never closed it), and blank lines are
# skipped so a trailing empty line does not crash json.loads.
with open('AMAZON_FASHION_5.json', 'r') as reviews_file:
    data = [json.loads(line) for line in reviews_file if line.strip()]

In [None]:
def read_custom_dataset(dataset):
    """Extract review texts and binary labels from parsed review records.

    Args:
        dataset: iterable of review records. A usable record provides a
            'reviewText' entry and an 'overall' rating comparable to 3.

    Returns:
        A ``(text, label)`` tuple of two parallel lists of equal length.
        ``label[i]`` is 0 when the rating of ``text[i]`` is below 3, else 1.

    Records that lack either field (or whose rating cannot be compared to a
    number) are printed and skipped.
    """
    text = []
    label = []
    for review in dataset:
        try:
            # Read and convert both fields BEFORE appending anything, so a
            # record missing its rating cannot leave a text without a label
            # and shift every later pair out of alignment.
            review_text = review['reviewText']
            review_label = 0 if review['overall'] < 3 else 1
        except (KeyError, TypeError):
            # Best-effort: report the unusable record and move on.
            print(review)
            continue
        text.append(review_text)
        label.append(review_label)
    return text, label

# Parallel lists of review texts and 0/1 labels built from the loaded records.
texts, labels = read_custom_dataset(data)

## Train and validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Target fractions of the full data set for each split (they sum to 1.0).
train_ratio = 0.70
validation_ratio = 0.10
test_ratio = 0.20

# Fixed seed so that re-running the notebook reproduces the same split.
split_seed = 42

# Step 1: keep `train_ratio` (70%) of the data for training; the remaining
# 30% is held out and divided between validation and test in step 2.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    texts,
    labels,
    test_size=1 - train_ratio,
    random_state=split_seed,
)

# Step 2: split the holdout so that, relative to the full data set,
# validation ends up at `validation_ratio` (10%) and test at `test_ratio` (20%).
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=split_seed,
)

print(len(x_train), len(x_val), len(x_test))

In [None]:
# NOTE(review): DistilBertTokenizerFast is imported here but not used in the
# cells visible in this notebook.
from transformers import DistilBertTokenizerFast, AutoTokenizer
# Rebinds the global `tokenizer` (previously created from `model_checkpoint`
# in the GLUE section) to the DistilBERT tokenizer; re-running the earlier
# GLUE cells after this one would therefore pick up this tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize each split with identical settings (truncation and padding on),
# in train -> validation -> test order.
train_encodings, val_encodings, test_encodings = [
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (x_train, x_val, x_test)
]

In [None]:
import torch

class AmazonDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    holding one entry per example; ``labels`` is an indexable collection with
    one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection defines how many examples the dataset has.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor under its own
        name, and the example's label is added under the key 'labels'.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its labels, in
# train -> validation -> test order.
train_dataset, val_dataset, test_dataset = [
    AmazonDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
]

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Training configuration for the Amazon reviews run.
# `load_best_model_at_end=True` needs checkpoints to be saved on the same
# schedule as evaluation; the default save strategy is step-based, so it is
# set to "epoch" explicitly to match `evaluation_strategy` (library versions
# that validate this reject the mismatch with a ValueError).
args = TrainingArguments(
    output_dir="test-amazon",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Reviews are labelled 0/1 by read_custom_dataset, so the head has two classes
# (stated explicitly rather than relying on the library default).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# NOTE(review): `compute_metrics` is the function defined in the GLUE section;
# it reads the global `task` and `metric` from there. "accuracy" must be among
# the values it returns for `metric_for_best_model` to resolve — confirm when
# `task` is changed to "stsb" or "cola".
trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=args,                        # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # metric function shared with the GLUE section
)

trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
pred = trainer.predict(test_dataset)