# Install/Import Libraries

In [10]:
!pip install transformers
!pip install datasets
!pip install transformers[sentencepiece]



In [11]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, pipeline, EarlyStoppingCallback
from datasets import load_dataset, load_metric, DatasetDict
from tqdm import tqdm

# Mount Google Drive

In [12]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the project folder is reachable.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Data

In [13]:
cd /content/drive/MyDrive/Corelli/Duke Spring 2022/AIPI 540/NLP Module Project

/content/drive/MyDrive/Corelli/Duke Spring 2022/AIPI 540/NLP Module Project


In [14]:
# Read the raw reviews into pandas for a quick visual check of the columns.
data = pd.read_csv(filepath_or_buffer="./reviews.csv")
data

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4
...,...,...,...
107013,107013,Trendy topic with talks from expertises in the...,4
107014,107014,"Wonderful! Simple and clear language, good ins...",5
107015,107015,an interesting and fun course. thanks. dr quincy,5
107016,107016,"very broad perspective, up to date information...",4


# Import data as Hugging Face Dataset
- 80% training split: 85,614 examples
- 10% validation split: 10,702 examples
- 10% test split: 10,702 examples

In [15]:
# Fixed seed for both splits. Without it every run reshuffles the data, so the reported
# val/test metrics are not reproducible, and rows that a previously saved checkpoint was
# trained on could end up in the val/test splits after a kernel restart.
SPLIT_SEED = 42

# The csv builder exposes a single 'train' split holding every row of the file.
dataset = load_dataset('csv', data_files='reviews.csv', split='train')
# Rename to "labels" before the data is handed to the Trainer below.
dataset = dataset.rename_column("Label", "labels")

# First cut: 80% train / 20% held out.
train_and_holdout = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Second cut: halve the held-out part into validation and test (10% / 10% overall).
val_and_test = train_and_holdout['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

dataset = DatasetDict({
    'train': train_and_holdout['train'],
    'val': val_and_test['train'],
    'test': val_and_test['test'],
})

dataset

Using custom data configuration default-8448cd7c1f063266
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-8448cd7c1f063266/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


DatasetDict({
    train: Dataset({
        features: ['Id', 'Review', 'labels'],
        num_rows: 85614
    })
    val: Dataset({
        features: ['Id', 'Review', 'labels'],
        num_rows: 10702
    })
    test: Dataset({
        features: ['Id', 'Review', 'labels'],
        num_rows: 10702
    })
})

# Import model

In [16]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
MODEL_CHECKPOINT = "microsoft/deberta-v3-small"

# model_max_length=256 caps the sequence length used for truncation/padding during tokenization.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=256)

# Classification head with 6 outputs (class ids 0-5).
# NOTE(review): the data preview only shows labels 4 and 5; presumably ratings are 1-5 and
# class 0 is never used -- confirm against the data.
deberta = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=6)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the c

# Prepare dataset (tokenize)

In [17]:
def tokenize_function(examples):
    """Tokenize the "Review" field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length,
    so every encoded example has the same length.
    """
    reviews = examples["Review"]
    encoded = tokenizer(reviews, truncation=True, padding="max_length")
    return encoded


# Tokenize every split in batches; the tokenizer outputs are added as new columns.
dataset = dataset.map(function=tokenize_function, batched=True)

  0%|          | 0/86 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [18]:
# Display the splits again to check the columns added by tokenization.
dataset

DatasetDict({
    train: Dataset({
        features: ['Id', 'Review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 85614
    })
    val: Dataset({
        features: ['Id', 'Review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10702
    })
    test: Dataset({
        features: ['Id', 'Review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10702
    })
})

In [21]:
# Per-device batch size, used for both training and evaluation.
batch_size = 16

# With 4 gradient-accumulation steps the effective training batch is 16 x 4 = 64 per device.
# Evaluation and checkpointing both happen once per epoch, and the best checkpoint is
# reloaded when training ends.
training_args = TrainingArguments(
    output_dir="real_run",
    num_train_epochs=25,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits, and the result is whatever the
    notebook-level `metric` computes from predictions vs. references.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)


# Accuracy metric used by compute_metrics above.
metric = load_metric(path="accuracy")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


# Finetune model

In [22]:
# Stop once 3 consecutive evaluations fail to improve (improvement threshold 0.0).
# NOTE(review): training_args sets no metric_for_best_model, so the monitored quantity is
# presumably the validation loss -- confirm.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)

trainer = Trainer(
    model=deberta,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Using amp half precision backend


In [23]:
# Fine-tune the model; evaluation and checkpointing follow the per-epoch strategy set in training_args.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: Id, Review. If Id, Review are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 85614
  Num Epochs = 25
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 33425


Epoch,Training Loss,Validation Loss,Accuracy
0,0.5203,0.513748,0.796487
1,0.4763,0.521699,0.799103
2,0.429,0.527739,0.795459
3,0.3673,0.607524,0.798169


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: Id, Review. If Id, Review are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10702
  Batch size = 16
Saving model checkpoint to real_run/checkpoint-1337
Configuration saved in real_run/checkpoint-1337/config.json
Model weights saved in real_run/checkpoint-1337/pytorch_model.bin
tokenizer config file saved in real_run/checkpoint-1337/tokenizer_config.json
Special tokens file saved in real_run/checkpoint-1337/special_tokens_map.json
added tokens file saved in real_run/checkpoint-1337/added_tokens.json
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: Id, Review. If Id, Review are not expected by `DebertaV2ForSequenceClassification.forward`,

TrainOutput(global_step=5348, training_loss=0.45397068443098615, metrics={'train_runtime': 2194.801, 'train_samples_per_second': 975.191, 'train_steps_per_second': 15.229, 'total_flos': 2.268455370633216e+16, 'train_loss': 0.45397068443098615, 'epoch': 4.0})

In [24]:
# Score the held-out test split; the returned object carries logits, label ids and metrics.
trainer.predict(test_dataset=dataset["test"])

The following columns in the test set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: Id, Review. If Id, Review are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10702
  Batch size = 16


PredictionOutput(predictions=array([[-5.51  , -3.441 , -3.58  , -0.6675,  3.309 ,  4.934 ],
       [-4.617 ,  0.3555,  0.8184,  1.457 ,  0.9087, -0.3892],
       [-5.445 , -3.854 , -3.934 , -1.158 ,  3.402 ,  5.664 ],
       ...,
       [-5.406 , -3.834 , -3.947 , -1.21  ,  3.375 ,  5.695 ],
       [-5.312 , -3.223 , -3.264 , -0.4246,  3.188 ,  4.457 ],
       [-5.03  , -3.336 , -3.61  , -1.444 ,  2.846 ,  5.76  ]],
      dtype=float16), label_ids=array([5, 2, 5, ..., 5, 5, 5]), metrics={'test_loss': 0.4878043830394745, 'test_accuracy': 0.807886376378247, 'test_runtime': 29.3649, 'test_samples_per_second': 364.449, 'test_steps_per_second': 22.782})

# Inference

In [26]:
# NOTE(review): this import is not referenced anywhere in the visible cells.
from transformers import DebertaForSequenceClassification

# Reload the fine-tuned weights from the checkpoint directory saved on Drive.
checkpoint_dir = "/content/drive/MyDrive/Corelli/Duke Spring 2022/AIPI 540/NLP Module Project/real_run/checkpoint-1337"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

loading configuration file /content/drive/MyDrive/Corelli/Duke Spring 2022/AIPI 540/NLP Module Project/real_run/checkpoint-1337/config.json
Model config DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Corelli/Duke Spring 2022/AIPI 540/NLP Module Project/real_run/checkpoint-1337",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_

In [27]:
# Wrap the reloaded model and the training tokenizer in a text-classification pipeline.
classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
)