In [3]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np


In [5]:
# DistilBERT (base, uncased): chosen because at about 67 million parameters it is small enough to run on my computer
model_checkpoint = 'distilbert-base-uncased'

# label maps: class index <-> sentiment name (label2id is simply the inverse of id2label)
id2label = { 0:"Negative", 1:"Positive" }
label2id = { name: idx for idx, name in id2label.items() }

# build a sequence-classification model on top of the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# load the dataset: IMDB movie reviews, each tagged with a positive or negative label
dataset = load_dataset("shawhin/imdb-truncated")
dataset

# the training and validation splits each contain 1000 rows.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)-00000-of-00001-5a744bf76a1d84b2.parquet:   0%|          | 0.00/836k [00:00<?, ?B/s]

(…)-00000-of-00001-a3a52fabb70c739f.parquet:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [7]:
# PREPROCESS THE DATA

# tokenizer that belongs to the same pretrained checkpoint as the model
# NOTE(review): add_prefix_space is usually an option of byte-level BPE tokenizers — confirm it has any effect for this checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# create a tokenization function
def tokenize_function(examples):
  """Tokenize the "text" field of a batch of examples.

  Texts longer than 512 tokens are truncated from the left, i.e. tokens at
  the beginning of the text are dropped and the end of the text is kept.

  Args:
    examples: batch of rows; only ``examples["text"]`` is read.

  Returns:
    The tokenizer output for the batch, requested as NumPy values.
  """
  # truncate over-long texts at their start rather than at their end
  # (note: this sets the option on the shared tokenizer object itself)
  tokenizer.truncation_side = "left"

  return tokenizer(
      examples["text"],
      max_length=512,
      truncation=True,
      return_tensors="np",
  )


# make sure a padding token exists; if one has to be added, resize the model's
# token embeddings so they match the enlarged vocabulary
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens(dict(pad_token='[PAD]'))
  model.resize_token_embeddings(len(tokenizer))

# run the tokenization function over every split, one batch of rows at a time
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)
tokenized_datasets

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [9]:
# create data collator
# ----> dynamically pads the examples in a batch to the length of the longest sequence in that batch.
# ----> padding per batch like this is computationally cheaper than padding every example to one fixed length up front.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# load the accuracy evaluation metric
accuracy_metric = evaluate.load("accuracy")
accurtacy = accuracy_metric   # original (misspelled) name, kept as an alias in case another cell still refers to it

# define an evaluation function to pass into the Trainer later
def compute_metrics(p):
  """Compute classification accuracy for a set of evaluation predictions.

  Args:
    p: pair ``(predictions, labels)``. ``predictions`` holds one row of
      per-class scores per example; ``labels`` holds the reference class ids.

  Returns:
    dict: ``{"accuracy": <value>}`` as produced by the accuracy metric.
  """
  predictions, labels = p
  # pick the highest-scoring class for every example
  predicted_ids = np.argmax(predictions, axis=1)

  # .compute() already returns {"accuracy": value}; return it unchanged instead of
  # nesting it under a second "accuracy" key, so the reported metric is a plain number
  return accuracy_metric.compute(predictions=predicted_ids, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
# Before fine-tuning, check how the base model (with its freshly initialized classifier head) performs

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
model.eval()             # inference only: put the model in evaluation mode (disables dropout)
with torch.no_grad():    # no gradients are needed for prediction
  for text in text_list:
    # tokenize text and place it on the same device as the model
    inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
    # compute logits (one row of class scores for the single input text)
    logits = model(inputs).logits
    # convert logits to label: argmax over the class dimension, not over the flattened tensor
    predictions = torch.argmax(logits, dim=-1)
    print(text, ":", id2label[predictions.item()])





Untrained model predictions:
----------------------------
ıt was good. : Positive
Not a fan, don't recommend. : Negative
Better than the first one. : Negative
This is not worth watching even once. : Negative
This one is a pass. : Negative


In [12]:
# Fine-tuning with LoRA

# step one: describe the LoRA adapter that will be trained
peft_config = LoraConfig(
    task_type="SEQ_CLS",        # sequence classification
    r=4,                        # rank of the trainable low-rank update matrices
    lora_alpha=32,              # scaling factor applied to the LoRA update
    lora_dropout=0.01,          # dropout probability used in the LoRA layers
    target_modules=['q_lin'],   # apply LoRA to the query projection layers
)

# wrap the base model with the adapter, then report how many parameters are trainable
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307
