In [None]:
pip install -qU accelerate bitsandbytes torch transformers wandb evaluate trl datasets peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Imports
import os
import numpy as np
from datasets import load_dataset, load_metric
from tqdm import tqdm
import torch
import re
import evaluate

from transformers import (
    Trainer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainingArguments
)

In [None]:
# Check if GPU and CUDA are available.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# Only query device details when a GPU is present: the current-device and
# device-name lookups fail on a CPU-only machine instead of reporting "no GPU".
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.get_device_name(0))

True
1
0
<torch.cuda.device object at 0x7c0473d90f10>
Tesla T4


In [None]:
# Load the IMDB dataset, then remove its "unsupervised" split so that only
# the "train" and "test" splits remain in `dataset`.
# `pop` returns the removed split, which is the value this cell displays.
dataset = load_dataset("imdb", trust_remote_code=True)
dataset.pop("unsupervised")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

In [None]:
# Quick look at the loaded data: container type, per-split summary,
# per-split column names, and one raw training example (index 2).
print(f"DATASET TYPE: {type(dataset)}")
print(f"DATASET INFO: {dataset.items()}")
print(f"DATASET COL NAMES: {dataset.column_names}")
print(f"DATASET EXAMPLE: \n {dataset['train'][2]}")

DATASET TYPE: <class 'datasets.dataset_dict.DatasetDict'>
DATASET INFO: dict_items([('train', Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})), ('test', Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}))])
DATASET COL NAMES: {'train': ['text', 'label'], 'test': ['text', 'label']}
DATASET EXAMPLE: 
 {'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />", 'label': 0}


In [None]:
# <br> tags in any spelling (<br>, <br/>, <br />), possibly repeated, together
# with any whitespace that follows each one.
_LINE_BREAK_TAGS = re.compile(r"(?:<br\s*/?>\s*)+", re.IGNORECASE)
# Any other tag-like span: "<", one or more non-"<" characters, then ">".
_OTHER_TAGS = re.compile(r"<[^<]+?>")


def clear_text(text: str) -> str:
    """Strip HTML markup from a review without gluing neighbouring words together.

    Runs of ``<br>`` tags are replaced by a single space, because they separate
    sentences in the raw text; deleting them outright would turn
    ``"film<br /><br />The"`` into ``"filmThe"``. Every remaining tag-like span
    is removed with no replacement.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The text with line-break tags turned into spaces and other tags removed.
    """
    without_breaks = _LINE_BREAK_TAGS.sub(" ", text)
    return _OTHER_TAGS.sub("", without_breaks)

In [None]:
# Settings
# Identifier of the pretrained checkpoint; the tokenizer and the classification
# model are both loaded from it, and it prefixes the training output directories.
model_checkpoint = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Clean and tokenize one batch of examples.

    Each entry of ``examples["text"]`` is passed through ``clear_text`` and the
    cleaned strings are tokenized together with truncation enabled.

    Returns:
        The tokenizer output for the whole batch.
    """
    cleaned_reviews = list(map(clear_text, examples["text"]))
    return tokenizer(cleaned_reviews, truncation=True)

In [None]:
# Apply the cleaning + tokenization step to every split, one batch at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Display the splits and their columns after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [None]:
# Accuracy metric object used by `compute_metrics` below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predictions are
            reduced to class ids with an argmax along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Collator built around the tokenizer; it is handed to the Trainer to assemble
# (and pad) batches from the tokenized examples.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Class id -> readable name, plus the inverse mapping derived from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Sweep configuration: position i of every list describes fine-tuning run i.
learning_rates = [2e-3, 2e-4, 2e-5, 2e-7, 2e-9]
n_epochs = [2, 3, 3, 5, 7]
# Run ids are simply the positions 0..N-1 of the runs above.
finetuned_ids = list(range(len(learning_rates)))

In [None]:
# The tokenizer and the padding collator do not depend on the hyperparameters
# being swept, so they are created once here instead of once per run.
# (`return_tensors` is a tokenizer-call argument and is not passed to
# `from_pretrained`.)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tune one model per (run id, learning rate, epoch count) triple.
for f_id, lr, n_e in zip(finetuned_ids, learning_rates, n_epochs):

    print(f"ID: {f_id}, LEARNING_RATE: {lr}, N_EPOCHS: {n_e}")

    # Every run starts again from the pretrained checkpoint, so the runs do
    # not build on each other's weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )

    training_args = TrainingArguments(
        # One output directory per run, keyed by the run id.
        output_dir=f"{model_checkpoint}-finetuned-CLASSIFICATION-{f_id}",
        learning_rate=lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=n_e,
        weight_decay=0.01,
        # Evaluate and checkpoint at the end of every epoch, then reload the
        # best checkpoint once training finishes.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

ID: 0, LEARNING_RATE: 0.002, N_EPOCHS: 2


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6934,0.69315,0.5
2,0.693,0.693172,0.5


ID: 1, LEARNING_RATE: 0.0002, N_EPOCHS: 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6252,0.624346,0.67912


In [None]:
# Sample review that the following inference cell classifies.
# NOTE(review): this name shadows the builtin `input()`; it is kept because the
# next cell reads it under this name.
input = "movie was amazing...."

In [None]:
# Classify the sample review with a checkpoint written by fine-tuning run 2.
# NOTE(review): the path assumes that run 2 finished and saved a checkpoint at
# step 9375 — confirm the directory exists before running this cell.
my_checkpoint = "distilbert-base-uncased-finetuned-CLASSIFICATION-2/checkpoint-9375"

tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)
# Mirror the training-time preprocessing (tag stripping and truncation) so the
# model receives input in the same form it was fine-tuned on, and so that an
# over-long review cannot exceed the model's maximum sequence length.
inputs = tokenizer(clear_text(input), truncation=True, return_tensors="pt")

model = AutoModelForSequenceClassification.from_pretrained(my_checkpoint)
# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits
# Take the argmax along the class dimension rather than over the flattened
# tensor; for this single-example batch the result is one class id.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

In [None]:
# Re-derive the predicted label from the `logits` produced by the inference
# cell above and display it (relies on `logits` and `model` still being defined).
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
# Load a PEFT (adapter-based) causal language model from a local directory,
# merge the adapter weights into the base model, and print the model class
# before and after the merge.
from peft import AutoPeftModelForCausalLM

# Local path, check post scriptum for explanation
# NOTE(review): no post scriptum is visible in this part of the notebook —
# confirm where the explanation for this path lives.
model_id = "./ArcturusAI/Crystalline-1.1B-v23.12-tagger"
peft_model = AutoPeftModelForCausalLM.from_pretrained(model_id)
print(type(peft_model))

merged_model = peft_model.merge_and_unload()
# The adapters are merged now and it is transformers class again
print(type(merged_model))