In [None]:
# install dependencies
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pypr

In [None]:
import torch
# Sanity check: the cell displays True when PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [None]:
import pandas as pd
from datasets import load_dataset
import torch
import numpy as np
from transformers import (AutoTokenizer,AutoConfig,AutoModelForSequenceClassification,DataCollatorWithPadding,TrainingArguments,Trainer)
from peft import PeftModel,PeftConfig,get_peft_model,LoraConfig
import bitsandbytes as bnb

In [None]:
# Hugging Face Hub identifier of the pre-trained checkpoint that gets fine-tuned.
model_checkpoint = "distilbert-base-uncased"

# Class index -> spending category. This is the single source of truth for the
# label set used by the classifier.
id2label = {
    0: "Gym",
    1: "Groceries",
    2: "Dining",
    3: "Entertainment",
    4: "Pharmacy",
    5: "Merchandise",
    6: "Travel",
    7: "Miscellaneous",
    8: "Gas/Automotive",
}
# Inverse mapping, derived from id2label so the two dictionaries cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}
# Load the pre-trained encoder with a sequence-classification head. The number of
# output classes is taken from the label mapping instead of being hard-coded, so
# adding or removing a category only requires editing id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Labeled transactions CSV.
# NOTE(review): the path points into Google Drive, so Drive presumably has to be
# mounted at /content/drive before this cell runs - confirm and document the mount step.
DATA_FILE = "/content/drive/MyDrive/LLM Stuff/credit_card_data_labeled.csv"

# Load the CSV into a DatasetDict. With a single data file everything ends up in
# one "train" split; no validation split is created here.
data = load_dataset("csv", data_files=DATA_FILE)

data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 334
    })
})

In [None]:
#Step 2 Data Preparation. The text has to be tokenized because neural networks operate on numbers, not on raw text.

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space = True)
#NOTE(review): add_prefix_space is an option of byte-level BPE tokenizers (e.g. GPT-2 / RoBERTa); it presumably has no effect on this checkpoint's tokenizer - confirm, and drop the argument if so.

def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Intended to be passed to ``data.map(..., batched=True)``.

    Args:
        examples: A batch from the dataset; only ``examples['text']`` is read.

    Returns:
        The tokenizer output for the batch, with every example truncated to at
        most 512 tokens. No padding is applied here, so examples in the batch
        may have different lengths.
    """
    text = examples['text']

    # Over-long inputs are cut from the left, i.e. the end of the text is kept.
    # The side is an arbitrary choice; it is (re)assigned on every call, which is
    # harmless but could equally live next to the tokenizer construction.
    tokenizer.truncation_side = "left"

    # return_tensors is deliberately not set: the batch is not padded, so the
    # sequences are ragged and cannot form a regular array. Plain Python lists
    # are returned instead and stored by the dataset as-is.
    tokenized_inputs = tokenizer(
        text,
        truncation=True,  # truncate inputs longer than max_length
        max_length=512,   # maximum number of tokens kept per example
    )

    return tokenized_inputs

# Make sure the tokenizer has a padding token.
# Sequence inputs usually need a uniform length inside a mini-batch; padding
# tokens fill the empty positions of the shorter sequences. If the tokenizer
# does not define one, register "[PAD]" and resize the model's token embeddings
# to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Tokenize the dataset in batches; the tokenizer's columns are added next to the
# original ones.
tokenized_dataset = data.map(tokenize_function, batched=True)
tokenized_dataset

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 334
    })
})

In [None]:
#Step 3 Create a Data Collator.
#The collator dynamically pads the training examples within a batch during training. For example, if a batch of
#4 inputs has one of length 500 and three of length 200, the shorter ones are padded, but only relative to that batch. Another batch could have different lengths,
#which makes this approach more computationally efficient than padding everything to one global length.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
#Step 4 Parameter-efficient fine-tuning with LoRA. The LoRA configuration is defined first.

peft_config = LoraConfig(
    task_type="SEQ_CLS",        # sequence classification
    r=4,                        # rank of the trainable low-rank matrices
    # NOTE(review): the previous comment likened lora_alpha to a learning rate;
    # it is presumably the scaling factor applied to the LoRA update - confirm
    # against the PEFT documentation.
    lora_alpha=32,
    lora_dropout=0.01,          # dropout probability
    target_modules=['q_lin'],   # LoRA is applied to the query projection layers
)

# NOTE(review): this rebinds `model`, so re-running the cell would wrap an
# already-wrapped model - run it once per kernel session.
model = get_peft_model(model, peft_config)

# Print trainable vs. total parameter counts; in the recorded run fewer than 1%
# of the parameters are trainable.
model.print_trainable_parameters()

trainable params: 634,377 || all params: 67,594,770 || trainable%: 0.9385


In [None]:
#Step 5 Define Hyperparameters and Training Arguments
#Fine-tuning an LLM can be costly because it is rarely a one-time task: these
#parameters usually have to be tweaked over several runs to get optimal results.

lr = 2e-4        # learning rate: size of each optimization step
batch_size = 32  # training examples processed per optimization step (per device)
num_epochs = 10  # number of passes over the training data


# Training configuration. Checkpoints and logs go to a directory named after the
# base checkpoint.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-credit-card-labeler",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=lr,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    optim="paged_adamw_8bit",
    logging_steps=3,      # log the training loss every 3 optimization steps
    save_total_limit=3,   # keep at most 3 checkpoints on disk
)

# No evaluation dataset is passed, so only the training loss is reported.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

model.config.use_cache = False
# NOTE(review): the recorded run logs a training loss of ~0 from step 3 onward,
# which suggests the model had already been trained earlier in the same kernel
# session - re-run from a fresh kernel to confirm the numbers.
trainer.train()

Step,Training Loss
3,0.0001
6,0.0
9,0.0
12,0.0001
15,0.0
18,0.0
21,0.0
24,0.0
27,0.0
30,0.0


TrainOutput(global_step=110, training_loss=0.0003602250775408968, metrics={'train_runtime': 6.2807, 'train_samples_per_second': 531.788, 'train_steps_per_second': 17.514, 'total_flos': 21188910603600.0, 'train_loss': 0.0003602250775408968, 'epoch': 10.0})

In [None]:
#Save Model
#model.save_pretrained("credit-card-labeler-model")



In [None]:
# Raw merchant strings used as a quick smoke test of the fine-tuned classifier.
testing_examples = [
    "Sq *park Shelton",
    "The Salvation Army 7 Rochester Hls Mi",
    "Csc Servicework Auburn Hills Mi",
    "Auth : Roccos Italian Deli Ll",
    "Belding Cleaners 131-38225800 Mi",
    "Mcdonald's F34783",
    "Amzn Mktp Us*o79qd5x53 Amzn.com/bill Wa",
    "Lime*ride Httpswww.li.m Ca",
]

# Use the GPU when one is available and fall back to the CPU otherwise. The model
# is placed explicitly instead of relying on an earlier cell having moved it.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training leaves the model in train mode; eval() disables dropout so that the
# predictions below are deterministic.
model.eval()

print("Trained Model Predictions:")
print("--------------------------")

# Inference only: no_grad() avoids building the autograd graph for each forward pass.
with torch.no_grad():
    for text in testing_examples:
        # Tokenize one example; truncation guards against inputs longer than
        # the tokenizer's maximum length.
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

        logits = model(**inputs).logits
        # Index of the highest-scoring class for the single example in the batch.
        predictions = logits.argmax(dim=-1)

        print(text + " - " + id2label[predictions.item()])

Trained Model Predictions:
--------------------------
Sq *park Shelton - Miscellaneous
The Salvation Army 7 Rochester Hls Mi - Merchandise
Csc Servicework Auburn Hills Mi - Dining
Auth : Roccos Italian Deli Ll - Dining
Belding Cleaners 131-38225800 Mi - Gas/Automotive
Mcdonald's F34783 - Dining
Amzn Mktp Us*o79qd5x53 Amzn.com/bill Wa - Merchandise
Lime*ride Httpswww.li.m Ca - Miscellaneous
