 # *This code is applying the ["Finetune_mistral_7b_sarcasm_detection.ipynb"](https://github.com/pal4ai/FinetuneMistral7B/blob/main/Finetune_mistral_7b_sarcasm_detection.ipynb) written by [RiyaJoshi](https://github.com/pal4ai) to my customized electric vehicle dataset

In [None]:
!git

## Install necessary libraries

In [1]:
!pip install --upgrade pyarrow
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U datasets scipy ipywidgets
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes trl peft

Collecting pyarrow
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 14.0.2
    Uninstalling pyarrow-14.0.2:
      Successfully uninstalled pyarrow-14.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-17.0.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43

## Import necessary libraries

In [4]:
import pandas as pd
import numpy as np
import io
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from peft import AutoPeftModelForCausalLM,PeftConfig
import transformers
from datetime import datetime
from trl import SFTTrainer
from tqdm import tqdm



In [78]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Load Dataset

In [6]:
train_pd_full = pd.read_csv('/content/drive/MyDrive/electric-vehicle/data/train_202.csv',on_bad_lines='skip',header=0,encoding='utf-8')
column_names_list=train_pd_full.columns

test_pd_full = pd.read_csv('/content/drive/MyDrive/electric-vehicle/data/test_50.csv',on_bad_lines='skip',header=0,encoding='utf-8')
column_names_list=test_pd_full.columns

train_pd_full['CAFV_indicator'] = train_pd_full['CAFV_indicator'].map(int)
test_pd_full['CAFV_indicator'] = test_pd_full['CAFV_indicator'].map(int)
train_pd_full.rename(columns={"CAFV_indicator": 'label'}, inplace=True)
test_pd_full.rename(columns={"CAFV_indicator": 'label'}, inplace=True)


In [None]:
print('Actual data size of the full train dataframe',train_pd_full.shape)
print('Actual data size of the full test dataframe',test_pd_full.shape)

Actual data size of the full train dataframe (202, 3)
Actual data size of the full test dataframe (50, 3)


### Convert pandas dataframe to a dataset

In [9]:
train_dataset = Dataset.from_pandas(train_pd_full)
test_dataset = Dataset.from_pandas(test_pd_full)
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['Make-Model-Year', 'review', 'label'],
    num_rows: 202
})
Dataset({
    features: ['Make-Model-Year', 'review', 'label'],
    num_rows: 50
})


## LOAD BASE MODEL AND CONFIGs

In [85]:
with open("/content/drive/MyDrive/electric-vehicle/huggingface-token", "r") as f:
  MY_HUGGINGFACE_TOKEN=str(f.read()).split("\n")[0]

In [86]:
from huggingface_hub import login
login(token=MY_HUGGINGFACE_TOKEN, add_to_git_credential=True)

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [11]:
## Base model - Pretrained LLM you want to eventually finetune
base_model_name = 'mistralai/Mistral-7B-v0.1'

## Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
## Load the pretrained model after quantization
model = AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## TOKENIZATION
### Tokenize the input data along with prompt

In [12]:
## set up tokenizer parameters

tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    model_max_length=512,  ## Max Length of input to the model (play around with it may be )
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512, ## Max Length of input to the model (play around with it may be )
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## PROMPT GENERATION

### Train Prompt

In [13]:
label_to_text = ["Not CAFV", "CAFV"]

In [14]:
def generate_train_prompt(data_point):
    full_prompt =f"""You are a CAFV (Clean Alternative Fuel Vehicle) Eligibility detection bot for Kelley Blue Book reviews. Your task is to assess the review and categorize it in context of the comment after <<< >>> into one of the following predefined categories:
    CAFV
    Not CAFV

    ####
    Here are some examples:
    review:
    pros
    outstanding off-road chops
    unmistakable styling
    excellent resale value
    cons
    unrefined on-road ride and handling
    what's new?
    design updates
    comfort improvements
    new trims
    Category: Not CAFV

    review:
    pros
    excellent electric driving range
    enjoyable to drive, powerful
    software updates/improvement occur continuously
    well-integrated infotainment
    driving assistance features work well
    cons
    access to ’s supercharger charging station network is not free, as it is for model s and model x owners
    virtually all controls operated from within the touchscreen, unlike every other car
    the conventional trunk configuration, rather than a hatchback, limits cargo flexibility
    what's new?
    new and updated driving-assistance systems
    re-configuring packages for the most popular versions
    tax credit incentives cease
    new software adds  theater, smart summon, and more
    Category: CAFV

    If the text doesn't fit into any of the above categories, classify it as:
    Not CAFV
    <<<
    review:
    {data_point["review"]}
    >>>
    Category: {label_to_text[data_point["label"]]}
    """
    return full_prompt

### PROMPT ENGINEERING
#### Evaluating the performance of a zero shot prompt on test data on pretrained base model


#### Evaluation Prompt

In [16]:
## ith example
i=1
print("Parent comment: " + train_dataset[i]['review'])
print("Label: " + str(train_dataset[i]['label']) + "\n")

Parent comment: pros
exceptional  style
excellent  driving dynamics
zero-emissions driving
cons
pricey
fresher rivals are proliferating
what's new?
lineup shrinks to one trim level
updated infotainment system
faster onboard charger
price: the 2022  i- starts at $69,900.
the 2022  i- is an all-electric small luxury suv/crossover with the ability to run for 234 miles between charges. it’s a thrill to drive, has plenty of visual impact, great infotainment tech, and that special  class.
when the i- was introduced in 2019, it won many awards — like world car of the year, green car of the year, a couple of design awards, etc. deservedly so.
however, the electric vehicle (ev) game has moved on even in the short time between now and 2019. back then, the only tesla alternatives were things like the nissan leaf and chevrolet bolt. but this year sees some interesting ev stuff coming out from audi, kia, and hyundai. tesla also expanded its lineup with the model y. suddenly, the i- is beginning to 

In [15]:
def generate_eval_prompt(data_point):
    full_prompt =f"""You are a CAFV (Clean Alternative Fuel Vehicle) Eligibility detection bot for Kelley Blue Book reviews. Your task is to assess the review and categorize it in context of the comment after <<< >>> into one of the following predefined categories:
    CAFV
    Not CAFV

    ####
    Here are some examples:
    review:
    pros
    outstanding off-road chops
    unmistakable styling
    excellent resale value
    cons
    unrefined on-road ride and handling
    what's new?
    design updates
    comfort improvements
    new trims
    Category: Not CAFV

    review:
    pros
    excellent electric driving range
    enjoyable to drive, powerful
    software updates/improvement occur continuously
    well-integrated infotainment
    driving assistance features work well
    cons
    access to ’s supercharger charging station network is not free, as it is for model s and model x owners
    virtually all controls operated from within the touchscreen, unlike every other car
    the conventional trunk configuration, rather than a hatchback, limits cargo flexibility
    what's new?
    new and updated driving-assistance systems
    re-configuring packages for the most popular versions
    tax credit incentives cease
    new software adds  theater, smart summon, and more
    Category: CAFV

    If the text doesn't fit into any of the above categories, classify it as:
    Not CAFV
    <<<
    review:
    {data_point["review"]}
    >>>
    Category: {label_to_text[data_point["label"]]}
    """
    return full_prompt

#### Testing

In [17]:
## ith example
i=40
print("Parent comment: " + test_dataset[i]['review'])
print("Label: " + str(test_dataset[i]['label']) + "\n")

Parent comment: pros
all-new model to the  line-up
elegant luxury styling
plug-in hybrid and gasoline-powered options
full of innovative tech
cons
doesn’t offer a high-performance option
heavy vehicle for either powertrain
what's new?
 is the first 3-row midsize suv for 
“phone as a key” is a brand-new available tech feature
adaptive suspension with road preview capability
#11 in best midsize luxury suvs of 2020
the 2020   boasts gasoline and plug-in hybrid (phev) powertrain options, a luxuriously appointed interior, and seating capacity for up to seven.
available in rear-wheel drive (rwd) and all-wheel drive (awd), the  is available with comprehensive tech features such as ’s “phone as a key” technology, myriad standard safety features, an optional adaptive suspension that can read the road and make adjustments as you drive, and world-class styling, the  glides effortlessly into the midsize luxury suv market.
used 2020   pricingused 2020   pricing starts at $28,146 for the  sport util

In [18]:
eval_prompt = generate_eval_prompt(test_dataset[i])
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

Token indices sequence length is longer than the specified maximum sequence length for this model (5288 > 512). Running this sequence through the model will result in indexing errors


You are a CAFV (Clean Alternative Fuel Vehicle) Eligibility detection bot for Kelley Blue Book reviews. Your task is to assess the review and categorize it in context of the comment after <<< >>> into one of the following predefined categories:
    CAFV
    Not CAFV

    ####
    Here are some examples:
    review:
    pros
    outstanding off-road chops
    unmistakable styling
    excellent resale value
    cons
    unrefined on-road ride and handling
    what's new?
    design updates
    comfort improvements
    new trims
    Category: Not CAFV

    review:
    pros
    excellent electric driving range
    enjoyable to drive, powerful
    software updates/improvement occur continuously
    well-integrated infotainment
    driving assistance features work well
    cons
    access to ’s supercharger charging station network is not free, as it is for model s and model x owners
    virtually all controls operated from within the touchscreen, unlike every other car
    the conventional 

#### Tokenize the prompts

In [19]:
def generate_and_tokenize_train_prompt(data_point):
  return {'text':generate_train_prompt(data_point)}

def generate_and_tokenize_eval_prompt(data_point):
  return{'text':generate_eval_prompt(data_point)}

## EVALUATION LOOP - FEW SHOT PROMPT ENGINEERING ON BASE MODEL

In [20]:
n= len(test_dataset)
print(n)
basemodel_results_df= pd.DataFrame(columns=['model_raw_op'])
for i in tqdm(range(n)):
  eval_prompt = generate_eval_prompt(test_dataset[i])
  model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
  model.eval()
  with torch.no_grad():
    decoded_op=tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True)
    basemodel_results_df.loc[i]= [decoded_op]

50


100%|██████████| 50/50 [54:14<00:00, 65.09s/it]


In [21]:
basemodel_results_raw = test_dataset.to_pandas()
basemodel_results_raw['model_raw_op'] = basemodel_results_df['model_raw_op']

In [22]:
import os
raw_output_path = "/content/drive/MyDrive/electric-vehicle/output/basemodel/"
# Option 1: Using os.path.join()
csv_filename = 'raw_basemodel_results.csv'
full_csv_path = os.path.join(raw_output_path, csv_filename)

# Now save the DataFrame to the specified location
basemodel_results_raw.to_csv(full_csv_path, sep='\t', index=False)

#### Read basemodel_results_raw from folder location if not in session

In [23]:
basemodel_results_raw[10:20]

Unnamed: 0,Make-Model-Year,review,label,model_raw_op
10,PORSCHE-PANAMERA-2022,pros\nhigh thrills\nhigh tech\nhigh luxury\nhi...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
11,BMW-IX-2023,pros\n324-mile range\nfast charging\nsustainab...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
12,MITSUBISHI-OUTLANDER-2024,pros\ndecent cabin\nexcellent warranties\nall-...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
13,JEEP-WRANGLER-2024,pros\noutstanding off-road chops\nunmistakable...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
14,FORD-FUSION-2020,pros\na refined & attractive sedan\neuropean h...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
15,BMW-X3-2021,"pros\ngreat handling, suv practicality\nvariet...",0,You are a CAFV (Clean Alternative Fuel Vehicle...
16,TESLA-MODEL 3-2023,pros\nup to 333 miles of range\ningenious tech...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
17,LINCOLN-CORSAIR-2022,"pros\nlow starting price\nquiet, serene cabin\...",0,You are a CAFV (Clean Alternative Fuel Vehicle...
18,RIVIAN-R1T-2023,pros\nexcellent off-road abilities\ngreat on-r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
19,HYUNDAI-KONA ELECTRIC-2021,pros\nwinner of the kelley blue book subcompac...,1,You are a CAFV (Clean Alternative Fuel Vehicle...


In [24]:
test_ip = basemodel_results_raw['model_raw_op'][13]
print(test_ip)

You are a CAFV (Clean Alternative Fuel Vehicle) Eligibility detection bot for Kelley Blue Book reviews. Your task is to assess the review and categorize it in context of the comment after <<< >>> into one of the following predefined categories:
    CAFV
    Not CAFV

    ####
    Here are some examples:
    review:
    pros
    outstanding off-road chops
    unmistakable styling
    excellent resale value
    cons
    unrefined on-road ride and handling
    what's new?
    design updates
    comfort improvements
    new trims
    Category: Not CAFV

    review:
    pros
    excellent electric driving range
    enjoyable to drive, powerful
    software updates/improvement occur continuously
    well-integrated infotainment
    driving assistance features work well
    cons
    access to ’s supercharger charging station network is not free, as it is for model s and model x owners
    virtually all controls operated from within the touchscreen, unlike every other car
    the conventional 

### Preprocessing function to retrieve output category from LLM output

In [25]:
def find_first_word_basemodel(text):
    """
    Finds the first word in the input text after removing leading spaces and newlines.
    Returns the first word or None if no word is found.
    """
    # Search for the specified string
    start_index = text.find(">>>\n    Catgeory:")
    if start_index == -1:
        return None  # String not found

    # Extract the substring after the specified string
    remaining_text = text[start_index + len(">>>\n    Catgeory:"):]

    # Remove leading spaces and newlines
    cleaned_text = remaining_text.lstrip()
    #print('cleaned_text',cleaned_text)
    # Find the first word
    words = cleaned_text.split()
    #print('words',words)
    if words:
      if words[0]=='CAFV':
        return 'CAFV'
      elif words[0]=='Not':
        return 'Not CAFV'
      else:
        return 'None'
    else:
        return None  # No word found

In [26]:
# Example usage:
result = find_first_word_basemodel(test_ip)
print('The output is --->',result)  # Output: "CAFV"

The output is ---> None


In [27]:
basemodel_results_raw["predicted_category"] = basemodel_results_raw["model_raw_op"].apply(find_first_word_basemodel)

In [28]:
def text_to_binary(text):
  if text=='CAFV':
    return 1
  elif text=='Not CAFV':
    return 0
  else:
    return 2

In [29]:
basemodel_results_raw["predicted_category_bn"] =basemodel_results_raw["predicted_category"].apply(text_to_binary)

In [30]:
basemodel_results_raw

Unnamed: 0,Make-Model-Year,review,label,model_raw_op,predicted_category,predicted_category_bn
0,VOLVO-XC90-2020,pros\nexcellent safety features\nloads of tech...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
1,FORD-F-150-2023,pros\nour full-size truck best buy of 2023\nca...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
2,KIA-EV6-2023,pros\nfast-charging tech\nup to 310 miles of r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
3,TESLA-MODEL S-2023,pros\n405-mile range\nimpressive tech\nincredi...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
4,LINCOLN-AVIATOR-2023,pros\ncomfort is a priority\nspacious in the f...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
5,PORSCHE-CAYENNE-2021,pros\nevery trim offers impressive performance...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
6,VOLVO-XC60-2024,pros\nstrong safety ratings\nvibrant interior\...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
7,TOYOTA-PRIUS PRIME-2022,pros\nexcellent fuel economy\nhigh level of st...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
8,LAND ROVER-RANGE ROVER SPORT-2024,pros\nprimo off-roader\nopulent interior\nperf...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2
9,TOYOTA-RAV4 PRIME-2021,pros\n42 miles of pure electric range\neligibl...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,2


In [31]:
basemodel_results_raw["predicted_category_bn"].value_counts()

Unnamed: 0_level_0,count
predicted_category_bn,Unnamed: 1_level_1
2,50


### Evaluation metrics on base model results

In [32]:
import os
raw_output_path = "/content/drive/MyDrive/electric-vehicle/output/basemodel/"
# Option 1: Using os.path.join()
csv_filename = 'processed_basemodel_results.csv'
full_csv_path = os.path.join(raw_output_path, csv_filename)

# Now save the DataFrame to the specified location
basemodel_results_raw.to_csv(full_csv_path, sep='\t', index=False)

In [33]:
from sklearn.metrics import classification_report

# Example ground truth and predicted labels
y_true = basemodel_results_raw['label']
y_pred = basemodel_results_raw['predicted_category_bn']

# Define target names (optional)
#target_names = ['class 0', 'class 1', 'class 2']

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      17.0
           1       0.00      0.00      0.00      33.0
           2       0.00      0.00      0.00       0.0

    accuracy                           0.00      50.0
   macro avg       0.00      0.00      0.00      50.0
weighted avg       0.00      0.00      0.00      50.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## FINETUNE SECTION

In [34]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_train_prompt)
tokenized_train_dataset=tokenized_train_dataset.remove_columns(['label','review'])

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [35]:
output_dir="/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/"
# based on config
training_args = transformers.TrainingArguments(
    fp16=False, # specify bf16=True instead when training on GPUs that support bf16
    do_eval=False,
    bf16=False,
    optim="paged_adamw_8bit",
    #evaluation_strategy="epoch",
    gradient_accumulation_steps=8,
    #gradient_checkpointing=True,
    #gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    weight_decay=0.001,
    logging_steps=10,
    logging_strategy="steps",
    lr_scheduler_type="constant",
    # max_steps=1000000,
    num_train_epochs=1,
    # num_train_epochs=4,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    # push_to_hub=True,
    # hub_model_id="zephyr-7b-sft-lora",
    # hub_strategy="every_save",
    # report_to="tensorboard",
    save_strategy="steps",
    save_steps=1000,
    seed=42,
    warmup_ratio=0.3
)

# based on config
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)


trainer = SFTTrainer(
        model=model,
        #model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        #eval_dataset=eval_dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=False,
        peft_config=config,
        max_seq_length=512
    )
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 202
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 25
  Number of trainable parameters = 21,260,288


Step,Training Loss
10,2.0668
20,1.5139


Saving model checkpoint to /content/drive/MyDrive/electric-vehicle/output/finetunedmodel/checkpoint-25
tokenizer config file saved in /content/drive/MyDrive/electric-vehicle/output/finetunedmodel/checkpoint-25/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/electric-vehicle/output/finetunedmodel/checkpoint-25/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=25, training_loss=1.632995777130127, metrics={'train_runtime': 1627.7849, 'train_samples_per_second': 0.124, 'train_steps_per_second': 0.015, 'total_flos': 4381851883929600.0, 'train_loss': 1.632995777130127, 'epoch': 0.9900990099009901})

Can try 2nd epoch

In [56]:
## epoch1_model - Pretrained LLM you want to eventually finetune
epoch1_model_name = '/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/checkpoint-25'

## Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
## Load the pretrained model after quantization
epoch1_model = AutoModelForCausalLM.from_pretrained(epoch1_model_name, quantization_config=bnb_config)

trainer = SFTTrainer(
        model=epoch1_model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=False,
        peft_config=config,
        max_seq_length=512
    )
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32000
}

The device_map was not initialized. Setting device_map to {'':torch.cuda.current_device()}. If you want to use the model for infere

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
PyTorch: setting up devices
PyTorch: setting up devices


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 202
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 25
  Number of trainable parameters = 21,260,288


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 1.06 MiB is free. Process 2609 has 14.69 GiB memory in use. Of the allocated memory 14.36 GiB is allocated by PyTorch, and 195.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# import gc
# import torch
# gc.collect()
# torch.cuda.empty_cache()

## LOAD FINETUNED MODEL CHECKPOINTS

#### Checkpoint after epoch 1

In [36]:
## Model after epoch 1
# project = "run_1"
# base_model_name = "mistral"
# run_name = base_model_name + "-" + project
checkpoint = 'checkpoint-25'
output_dir = "/content/drive/MyDrive/electric-vehicle/output/finetunedmodel"

peft_model_path_1 = output_dir+'/'+checkpoint
print(peft_model_path_1)
ft_model_ep1 = AutoPeftModelForCausalLM.from_pretrained(peft_model_path_1,quantization_config=bnb_config)

/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/checkpoint-25


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32000
}

The device_map was not initialized. Setting device_map to {'':torch.cuda.current_device()}. If you want to use the model for infere

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

loading file tokenizer.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32000. This might induce some performance reduction as *Tensor Cores

In [37]:
# eval_prompt = generate_eval_prompt(test_dataset[i])
# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
# ft_model_ep1.eval()
# with torch.no_grad():
#     print(tokenizer.decode(ft_model_ep1.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

## EVALUATION LOOP

In [38]:
from tqdm import tqdm

### Write the evaluation result in a new column of test dataset along with existing columns - label, comment and parent_comment

In [39]:
n= len(test_dataset)
print(n)
finetuned_results_df= pd.DataFrame(columns=['model_raw_op'])
for i in tqdm(range(n)):
  eval_prompt = generate_eval_prompt(test_dataset[i])
  model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
  ft_model_ep1.eval()
  with torch.no_grad():
    decoded_op=tokenizer.decode(ft_model_ep1.generate(**model_input, max_new_tokens=128, pad_token_id=2)[0], skip_special_tokens=True)
    finetuned_results_df.loc[i]= [decoded_op]



50


100%|██████████| 50/50 [45:27<00:00, 54.55s/it]


In [40]:
finetuned_results_raw = test_dataset.to_pandas()
finetuned_results_raw['model_raw_op'] = finetuned_results_df['model_raw_op']


In [42]:
import os
raw_output_path = "/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/"
# Option 1: Using os.path.join()
csv_filename = 'raw_finetuned_results.csv'
full_csv_path = os.path.join(raw_output_path, csv_filename)

# Now save the DataFrame to the specified location
finetuned_results_raw.to_csv(full_csv_path, sep='\t', index=False)


In [43]:
finetuned_results_raw

Unnamed: 0,Make-Model-Year,review,label,model_raw_op
0,VOLVO-XC90-2020,pros\nexcellent safety features\nloads of tech...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
1,FORD-F-150-2023,pros\nour full-size truck best buy of 2023\nca...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
2,KIA-EV6-2023,pros\nfast-charging tech\nup to 310 miles of r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
3,TESLA-MODEL S-2023,pros\n405-mile range\nimpressive tech\nincredi...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
4,LINCOLN-AVIATOR-2023,pros\ncomfort is a priority\nspacious in the f...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
5,PORSCHE-CAYENNE-2021,pros\nevery trim offers impressive performance...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
6,VOLVO-XC60-2024,pros\nstrong safety ratings\nvibrant interior\...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
7,TOYOTA-PRIUS PRIME-2022,pros\nexcellent fuel economy\nhigh level of st...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
8,LAND ROVER-RANGE ROVER SPORT-2024,pros\nprimo off-roader\nopulent interior\nperf...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
9,TOYOTA-RAV4 PRIME-2021,pros\n42 miles of pure electric range\neligibl...,1,You are a CAFV (Clean Alternative Fuel Vehicle...


## Load raw outputs from finetuned model and fetch the catgeory
### - Can be done independently after finetuning and above code need not to be run

In [45]:
import os
raw_output_path = "/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/"
# Option 1: Using os.path.join()
csv_filename = 'raw_finetuned_results.csv'
full_csv_path = os.path.join(raw_output_path, csv_filename)

finetuned_results_raw_ip = pd.read_csv(full_csv_path,sep='\t')
finetuned_results_raw_ip

Unnamed: 0,Make-Model-Year,review,label,model_raw_op
0,VOLVO-XC90-2020,pros\nexcellent safety features\nloads of tech...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
1,FORD-F-150-2023,pros\nour full-size truck best buy of 2023\nca...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
2,KIA-EV6-2023,pros\nfast-charging tech\nup to 310 miles of r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
3,TESLA-MODEL S-2023,pros\n405-mile range\nimpressive tech\nincredi...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
4,LINCOLN-AVIATOR-2023,pros\ncomfort is a priority\nspacious in the f...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
5,PORSCHE-CAYENNE-2021,pros\nevery trim offers impressive performance...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
6,VOLVO-XC60-2024,pros\nstrong safety ratings\nvibrant interior\...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
7,TOYOTA-PRIUS PRIME-2022,pros\nexcellent fuel economy\nhigh level of st...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
8,LAND ROVER-RANGE ROVER SPORT-2024,pros\nprimo off-roader\nopulent interior\nperf...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
9,TOYOTA-RAV4 PRIME-2021,pros\n42 miles of pure electric range\neligibl...,1,You are a CAFV (Clean Alternative Fuel Vehicle...


In [46]:
finetuned_results_raw_ip[10:20]

Unnamed: 0,Make-Model-Year,review,label,model_raw_op
10,PORSCHE-PANAMERA-2022,pros\nhigh thrills\nhigh tech\nhigh luxury\nhi...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
11,BMW-IX-2023,pros\n324-mile range\nfast charging\nsustainab...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
12,MITSUBISHI-OUTLANDER-2024,pros\ndecent cabin\nexcellent warranties\nall-...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
13,JEEP-WRANGLER-2024,pros\noutstanding off-road chops\nunmistakable...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
14,FORD-FUSION-2020,pros\na refined & attractive sedan\neuropean h...,0,You are a CAFV (Clean Alternative Fuel Vehicle...
15,BMW-X3-2021,"pros\ngreat handling, suv practicality\nvariet...",0,You are a CAFV (Clean Alternative Fuel Vehicle...
16,TESLA-MODEL 3-2023,pros\nup to 333 miles of range\ningenious tech...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
17,LINCOLN-CORSAIR-2022,"pros\nlow starting price\nquiet, serene cabin\...",0,You are a CAFV (Clean Alternative Fuel Vehicle...
18,RIVIAN-R1T-2023,pros\nexcellent off-road abilities\ngreat on-r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...
19,HYUNDAI-KONA ELECTRIC-2021,pros\nwinner of the kelley blue book subcompac...,1,You are a CAFV (Clean Alternative Fuel Vehicle...


In [47]:
test_ip = finetuned_results_raw_ip['model_raw_op'][11]
print(test_ip)

You are a CAFV (Clean Alternative Fuel Vehicle) Eligibility detection bot for Kelley Blue Book reviews. Your task is to assess the review and categorize it in context of the comment after <<< >>> into one of the following predefined categories:
    CAFV
    Not CAFV

    ####
    Here are some examples:
    review:
    pros
    outstanding off-road chops
    unmistakable styling
    excellent resale value
    cons
    unrefined on-road ride and handling
    what's new?
    design updates
    comfort improvements
    new trims
    Category: Not CAFV

    review:
    pros
    excellent electric driving range
    enjoyable to drive, powerful
    software updates/improvement occur continuously
    well-integrated infotainment
    driving assistance features work well
    cons
    access to ’s supercharger charging station network is not free, as it is for model s and model x owners
    virtually all controls operated from within the touchscreen, unlike every other car
    the conventional 

In [48]:
def find_first_word(text):
    """
    Finds the first word in the input text after removing leading spaces and newlines.
    Returns the first word or None if no word is found.
    """
    # Search for the specified string
    start_index = text.find(">>>\n    Catgeory:")
    if start_index == -1:
        return None  # String not found

    # Extract the substring after the specified string
    remaining_text = text[start_index + len(">>>\n    Catgeory:"):]

    # Remove leading spaces and newlines
    cleaned_text = remaining_text.lstrip()
    #print('cleaned_text',cleaned_text)
    # Find the first word
    words = cleaned_text.split()
    #print('words',words)
    if words:
      if words[0]=='Not':
        return 'Not CAFV'
      else:
        return words[0]
    else:
        return None  # No word found

In [49]:
# Example usage:
result = find_first_word(test_ip)
print('The output is --->',result)  # Output: "Sarcasm"

The output is ---> None


In [50]:
finetuned_results_raw_ip["predicted_category"] = finetuned_results_raw_ip["model_raw_op"].apply(find_first_word)

In [51]:
def text_to_binary(text):
  if text=='Sarcasm':
    return 1
  else:
    return 0

In [52]:
finetuned_results_raw_ip["predicted_category_bn"] =finetuned_results_raw_ip["predicted_category"].apply(text_to_binary)

In [53]:
finetuned_results_raw_ip

Unnamed: 0,Make-Model-Year,review,label,model_raw_op,predicted_category,predicted_category_bn
0,VOLVO-XC90-2020,pros\nexcellent safety features\nloads of tech...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
1,FORD-F-150-2023,pros\nour full-size truck best buy of 2023\nca...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
2,KIA-EV6-2023,pros\nfast-charging tech\nup to 310 miles of r...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
3,TESLA-MODEL S-2023,pros\n405-mile range\nimpressive tech\nincredi...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
4,LINCOLN-AVIATOR-2023,pros\ncomfort is a priority\nspacious in the f...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
5,PORSCHE-CAYENNE-2021,pros\nevery trim offers impressive performance...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
6,VOLVO-XC60-2024,pros\nstrong safety ratings\nvibrant interior\...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
7,TOYOTA-PRIUS PRIME-2022,pros\nexcellent fuel economy\nhigh level of st...,0,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
8,LAND ROVER-RANGE ROVER SPORT-2024,pros\nprimo off-roader\nopulent interior\nperf...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0
9,TOYOTA-RAV4 PRIME-2021,pros\n42 miles of pure electric range\neligibl...,1,You are a CAFV (Clean Alternative Fuel Vehicle...,,0


In [54]:
import os
raw_output_path = "/content/drive/MyDrive/electric-vehicle/output/finetunedmodel/"
# Option 1: Using os.path.join()
csv_filename = 'processed_finetuned_results.csv'
full_csv_path = os.path.join(raw_output_path, csv_filename)

# Now save the DataFrame to the specified location
finetuned_results_raw_ip.to_csv(full_csv_path, sep='\t', index=False)

In [55]:
from sklearn.metrics import classification_report

# Example ground truth and predicted labels
y_true = finetuned_results_raw_ip['label']
y_pred = finetuned_results_raw_ip['predicted_category_bn']

# Define target names (optional)
#target_names = ['class 0', 'class 1', 'class 2']

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)


              precision    recall  f1-score   support

           0       0.34      1.00      0.51        17
           1       0.00      0.00      0.00        33

    accuracy                           0.34        50
   macro avg       0.17      0.50      0.25        50
weighted avg       0.12      0.34      0.17        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
