<a href="https://colab.research.google.com/github/kushagra8881/multimodelllm/blob/main/work_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers datasets peft trl openai-clip bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvi

In [None]:
import torch
import os
import json
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
# if you using google collab use it

from google.colab import drive
import clip
from PIL import Image
import pandas as pd

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
base_model = "NousResearch/Llama-2-7b-chat-hf"
new_model_path = "/content/drive/MyDrive/llama-2-7b-chat-updated"

In [None]:
with open('/content/drive/MyDrive/fine/data.json', 'r') as f:
    data = json.load(f)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_clip, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
def image_to_label(image_path, labels):
    # Preprocess the image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)

    # Encode the image
    with torch.no_grad():
        image_features = model_clip.encode_image(image)

    # Tokenize and encode the labels
    text_inputs = clip.tokenize(labels).to(device)
    with torch.no_grad():
        text_features = model_clip.encode_text(text_inputs)

    # Calculate the similarity between image features and text features
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # Get the label with the highest similarity
    best_match_idx = similarity.argmax(dim=-1).item()
    best_label = labels[best_match_idx]

    return best_label

In [None]:
data

[{'id': '1',
  'image': '1.jpeg',
  'conversations': [{'from': 'human',
    'value': 'What do you see happening in this image?\n<image>'},
   {'from': 'gpt',
    'value': 'The scene depicts a lively plaza area with several people walking and enjoying their time. A man is standing in the plaza with his legs crossed, holding a kite in his hand. The kite has multiple sections attached to it, spread out in various directions as if ready for flight.\n\nNumerous people are scattered throughout the plaza, walking and interacting with others. Some of these individuals are carrying handbags, and others have backpacks. The image captures the casual, social atmosphere of a bustling plaza on a nice day.'}]},
 {'id': '2',
  'image': '2.jpeg',
  'conversations': [{'from': 'human',
    'value': 'What do you think is going on in this snapshot?\n<image>'},
   {'from': 'gpt',
    'value': "The image displays a tidy home office desk setting with a laptop placed in the center. On the desk, there is a cord

In [None]:
labels=["a dog palying in a park" , "a dog dancing in the park"]

In [None]:
image_directory = '/content/drive/MyDrive/fine/'

# Process the data
processed_data = []
for item in data:
    image_path = image_directory + item['image']
    label = image_to_label(image_path, labels=labels)
    for conversation in item['conversations']:
        if conversation['from'] == 'human':
            prompt = conversation['value'].replace("<image>","an image of "+ label)
        else:
            response = conversation['value']
            processed_data.append({"prompt": prompt, "response": response})

# Convert to Hugging Face dataset
df = pd.DataFrame(processed_data)
dataset = Dataset.from_pandas(df)

# Print the processed data for verification
print(processed_data)

[{'prompt': 'What do you see happening in this image?\nan image of a dog palying in a park', 'response': 'The scene depicts a lively plaza area with several people walking and enjoying their time. A man is standing in the plaza with his legs crossed, holding a kite in his hand. The kite has multiple sections attached to it, spread out in various directions as if ready for flight.\n\nNumerous people are scattered throughout the plaza, walking and interacting with others. Some of these individuals are carrying handbags, and others have backpacks. The image captures the casual, social atmosphere of a bustling plaza on a nice day.'}, {'prompt': 'What do you think is going on in this snapshot?\nan image of a dog palying in a park', 'response': "The image displays a tidy home office desk setting with a laptop placed in the center. On the desk, there is a cordless keyboard in front of the laptop and a wireless mouse situated slightly to the right of it. A cell phone can be seen on the left si

In [None]:
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:
dataset_text_field="prompt",
dataset_kwargs={"input_column": "prompt"},


In [None]:
def format_example(example):
    # Tokenize the prompt and response
    inputs = tokenizer(example["prompt"], truncation=True, padding="max_length")
    targets = tokenizer(example["response"], truncation=True, padding="max_length")

    # Prepare the inputs for the SFTTrainer
    input_ids = inputs["input_ids"] + targets["input_ids"]
    attention_mask = inputs["attention_mask"] + targets["attention_mask"]
    labels = [-100] * len(inputs["input_ids"]) + targets["input_ids"]

    # Return the formatted data as a dictionary
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Manually apply the function to each example in the list
tokenized_data = [format_example(example) for example in processed_data]


In [None]:
tokenized_data

[{'input_ids': [1,
   1724,
   437,
   366,
   1074,
   10464,
   297,
   445,
   1967,
   29973,
   13,
   273,
   1967,
   310,
   263,
   11203,
   5112,
   5414,
   297,
   263,
   14089,
   1,
   450,
   9088,
   1401,
   919,
   29879,
   263,
   301,
   3598,
   2174,
   1362,
   4038,
   411,
   3196,
   2305,
   22049,
   322,
   11418,
   5414,
   1009,
   931,
   29889,
   319,
   767,
   338,
   13407,
   297,
   278,
   2174,
   1362,
   411,
   670,
   21152,
   21692,
   29892,
   13587,
   263,
   413,
   568,
   297,
   670,
   1361,
   29889,
   450,
   413,
   568,
   756,
   2999,
   13926,
   10959,
   304,
   372,
   29892,
   9677,
   714,
   297,
   5164,
   18112,
   408,
   565,
   7960,
   363,
   16286,
   29889,
   13,
   13,
   29940,
   4680,
   681,
   2305,
   526,
   29574,
   10106,
   278,
   2174,
   1362,
   29892,
   22049,
   322,
   16254,
   292,
   411,
   4045,
   29889,
   3834,
   310,
   1438,
   15724,
   526,
   19436,
   1361,
   29890,

In [None]:
def format_tokenized_example(example):
    # Tokenize the prompt and response
    input_ids = example["input_ids"]  # Assuming this is already tokenized.

    # Create an attention mask where 1 indicates a valid token and 0 indicates padding
    attention_mask = [1] * len(input_ids)  # For now, assume no padding, can adjust later

    # Labels should be the same as the input_ids except that padding tokens should be replaced with -100
    labels = input_ids.copy()

    # Here, we set the labels for the padding tokens to -100
    labels = [-100 if id == 0 else id for id in labels]  # Assuming 0 is the padding token

    # Return the formatted example
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Apply the function to your tokenized data
formatted_data = [format_tokenized_example(example) for example in tokenized_data]

# Now, formatted_data is ready for training


In [None]:
from datasets import Dataset

# Convert the list of formatted examples to a Hugging Face Dataset
formatted_dataset = Dataset.from_list(formatted_data)

# Now you can use this dataset in your trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    args=training_params,
    peft_config=peft_params,
)


Truncating train dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=20, training_loss=1.9181730270385742, metrics={'train_runtime': 70.3849, 'train_samples_per_second': 0.71, 'train_steps_per_second': 0.284, 'total_flos': 313782432768000.0, 'train_loss': 1.9181730270385742})

In [None]:
trainer.model.save_pretrained(new_model_path)
trainer.tokenizer.save_pretrained(new_model_path)

In [None]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))