### Overview
Using pretrained Vision Language Model to finetune zero-shot classifier for low-latency activity data on small model. 
Likely eventually replaced or quantized for mobile device / low-cost compute.

In [1]:
# !pip install -q transformers peft datasets accelerate bitsandbytes


In [4]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import pandas as pd
from datasets import Dataset
import sys
from pathlib import Path
# Add parent directory to path
parent_dir = Path().resolve().parent
sys.path.append(str(parent_dir))

# Replace this with your SQL-to-Pandas query
# df = pd.read_sql("SELECT * FROM training_data", con=your_sql_connection)
df = pd.read_csv('..\PPG_ACC_processed_data\data.csv')
# Process data into Hugging Face Dataset format
def process_data(row):
    return {
        "input": {
            "ACCi": row["ACCi"],
            "ACCj": row["ACCj"],
            "ACCk": row["ACCk"],
            "HeartRate": row["HeartRate"],
            "SubjectID": row["SubjectID"],
            "Age": row["Age"],
            "Gender": row["Gender"],
            "Height": row["Height"],
            "Weight": row["Weight"],
            "SkinType": row["SkinType"],
            "SportLevel": row["SportLevel"],
        },
        "label": row["Activity"],
    }

dataset = Dataset.from_pandas(df)
dataset = dataset.map(process_data)

# Split dataset into training and testing
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


Map:   0%|          | 0/1674112 [00:00<?, ? examples/s]

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForVision2Seq
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model_id = "Qwen/Qwen2-VL-2B"

bnb_config = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}

model = AutoModelForVision2Seq.from_pretrained(model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add LoRA configurations
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA
    lora_dropout=0.1,
    task_type="SEQ_CLASSIFICATION"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

ValueError: Invalid task type: 'SEQ_CLASSIFICATION'. Must be one of the following task types: SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION.

In [None]:
from transformers import DataCollatorForLanguageModeling

def tokenize_function(example):
    return tokenizer(
        example["input"], truncation=True, padding="max_length", max_length=512
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


### Model Training & Finetuning

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    fp16=True,
    push_to_hub=True,  # Enable if you want to push the model to Hugging Face Hub
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Save the fine-tuned model and tokenizer locally
model.save_pretrained("./finetuned_qwen2")
tokenizer.save_pretrained("./finetuned_qwen2")

print("Model and tokenizer saved locally at './finetuned_qwen2'")


In [None]:
from datetime import datetime

# Push the fine-tuned model to Hugging Face Hub
model_name = f"finetuned-qwen2-vl-2b-{datetime.now().strftime('%d%H%M')}"
trainer.model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

print(f"Model pushed to Hugging Face Hub under the name '{model_name}'")


### Citations
@article{Qwen2-VL,
  title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution}, 
  author={Peng Wang and Shuai Bai and Sinan Tan and Shijie Wang and Zhihao Fan and Jinze Bai and Keqin Chen and Xuejing Liu and Jialin Wang and Wenbin Ge and Yang Fan and Kai Dang and Mengfei Du and Xuancheng Ren and Rui Men and Dayiheng Liu and Chang Zhou and Jingren Zhou and Junyang Lin},
  journal={arXiv preprint arXiv:2409.12191},
  year={2024}
}

@article{Qwen-VL,
  title={Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond},
  author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  journal={arXiv preprint arXiv:2308.12966},
  year={2023}
}

https://medium.com/the-ai-forum/instruction-fine-tuning-gemma-2b-on-medical-reasoning-and-convert-the-finetuned-model-into-gguf-844191f8d329
