<a href="https://colab.research.google.com/github/kairos1024/professional-projects/blob/main/Phi_3_Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torch torchvision torchaudio
!pip install huggingface-hub
!pip install evaluate accelerate
!pip install einops datasets bitsandbytes accelerate peft flash_attn
!pip install tqdm


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BatchEncoding, BitsAndBytesConfig, get_linear_schedule_with_warmup
from datasets import load_dataset
from huggingface_hub import notebook_login, HfApi
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import logging
import numpy as np
import pickle
import os
from tqdm import tqdm
import time
from accelerate import Accelerator


In [None]:

# Mount Google Drive
# (Colab-only: `google.colab` is imported here, so this cell cannot run outside Colab.)
from google.colab import drive
drive.mount('/content/drive')

# Log in to Hugging Face
# Configure git's credential helper first, then run the interactive CLI login,
# which prompts for an access token.
!git config --global credential.helper store
!huggingface-cli login

# Set up logging
# INFO-level root logging plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reference timestamp; print_runtime() reports elapsed time relative to it.
start_time = time.time()

def print_runtime():
    """Print the time elapsed since the module-level ``start_time`` as HH:MM:SS."""
    elapsed = time.time() - start_time
    whole_hours, remainder = divmod(elapsed, 3600)
    whole_minutes, leftover = divmod(remainder, 60)
    # Truncate each component to an integer before zero-padding it to two digits.
    hh, mm, ss = (int(part) for part in (whole_hours, whole_minutes, leftover))
    print(f"Session runtime: {hh:02}:{mm:02}:{ss:02}")

print("Starting the training script...")
print_runtime()

# Quantisation settings: 4-bit NF4 weights, double quantisation, float16 compute dtype.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=True,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

print("Loading the Phi-3 model and tokenizer...")
print_runtime()
# Model and tokenizer come from the same Hub checkpoint.
base_checkpoint = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Turn on gradient checkpointing to reduce memory use during training.
model.gradient_checkpointing_enable()
print("Enabled gradient checkpointing.")
print_runtime()

# Hand the quantised model to PEFT's k-bit preparation helper, again requesting
# gradient checkpointing.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)
print("Prepared the model for k-bit training.")
print_runtime()

# Define the target modules for LORA.
# The attention (o_proj, qkv_proj) and MLP (gate_up_proj, down_proj) projections are
# selected by module-name suffix, so every decoder layer is covered without
# hard-coding how many layers the checkpoint has.
target_modules = ["o_proj", "qkv_proj", "gate_up_proj", "down_proj"]

# Apply LORA
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

lora_model = get_peft_model(model, config)
print("Applied LORA to the model.")
print_runtime()

# Fetch the train split of the "dialogues" configuration of the Schema-Guided Dialogue dataset.
print("Loading the Schema-Guided Dialogue dataset...")
print_runtime()
dataset = load_dataset(
    path="schema_guided_dstc8",
    name="dialogues",
    split='train',
)

# Define a function to preprocess the examples
def preprocess(example, max_turns=10, max_length=256):
    """Flatten one dialogue into a single tokenized training example.

    The first ``max_turns`` turns are rendered as ``"<Speaker>: <utterance>"`` strings,
    each optionally followed by the first state (intent and slot values) and the first
    action entry found in the turn's frame. The turns are joined with spaces and
    tokenized with the module-level ``tokenizer``, padded/truncated to ``max_length``.

    Args:
        example: A dataset record with ``example['turns']`` holding parallel
            ``'speaker'``, ``'utterance'`` and ``'frames'`` sequences.
            NOTE(review): each frame is presumed to expose ``'state'`` / ``'actions'``
            lists of dicts, as the lookups below require — confirm against the dataset.
        max_turns: Maximum number of leading turns to keep.
        max_length: Fixed token length of the encoded example.

    Returns:
        A dict of ``torch.Tensor`` values with the tokenizer's outputs plus ``"labels"``.
        Labels equal ``input_ids`` on real tokens and ``-100`` on padding, so padded
        positions are excluded from the language-modeling loss.
    """
    # Label value ignored by the cross-entropy loss used for causal LM training.
    ignore_index = -100

    turns = example['turns']
    speakers = turns['speaker'][:max_turns]
    utterances = turns['utterance'][:max_turns]
    frames = turns['frames'][:max_turns]

    dialogue = []
    for speaker_id, utterance, frame in zip(speakers, utterances, frames):
        speaker = "User" if speaker_id == 0 else "System"

        # Only the first state entry of the frame is rendered.
        state_info = ""
        if 'state' in frame and len(frame['state']) > 0:
            state = frame['state'][0]
            active_intent = state.get('active_intent', 'None')
            slots = state.get('slot_values', {})
            state_info = f" [intent: {active_intent}] [slots: {slots}]"

        # Only the first actions entry of the frame is rendered.
        action_info = ""
        if 'actions' in frame and len(frame['actions']) > 0:
            actions = frame['actions'][0]
            act = actions.get('act', [])
            slot = actions.get('slot', [])
            values = actions.get('values', [])
            action_info = f" [actions: {act}] [slots: {slot}] [values: {values}]"

        dialogue.append(f"{speaker}: {utterance}{state_info}{action_info}")

    tokenized_dialogue = tokenizer(
        ' '.join(dialogue),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True,  # needed below to tell real tokens from padding
    )

    # Labels mirror input_ids for causal language modeling, except on padding
    # positions (attention_mask == 0), which must not contribute to the loss.
    tokenized_dialogue["labels"] = [
        token_id if is_real_token else ignore_index
        for token_id, is_real_token in zip(
            tokenized_dialogue["input_ids"], tokenized_dialogue["attention_mask"]
        )
    ]

    return {key: torch.tensor(val) for key, val in tokenized_dialogue.items()}

# Run preprocess() over every dialogue, with a progress bar, and collect the results.
print("Preprocessing the dataset...")
preprocessed_dataset = list(map(preprocess, tqdm(dataset)))
n_preprocessed = len(preprocessed_dataset)
print(f"Dataset size after preprocessing: {n_preprocessed}")
print_runtime()

# Convert the preprocessed data into a Dataset object
class PreprocessedDataset(Dataset):
    """Dataset wrapper that serves already-preprocessed examples from an indexable container."""

    def __init__(self, data):
        """Keep a reference to ``data``; nothing is copied or transformed."""
        self.data = data

    def __len__(self):
        """Return the number of stored examples."""
        n_examples = len(self.data)
        return n_examples

    def __getitem__(self, idx):
        """Return the stored example at position ``idx``."""
        example = self.data[idx]
        return example

tokenized_dataset = PreprocessedDataset(preprocessed_dataset)

# Split the dataset into training (80%) and validation (remaining 20%) sets.
# A dedicated, seeded generator makes the split identical on every run.
SPLIT_SEED = 42
train_size = int(0.8 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size
train_dataset, val_dataset = random_split(
    tokenized_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(f"Training set size: {train_size}")
print(f"Validation set size: {val_size}")
print_runtime()

# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has the vocabulary on its
            last axis and the sequence on the axis before it; ``labels`` has the
            sequence on its last axis. If ``logits`` is a tuple, its first element
            is used.

    Returns:
        ``{"accuracy": float}`` — the fraction of scored positions where the argmax
        prediction equals the label. Returns ``0.0`` when no position is scored.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    predictions = np.argmax(logits, axis=-1)

    # The logits at position t predict the token at position t + 1, so drop the last
    # prediction and the first label to line the two up.
    shifted_predictions = predictions[..., :-1]
    shifted_labels = labels[..., 1:]

    # Positions labelled -100 are excluded from the loss and must not be scored either.
    scored = shifted_labels != -100
    if not scored.any():
        return {"accuracy": 0.0}

    accuracy = float((shifted_predictions[scored] == shifted_labels[scored]).mean())
    return {"accuracy": accuracy}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    fp16=True,
    save_strategy="epoch",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Reduce batch size
    per_device_eval_batch_size=2,   # Match evaluation batch size
    num_train_epochs=2,
    weight_decay=0.01,
    gradient_accumulation_steps=8,  # Increase gradient accumulation steps
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50,
)

# Initialize Accelerator
accelerator = Accelerator()

# Build the dataloaders from the batch sizes configured in training_args.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=training_args.per_device_train_batch_size,
    shuffle=True,
    num_workers=4,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=training_args.per_device_eval_batch_size,
    shuffle=False,
    num_workers=4,
)

# Optimise only the parameters that require gradients, and take both the learning
# rate and the weight decay from training_args so the configured values are the
# ones actually used.
trainable_parameters = [p for p in lora_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_parameters,
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Let Accelerator wrap the model, optimizer and dataloaders.
lora_model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    lora_model, optimizer, train_dataloader, val_dataloader
)

# Training loop with Accelerator.
# Gradients are accumulated over `gradient_accumulation_steps` micro-batches before
# each optimizer step; every micro-batch loss is divided by that count so the
# accumulated gradient is an average rather than a sum.
accumulation_steps = training_args.gradient_accumulation_steps
num_train_batches = len(train_dataloader)

for epoch in range(training_args.num_train_epochs):
    print(f"Epoch {epoch+1}/{training_args.num_train_epochs}")
    lora_model.train()
    optimizer.zero_grad()  # start every epoch without leftover gradients
    progress_bar = tqdm(enumerate(train_dataloader), total=num_train_batches, desc="Training", leave=True)
    for step, batch in progress_bar:
        batch = {k: v.to(accelerator.device) for k, v in batch.items()}
        outputs = lora_model(**batch)
        loss = outputs.loss
        accelerator.backward(loss / accumulation_steps)

        # Step once a full group of micro-batches has been accumulated, and also on
        # the final batch so a trailing partial group is not discarded.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Update progress bar with the unscaled loss of the current micro-batch
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Validation loop: average the per-batch loss and report it.
    lora_model.eval()
    val_loss_total = 0.0
    val_batch_count = 0
    val_progress_bar = tqdm(val_dataloader, desc="Validation", leave=True)
    for batch in val_progress_bar:
        batch = {k: v.to(accelerator.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = lora_model(**batch)
        val_loss_total += outputs.loss.item()
        val_batch_count += 1

    if val_batch_count > 0:
        print(f"Validation loss: {val_loss_total / val_batch_count:.4f}")

print("Training completed.")
print_runtime()

# Upload the final model to Hugging Face.
# `repo_name` is a placeholder: set it to the target repository before running.
repo_name = "your-repo-name"
api = HfApi()
# create_repo returns the repository URL object; its repo_id is the fully
# qualified "<namespace>/<name>" identifier to upload to.
repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

# Push final model and tokenizer files to Hugging Face Model Hub
print(f"Pushing the final model and tokenizer to the Hugging Face Hub under the repository {repo_name}...")

# Save the model and tokenizer files into the same local directory so that both
# end up in the upload.
local_dir = "model-name"
accelerator.unwrap_model(lora_model).save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

# Upload through the Hub API instead of `git push`: it authenticates with the token
# stored by the earlier login and raises on failure, so the success message below is
# only printed when the upload actually went through.
api.upload_folder(
    folder_path=local_dir,
    repo_id=repo_id,
    commit_message="Add model and tokenizer files",
)

print(f"Model and tokenizer saved to {repo_name}")

# Report GPU memory figures for device 0 when CUDA is available.
if torch.cuda.is_available():
    gpu_memory_report = (
        ("Total GPU memory", torch.cuda.get_device_properties(0).total_memory),
        ("Current GPU memory allocated", torch.cuda.memory_allocated()),
        ("Current GPU memory cached", torch.cuda.memory_reserved()),
    )
    for label, amount in gpu_memory_report:
        print(f"{label}: {amount}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defin

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Enabled gradient checkpointing.
Session runtime: 00:00:36
Prepared the model for k-bit training.
Session runtime: 00:00:36
Applied LORA to the model.
Session runtime: 00:00:36
Loading the Schema-Guided Dialogue dataset...
Session runtime: 00:00:36


Downloading data:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16142 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2482 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4201 [00:00<?, ? examples/s]

Preprocessing the dataset...


100%|██████████| 16142/16142 [01:24<00:00, 191.81it/s]


Dataset size after preprocessing: 16142
Session runtime: 00:02:09
Training set size: 12913
Validation set size: 3229
Session runtime: 00:02:09
Epoch 1/2


  self.pid = os.fork()


Epoch 2/2




Training completed.
Session runtime: 06:30:19
Pushing the final model and tokenizer to the Hugging Face Hub under the repository kairos1024/phi-3-oasis...
Cloning into 'phi-3-oasis'...
remote: Enumerating objects: 3, done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (3/3), 1.06 KiB | 1.06 MiB/s, done.




[main 64076aa] Add model and tokenizer files
 8 files changed, 94040 insertions(+)
 create mode 100644 README.md
 create mode 100644 adapter_config.json
 create mode 100644 adapter_model.safetensors
 create mode 100644 added_tokens.json
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer.model
 create mode 100644 tokenizer_config.json
fatal: could not read Username for 'https://huggingface.co': No such device or address
Model and tokenizer saved to kairos1024/phi-3-oasis
Total GPU memory: 15835660288
Current GPU memory allocated: 3420810240
Current GPU memory cached: 5830082560
