In [3]:
# Install and import the necessary libraries
!pip install -r /content/requirements.txt

Collecting bitsandbytes (from -r /content/requirements.txt (line 4))
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl (from -r /content/requirements.txt (line 6))
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting rouge_score (from -r /content/requirements.txt (line 8))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu (from -r /content/requirements.txt (line 9))
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from -r /content/requirements.txt (line 10))
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r /content/requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
# import necessary libraries

from typing import List, Dict

import numpy as np
import torch

from datasets import Dataset, load_dataset
import evaluate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import GRPOConfig, GRPOTrainer


In [6]:
### Load dataset (OpenAssistant/oasst1)

def get_dataset():
  dataset_loaded = load_dataset("OpenAssistant/oasst1")
  train_dataset = dataset_loaded["train"].to_pandas()
  val_dataset = dataset_loaded["validation"].to_pandas()
  return dataset_loaded, train_dataset, val_dataset

In [7]:
### Load Microsoft-phi2 Model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16)

# model loading along with tokenizer
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", quantization_config=bnb_config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", max_length=512, trust_remote_code=True)
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [8]:
# Inferencing on base model
from transformers import pipeline

prompt = "Based on the following multi‑turn conversation, draft the assistant’s next response as a playful, concise birthday‑party invitation peppered with inside jokes."  # change to your desired prompt
gen = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = gen(prompt)
print(result[0]['generated_text'])

Device set to use cuda:0


Based on the following multi‑turn conversation, draft the assistant’s next response as a playful, concise birthday‑party invitation peppered with inside jokes.
## INPUT
Person 1: Hey, I’m so excited for the party this


In [9]:
### Preprocess the data and prepare it for fine-tuning process

def prep_data(df):
    """
    Prepares data from a DataFrame by matching assistant messages to corresponding prompter instructions,
    filtering for English examples, and merging them into a single prompt.
    """
    # Filter assistant and prompter messages
    df_assistant = df[(df.role == "assistant") & (df["rank"] == 0.0)].copy()
    df_prompter = df[df.role == "prompter"].copy()
    df_prompter = df_prompter.set_index("message_id")

    # Assistant's output text
    df_assistant["output"] = df_assistant["text"].values

    instructions = []
    parent_ids = []
    # For each assistant message, get the corresponding prompter message (instruction)
    for _, row in df_assistant.iterrows():
        input_row = df_prompter.loc[row.parent_id]
        instructions.append(input_row.text)
        parent_ids.append(input_row.parent_id)

    df_assistant["instruction"] = instructions
    df_assistant["parent_id"] = parent_ids

    # Filter to include only English examples
    df_assistant = df_assistant[df_assistant.lang == "en"]

    # Create a combined prompt with sections, properly escaping quotes
    def create_prompt(row):
        instruction = row["instruction"].replace('"', "'")
        output = row["output"].replace('"', "'")
        return (
            f"""###System:
Read the instruction and provide an answer.
###Instruction:
{instruction}
###Answer:
{output}"""
        )

    df_assistant["prompt"] = df_assistant.apply(create_prompt, axis=1)

    # Select and rename columns for clarity
    df_assistant = df_assistant[
        ["prompt", "instruction", "output", "message_id", "parent_id"]
    ].rename(columns={"message_id": "id"})

    return df_assistant


def collate_and_tokenize(batch):
    # Print batch structure
    # print(f"Batch keys: {batch.keys()}")
    # print(f"First item in prompt column: {batch['prompt'][:2]}")

    # Extract prompts safely
    prompts = batch.get("prompt", [])

    # Ensure all prompts are valid strings
    prompts = [str(p) for p in prompts if p is not None]

    # Tokenize using your tokenizer
    encoded = tokenizer(
        prompts,
        return_tensors="np",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    return encoded




In [10]:
### get the data
entire, train_ds, val_ds = get_dataset()

train_df = prep_data(train_ds)
eval_df = prep_data(val_ds)

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [11]:
# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Define columns to remove after tokenization (you can adjust as necessary)
columns_to_remove = ["instruction", "output", "id", "parent_id"]

# Tokenize the datasets using the collate function.
tokenized_train_dataset = train_dataset.map(
    collate_and_tokenize,   # No need for lambda or list wrapping
    batched=True,           # Process in batches
    batch_size=8,           # Optimize batch size for P100 GPU
    remove_columns=columns_to_remove
)

tokenized_eval_dataset = eval_dataset.map(
    collate_and_tokenize,
    batched=True,
    batch_size=8,
    remove_columns=columns_to_remove
)

# For demonstration, print one tokenized example from the training dataset
print(f"prompt: {tokenized_train_dataset[0].get('prompt')}")

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

prompt: ###System:
Read the instruction and provide an answer.
###Instruction:
Can you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.
###Answer:
'Monopsony' refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.

Recent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face l

In [12]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

print("Getting PEFT Configured model!")
model = get_peft_model(model, peft_config)

Getting PEFT Configured model!


In [13]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [14]:
output_dir = "./results"
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 40
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

In [15]:

# # Define the reward function, which rewards completions that are close to 20 characters
# def reward_len(completions, **kwargs):
#     return [-abs(100 - len(completion)) for completion in completions]

# training_args = GRPOConfig(output_dir=output_dir,
#                            per_device_train_batch_size=per_device_train_batch_size,
#                            gradient_accumulation_steps=gradient_accumulation_steps,
#                            optim=optim,
#                            save_steps=save_steps,
#                            logging_steps=logging_steps,
#                            learning_rate=learning_rate,
#                            fp16=True,
#                            max_grad_norm=max_grad_norm,
#                            max_steps=max_steps,
#                            warmup_ratio=warmup_ratio,
#                            dataloader_pin_memory=False,
#                            group_by_length=True,
#                            lr_scheduler_type=lr_scheduler_type,
#                            gradient_checkpointing=True,
#                            )

# device = 'cuda'


# trainer = GRPOTrainer(
#     model=model.to(device),
#     peft_config=peft_config,
#     reward_funcs=reward_len,
#     args=training_args,
#     train_dataset=tokenized_train_dataset,
#     eval_dataset=tokenized_eval_dataset,
#     tokenizer=tokenizer
# )

# # Set the tokenizer as an attribute of the trainer after initialization
# trainer.tokenizer = tokenizer
# # Explicitly set the padding token for the tokenizer
# trainer.tokenizer.pad_token = trainer.tokenizer.eos_token
# # or trainer.tokenizer.pad_token = '[PAD]' if you added '[PAD]' as a special token

# trainer.train()
# trainer.train()

In [16]:
# configure the necessary configurators
# reference: https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOTrainer

training_args = GRPOConfig(output_dir=output_dir,
                           per_device_train_batch_size=per_device_train_batch_size,
                           gradient_accumulation_steps=gradient_accumulation_steps,
                           optim=optim,
                           save_steps=save_steps,
                           logging_steps=logging_steps,
                           learning_rate=learning_rate,
                           fp16=True,
                           max_grad_norm=max_grad_norm,
                           max_steps=max_steps,
                           warmup_ratio=warmup_ratio,
                           dataloader_pin_memory=False,
                           group_by_length=True,
                           lr_scheduler_type=lr_scheduler_type,
                           gradient_checkpointing=True,
                           )


# 1) Load metrics using evaluate (not datasets)
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
squad = evaluate.load("squad")

# 2) Define sub‐reward functions
def reward_qa(preds: List[str], refs: List[str], **kwargs) -> List[float]:
    results = squad.compute(predictions=[{"prediction_text": p} for p in preds],
                            references=[{"answers": {"text": [r]}} for r in refs])
    em = results["exact_match"]
    f1 = results["f1"]
    return [(em[i] + f1[i]) / 2 / 100.0 for i in range(len(preds))]

def reward_bleu(preds: List[str], refs: List[str], **kwargs) -> List[float]:
    tokenized_preds = [p.split() for p in preds]
    tokenized_refs  = [[r.split()] for r in refs]
    result = bleu.compute(predictions=tokenized_preds, references=tokenized_refs)
    return [result["bleu"]] * len(preds)

def reward_rouge(preds: List[str], refs: List[str], **kwargs) -> List[float]:
    result = rouge.compute(predictions=preds, references=refs, rouge_types=["rougeL"])
    return [result["rougeL"].mid.fmeasure] * len(preds)

def reward_len(preds: List[str], target_len: int = 20, **kwargs) -> List[float]:
    return [1.0 - abs(len(p) - target_len) / target_len for p in preds]

# 3) Combine them
def combined_reward(completions: List[str],
                    prompts: List[str] = None,
                    references: List[str] = None,
                    weights: Dict[str, float] = None,
                    **kwargs) -> List[float]:
    if references is None:
        # fallback: assign neutral or zero reward if references not given
        return [0.0 for _ in completions]

    if weights is None:
        weights = {"qa": 1.0, "bleu": 1.0, "rouge": 1.0, "len": 0.5}
    total_w = sum(weights.values())

    # compute sub-rewards
    qa_r    = reward_qa(completions, references)
    bleu_r  = reward_bleu(completions, references)
    rouge_r = reward_rouge(completions, references)
    len_r   = reward_len(completions, target_len=20)

    # normalize weights
    w_qa    = weights["qa"]    / total_w
    w_bleu  = weights["bleu"]  / total_w
    w_rouge = weights["rouge"] / total_w
    w_len   = weights["len"]   / total_w

    combined = []
    for i in range(len(completions)):
        r = (
            w_qa    * qa_r[i]
          + w_bleu  * bleu_r[i]
          + w_rouge * rouge_r[i]
          + w_len   * len_r[i]
        )
        combined.append(r)
    return combined



# 4) Instantiate your trainer
trainer = GRPOTrainer(
    model=model.to(device),
    peft_config=peft_config,
    reward_funcs=combined_reward,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)




# Set the tokenizer as an attribute of the trainer after initialization
trainer.tokenizer = tokenizer
# Explicitly set the padding token for the tokenizer
trainer.tokenizer.pad_token = trainer.tokenizer.eos_token
# or trainer.tokenizer.pad_token = '[PAD]' if you added '[PAD]' as a special token

# trainer.train()
trainer.train()

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkiranchw000[0m ([33mimnskc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Step,Training Loss
10,0.0042
20,0.0055
30,0.0065
40,0.0035


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

TrainOutput(global_step=40, training_loss=0.004914794769138098, metrics={'train_runtime': 6958.0002, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.006, 'total_flos': 0.0, 'train_loss': 0.004914794769138098})

In [17]:
save_model_name = "phi2-finetuned-qlora-grpo"
trainer.model.save_pretrained(save_model_name)

In [18]:
# # Inference with the finetuned model which is merged with base model
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from transformers import pipeline

# model_path = "/content/results/checkpoint-25"  # change to the path where your model is saved
# inference_model = AutoModelForCausalLM.from_pretrained(model_path)
# inference_tokenizer = AutoTokenizer.from_pretrained(model_path)


# prompt = "Based on the following multi‑turn conversation, draft the assistant’s next response as a playful, concise birthday‑party invitation peppered with inside jokes."  # change to your desired prompt
# gen = pipeline('text-generation', model=inference_model, tokenizer=inference_tokenizer)
# result = gen(prompt)
# print(result[0]['generated_text'])

In [19]:
# 6012534a43916343c566dc8df4c228f4ffd0992b

In [20]:
## Merging with base model and saving fine-tuned

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os


def load_base_model(model_name: str, dtype=torch.float16):
    """Load the base model with appropriate device and dtype."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto",
        low_cpu_mem_usage=True,
        return_dict=True
    )


def load_and_merge_lora(base_model, lora_path: str):
    """Load LoRA weights and merge with the base model."""
    peft_model = PeftModel.from_pretrained(base_model, lora_path)
    return peft_model.merge_and_unload()


def load_tokenizer(model_name: str):
    """Load tokenizer and configure padding settings."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer


def save_model_and_tokenizer(model, tokenizer, save_path: str):
    """Save model and tokenizer to the target path."""
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Model and tokenizer saved to: {save_path}")


def merge_and_save_lora_model(
    base_model_name: str,
    lora_checkpoint_path: str,
    output_path: str,
    dtype=torch.float16
):
    """High-level wrapper to merge LoRA weights and save the final model."""
    print("🚀 Loading base model...")
    base_model = load_base_model(base_model_name, dtype)

    print("🔗 Merging LoRA weights...")
    merged_model = load_and_merge_lora(base_model, lora_checkpoint_path)

    print("🧠 Loading tokenizer...")
    tokenizer = load_tokenizer(base_model_name)

    print("💾 Saving model and tokenizer...")
    save_model_and_tokenizer(merged_model, tokenizer, output_path)


base_model_name = "microsoft/phi-2"
lora_checkpoint_path = "/content/results/checkpoint-40"
output_path = "/content/results/phi2-qlora-merged"

merge_and_save_lora_model(
    base_model_name=base_model_name,
    lora_checkpoint_path=lora_checkpoint_path,
    output_path=output_path,
)


🚀 Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🔗 Merging LoRA weights...




🧠 Loading tokenizer...
💾 Saving model and tokenizer...
✅ Model and tokenizer saved to: /content/results/phi2-qlora-merged


In [None]:
# Inference with the finetuned model which is merged with base model
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

model_path = "/content/results/phi2-qlora-merged"  # change to the path where your model is saved
inference_model = AutoModelForCausalLM.from_pretrained(model_path)
inference_tokenizer = AutoTokenizer.from_pretrained(model_path)


prompt = "Based on the following multi‑turn conversation, draft the assistant’s next response as a playful, concise birthday‑party invitation peppered with inside jokes."  # change to your desired prompt
gen = pipeline('text-generation', model=inference_model, tokenizer=inference_tokenizer)
result = gen(prompt)
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
!mkdir /content/combined_results
!mv /content/results/checkpoint-40 /content/combined_results/
!mv /content/results/runs /content/combined_results/
!mv /content/results/phi2-qlora-merged /content/combined_results/
!mv /content/wandb /content/combined_results/
!mv /content/phi2-finetuned-qlora-grpo /content/combined_results/
#

In [None]:
!zip -r archive2.zip /content/combined_results/

  adding: content/combined_results/ (stored 0%)
  adding: content/combined_results/checkpoint-40/ (stored 0%)
  adding: content/combined_results/checkpoint-40/special_tokens_map.json (deflated 75%)
  adding: content/combined_results/checkpoint-40/vocab.json (deflated 59%)
  adding: content/combined_results/checkpoint-40/merges.txt (deflated 53%)
  adding: content/combined_results/checkpoint-40/training_args.bin (deflated 51%)
  adding: content/combined_results/checkpoint-40/trainer_state.json (deflated 69%)
  adding: content/combined_results/checkpoint-40/rng_state.pth (deflated 25%)
  adding: content/combined_results/checkpoint-40/adapter_model.safetensors (deflated 8%)
  adding: content/combined_results/checkpoint-40/README.md (deflated 66%)
  adding: content/combined_results/checkpoint-40/added_tokens.json (deflated 84%)
  adding: content/combined_results/checkpoint-40/scaler.pt (deflated 60%)
  adding: content/combined_results/checkpoint-40/scheduler.pt (deflated 56%)
  adding: con

In [None]:
# !zip -r archive.zip /content/results/