In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

In [3]:
def train_lora(dataset, adapter_dir, text_columns, max_length=256):
    model_name = "gpt2-xl"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no PAD token

    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))

    # LoRA config
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["c_attn", "c_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)

    # 🔹 Combine text columns into one training text field
    def merge_columns(examples):
        texts = []
        for i in range(len(examples[text_columns[0]])):
            merged = ""
            for col in text_columns:
                if col in examples and examples[col][i] is not None:
                    merged += f"{col.upper()}: {examples[col][i]}\n"
            texts.append(merged.strip())
        return {"merged_text": texts}

    dataset = dataset.map(merge_columns, batched=True)

    # 🔹 Tokenize with labels for CLM loss
    def tokenize_fn(examples):
        outputs = tokenizer(
            examples["merged_text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        outputs["labels"] = outputs["input_ids"].copy()
        return outputs

    tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)

    # --------------------------
    # Training setup
    # --------------------------
    training_args = TrainingArguments(
        output_dir=adapter_dir,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=5,
        learning_rate=1e-4,
        logging_steps=50,
        save_strategy="epoch",
        report_to="none",  # disable W&B
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    model.save_pretrained(adapter_dir)
    tokenizer.save_pretrained(adapter_dir)


# Train on Story

In [4]:
# STORY LoRA Adapter
# --------------------------
story_ds = load_dataset("euclaise/writingprompts", split="train[:20000]")
train_lora(
    dataset=story_ds,
    adapter_dir="./lora_story",
    text_columns=["prompt", "story"]
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/837 [00:00<?, ?B/s]

data/train-00000-of-00002-105e07cb0d1994(…):   0%|          | 0.00/272M [00:00<?, ?B/s]

data/train-00001-of-00002-4fdb982c110564(…):   0%|          | 0.00/272M [00:00<?, ?B/s]

data/test-00000-of-00001-16503b0c26ed00c(…):   0%|          | 0.00/30.0M [00:00<?, ?B/s]

data/validation-00000-of-00001-137b93e1e(…):   0%|          | 0.00/30.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/272600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15138 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15620 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,3.2649
100,2.9656
150,2.8807
200,2.8617


Step,Training Loss
50,3.2649
100,2.9656
150,2.8807
200,2.8617
250,2.8288
300,2.8209
350,2.8088
400,2.8216


KeyboardInterrupt: 

# Train on Code

In [None]:
code_ds = load_dataset("microsoft/rStar-Coder", "synthetic_sft", split="train[:20000]")
train_lora(
    dataset=code_ds,
    adapter_dir="./lora_code",
    text_columns=["question", "seed_question", "response", "code"]
)

README.md: 0.00B [00:00, ?B/s]

synthetic_sft/data-00000-of-00015.parque(…):   0%|          | 0.00/426M [00:00<?, ?B/s]

synthetic_sft/data-00001-of-00015.parque(…):   0%|          | 0.00/429M [00:00<?, ?B/s]

synthetic_sft/data-00002-of-00015.parque(…):   0%|          | 0.00/443M [00:00<?, ?B/s]

# Save to Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save to Drive (change path if you want)
!cp -r ./lora_story /content/drive/MyDrive/AdapterFusion/lora_story
!cp -r ./lora_code /content/drive/MyDrive/AdapterFusion/lora_code
print("Models saved to Drive!!!.")

Mounted at /content/drive
Models saved to Drive!!!.


# Merging

In [None]:
!pip install -q transformers peft accelerate bitsandbytes

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# -----------------------------
# 1️⃣ Load the base GPT-2-XL model
# -----------------------------
base_model_name = "gpt2-xl"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# -----------------------------
# 2️⃣ Merge story adapter first
# -----------------------------
story_adapter_path = "./lora_story"
code_adapter_path  = "./lora_code"

print("🔹 Merging story adapter...")
story_model = PeftModel.from_pretrained(base_model, story_adapter_path)
story_model = story_model.merge_and_unload()  # merges LoRA into base weights

# -----------------------------
# 3️⃣ Now merge the code adapter onto the updated model
# -----------------------------
print("🔹 Merging code adapter...")
code_model = PeftModel.from_pretrained(story_model, code_adapter_path)
merged_model = code_model.merge_and_unload()  # merges second adapter too

# -----------------------------
# 4️⃣ Save the final unified model
# -----------------------------
save_path = "./gpt2-xl-lora-story-code-merged"
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Merged model saved at: {save_path}")


🔹 Merging story adapter...
🔹 Merging code adapter...
✅ Merged model saved at: ./gpt2-xl-lora-story-code-merged


# TEST

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

merged_model_path = "./gpt2-xl-lora-story-code-merged"

tokenizer = AutoTokenizer.from_pretrained(merged_model_path)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
model.eval()

def generate(prompt, max_length=200, temperature=0.8):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 🔹 Test prompts
print("=== STORY SAMPLE ===")
print(generate("Once upon a time in a mysterious city,"))

print("\n=== CODE SAMPLE ===")
print(generate("Write a Python function to calculate Fibonacci numbers"))


=== STORY SAMPLE ===
Once upon a time in a mysterious city, a mysterious book was published. A man by the name of the Lord decided that he would make his book available to anyone who would read it. Any person who reads the book, after reading it, will be able to enter the book and meet the Lord.

The Lord was not the most intelligent man in the world. He was a man of great power and wealth, but not even he could create a book that would fit every person. So he decided to do something else instead: he created a series of books that contained a single word: the word: 



 


 


 




 





 




 




 


 


 

 

 

 




 

 

 

 

 

 

=== CODE SAMPLE ===
Write a Python function to calculate Fibonacci numbers. The function takes two integers n and m as input and returns the nth Fibonacci number, written in the form n! = 1 + 2n + m. Write a function to find the nth Fibonacci number. The function takes n as input and returns n! = 1 + 2n! + n!  


 




 

 



 


 

 

 

 
 

 
 

# Save to Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save to Drive (change path if you want)
!cp -r ./gpt2-xl-lora-story-code-merged/ /content/drive/MyDrive/AdapterFusion2/gpt2-xl-lora-story-code-merged
print("Model saved to Drive!!!.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to Drive!!!.
