<a href="https://colab.research.google.com/github/insaabbas/Humor-generation-task/blob/main/4_epoch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. SETUP & DRIVE MOUNT
# Mount Google Drive at /content/drive; the Trainer output_dir and the final
# adapter directory used later in this cell both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell command: quietly (-q) install/upgrade (-U) the listed packages.
# NOTE(review): versions are unpinned, so behavior may differ between runs — consider pinning.
!pip install -q -U transformers peft bitsandbytes datasets accelerate

import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    AutoConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

# 2. DATA PREP (Required to avoid NameError)
# Training examples are read from a tab-separated file in the working directory.
df = pd.read_csv('jokes.tsv', sep='\t')
# Hugging Face model id reused by the tokenizer, config and model loading calls.
model_name = "microsoft/phi-2"
def format_instruction(example):
    """Render one dataset row as a single prompt/response training string.

    Reads the ``input`` and ``joke`` fields of ``example`` and returns a dict
    with a single ``text`` key: the input line followed by the response line.
    """
    input_line = f"### Input: {example['input']}"
    response_line = f"### Response: JOKE: {example['joke']}"
    return {"text": "\n".join((input_line, response_line))}

# Load the tokenizer and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Wrap the DataFrame as a Dataset, then run format_instruction over every row.
dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_instruction)

def tokenize_function(examples):
    """Tokenize the ``text`` field, truncating and padding to 200 tokens.

    Uses the module-level ``tokenizer``; returns whatever the tokenizer call
    returns for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=200,
    )
    return encoded


# Tokenize in batches and drop every original column of the dataset.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=dataset.column_names,
    batched=True,
)

# 3. MODEL LOADING with PHI-2 FIX
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Copy the tokenizer's pad token id onto the model config before loading.
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
config.pad_token_id = tokenizer.pad_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# Disable use_cache on the model config (training below enables gradient checkpointing).
model.config.use_cache = False

# 4. LoRA SETUP (r=16 as per your specific code)
# NOTE(review): "Wqkv" looks like the fused attention projection name from the
# older remote-code Phi-2 implementation; confirm it exists in the loaded model
# (e.g. by inspecting model.named_modules()), otherwise presumably only
# fc1/fc2 receive adapters. Changing this list would also change the adapter
# layout expected by existing checkpoints.
lora_settings = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["Wqkv", "fc1", "fc2"],
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)
# Wrap the base model with the LoRA adapters described by peft_config.
model = get_peft_model(model, peft_config)

# 5. TRAINING ARGUMENTS
# Per device: 4 samples per step x 4 accumulation steps; checkpoints are written to Drive.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/phi2-joke-trainer",
    # Optimisation
    num_train_epochs=4,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    fp16=True,
    gradient_checkpointing=True,
    # Logging and checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
    push_to_hub=False,
    # Do not let the Trainer drop dataset columns
    remove_unused_columns=False,
)

# Fix for Hub Token bug: make sure the attribute exists on the arguments object.
try:
    training_args.push_to_hub_token
except AttributeError:
    training_args.push_to_hub_token = None

# mlm=False: the collator is configured without masked-language-modeling.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=causal_lm_collator,
)

# ==========================================================
# 6. RESUME COMMAND
# ==========================================================
# Look for the latest checkpoint in the Trainer's output_dir.
# Passing resume_from_checkpoint=True raises a ValueError when no checkpoint
# exists (first run, wrong folder, Drive not mounted), so the checkpoint is
# detected explicitly and training falls back to a fresh run with a clear
# message instead of crashing.
import os
from transformers.trainer_utils import get_last_checkpoint

checkpoint_dir = training_args.output_dir
# get_last_checkpoint lists the directory, so guard against it not existing yet.
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

if last_checkpoint is None:
    print(f"No checkpoint found in '{checkpoint_dir}'; training from scratch.")
else:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

# A checkpoint path resumes from it; None starts a normal run.
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save Final: adapter weights and tokenizer go to the same Drive folder.
final_model_dir = "/content/drive/MyDrive/phi2-joke-model-final"
trainer.model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("RESUME COMPLETE! Model saved to 'phi2-joke-model-final'.")

In [None]:
# Notebook shell command: quietly install/upgrade the same five packages as the first cell.
!pip install -q -U transformers accelerate bitsandbytes peft datasets

MERGING AND PUSHING MODEL

In [None]:
# 1. FORCE MOUNT DRIVE
import os
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# 2. VERIFY THE FOLDER EXISTS
base_path = "/content/drive/MyDrive/"
target_folder = "phi2-joke-model-final"
full_path = os.path.join(base_path, target_folder)

if os.path.exists(full_path):
    print(f"✅ Found it! Path is: {full_path}")

    # 3. RUN THE MERGE
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
    from peft import PeftModel
    from huggingface_hub import login

    # LOGIN (Use 'Write' token)
    login()

    base_model_name = "microsoft/phi-2"
    hf_username = "insaabbas"
    new_model_name = f"{hf_username}/phi2-4-epoch-humor-model"

    print("Loading base model...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
    config.pad_token_id = tokenizer.pad_token_id

    # Load the base model in float16 on the CPU, with the pad-token-aware config.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        config=config,
        trust_remote_code=True,
        device_map="cpu",
        torch_dtype=torch.float16,
    )

    print("Merging adapters...")
    # Attach the saved adapter from Drive, then merge it and unload the PEFT wrapper.
    model = PeftModel.from_pretrained(base_model, full_path)
    merged_model = model.merge_and_unload()

    print(f"Pushing to Hub: {new_model_name}")
    # Model first, tokenizer second, both to the same Hub repo.
    for artifact in (merged_model, tokenizer):
        artifact.push_to_hub(new_model_name)

    banner = "=" * 30
    print("\n" + banner)
    print(f"VICTORY! 4-Epoch model live at: https://huggingface.co/{new_model_name}")
    print(banner)
else:
    # Adapter folder missing: show what the Drive root contains to help locate it.
    print(f"❌ Still can't find {target_folder}!")
    print("Listing what IS in your Drive:")
    print(os.listdir(base_path))

OUTPUT FILE GENERATION


In [None]:
import torch
import pandas as pd
import os
from transformers import AutoModelForCausalLM, CodeGenTokenizerFast, pipeline
from tqdm.auto import tqdm

# 1. SETTINGS
# Hub repo of the merged model, the input TSV to read, the predictions TSV to write.
MODEL_ID = "insaabbas/phi2-4-epoch-humor-model"
INPUT_TSV = "/content/task-a-en.tsv"
OUTPUT_TSV = "/content/predictions_4epoch.tsv"

# 2. LOAD MODEL & TOKENIZER (With the Fast Tokenizer fix)
print(f"Loading merged 4-epoch model: {MODEL_ID}")

# The tokenizer class is named explicitly (CodeGenTokenizerFast) to avoid the
# 'TokenizersBackend' ValueError.
tokenizer = CodeGenTokenizerFast.from_pretrained(MODEL_ID, trust_remote_code=True)
# Left padding, with EOS reused as the pad token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Initialize the generation pipeline
generator = pipeline("text-generation", tokenizer=tokenizer, model=model)

# 3. DATA PROCESSING
def _build_prompt(row):
    """Return the generation prompt for one input row.

    Uses the headline when it is non-empty and not the "-" placeholder;
    otherwise falls back to "<word1> <word2>".
    """
    headline = str(row.get('headline', '')).strip()
    word1 = str(row.get('word1', '')).strip()
    word2 = str(row.get('word2', '')).strip()

    input_text = headline if (headline and headline != '-') else f"{word1} {word2}"
    # No trailing space after "JOKE:". Training examples were formatted as
    # "JOKE: <joke>", and BPE tokenizers normally attach that space to the
    # first word of the joke; ending the prompt with a bare space would likely
    # give the model a token sequence it did not see during fine-tuning.
    return f"### Input: {input_text}\n### Response: JOKE:"


def _clean_joke(raw_text):
    """Reduce raw generated text to one tab-free line with the JOKE: prefix."""
    # splitlines() also cuts on "\r" and other line-boundary characters, and
    # returns an empty list for empty text (handled below).
    lines = raw_text.strip().splitlines()
    first_line = lines[0] if lines else ""
    # Tabs are the column separator of the output file; keep them out of the field.
    first_line = first_line.replace('\t', ' ').strip()
    return first_line if first_line.startswith("JOKE:") else f"JOKE: {first_line}"


if not os.path.exists(INPUT_TSV):
    print(f"Error: {INPUT_TSV} not found. Please upload it to Colab.")
else:
    # keep_default_na=False keeps empty cells as "" instead of NaN.
    df = pd.read_csv(INPUT_TSV, sep='\t', keep_default_na=False)
    results = []

    print(f"Generating jokes for {len(df)} rows using 4-epoch model...")

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Sampled generation; only the continuation is returned, not the prompt.
        output = generator(
            _build_prompt(row),
            max_new_tokens=64,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id
        )

        results.append({
            "id": row['id'],
            "joke": _clean_joke(output[0]['generated_text'])
        })

    # 4. SAVE FINAL TSV
    # Explicit columns keep the header row even when the input has no rows.
    output_df = pd.DataFrame(results, columns=["id", "joke"])
    output_df.to_csv(OUTPUT_TSV, sep='\t', index=False)

    print("\n--- SUCCESS ---")
    print(f"4-Epoch predictions saved to: {OUTPUT_TSV}")