<a href="https://colab.research.google.com/github/insaabbas/Humor-generation-colab-notebook/blob/main/phi_model_13_dec_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install necessary libraries
!pip install -q -U transformers peft accelerate bitsandbytes datasets

# 2. Mount Google Drive at /content/drive
from google.colab import drive
import os

drive.mount('/content/drive')

# Absolute path on the mounted Drive where training output is written.
# The directory is created right below if it is missing, so it does not have
# to be prepared by hand before running this cell.
DRIVE_PATH = "/content/drive/MyDrive/Mistral_Jokes/Training_Results"
os.makedirs(DRIVE_PATH, exist_ok=True) # exist_ok=True: no error if the directory already exists

print(f"Checkpoints and final adapter will be saved to: {DRIVE_PATH}")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset, Dataset # <-- Ensure Dataset is imported
import os
import pandas as pd # <-- Used to handle the TSV file structure

# 1. Model Name (Phi-2 for 3-hour training)
MODEL_NAME = "microsoft/phi-2"

# --- Load the custom TSV dataset ---

# File path: the TSV is read from the temporary Colab session storage.
DATASET_FILE_PATH = "/content/final_dataset_fixed"

try:
    print(f"Loading TSV file from local session path: {DATASET_FILE_PATH}")

    # Read the tab-separated file. The first line of the file is a header and
    # is skipped; column names are supplied explicitly instead.
    # dtype=str + keep_default_na=False keep every field as literal text, so
    # tokens such as "NA", "null" or "nan" are not silently turned into NaN
    # (which the f-string below would render as the training text "nan").
    df = pd.read_csv(
        DATASET_FILE_PATH,
        sep='\t',
        header=None,
        names=['id', 'input', 'joke'],
        skiprows=1,
        dtype=str,
        keep_default_na=False,
    )

    # Rows shorter than three fields still come back as NaN; normalise them to
    # empty strings and drop every row that lacks an input or a joke, since
    # such rows would only teach the model to emit an empty/garbage joke.
    df = df.fillna('')
    has_content = (df['input'].str.strip() != '') & (df['joke'].str.strip() != '')
    dropped_rows = int((~has_content).sum())
    if dropped_rows:
        print(f"Dropping {dropped_rows} row(s) with an empty input or joke.")
    df = df[has_content]

    # Convert the pandas DataFrame to a Hugging Face Dataset.
    # preserve_index=False keeps the (now non-contiguous) pandas index from
    # being added as an extra '__index_level_0__' column.
    dataset = Dataset.from_pandas(df, preserve_index=False)

    def format_joke_example(example):
        """Build the single 'text' training field from one row.

        Args:
            example: mapping with at least the 'input' and 'joke' keys.

        Returns:
            The same mapping with a 'text' key added, formatted as
            "### Input: <input>\\n### Joke: <joke>".
        """
        example['text'] = f"### Input: {example['input']}\n### Joke: {example['joke']}"
        return example

    # Apply the formatting and remove the source columns so only 'text' remains.
    dataset = dataset.map(
        format_joke_example,
        remove_columns=['id', 'input', 'joke']
    )

    # Split the dataset into training and evaluation sets (90/10 split).
    # The fixed seed keeps the split identical across runs.
    dataset_split = dataset.train_test_split(test_size=0.10, seed=42)

    train_dataset = dataset_split['train']
    eval_dataset = dataset_split['test']

    print(f"Successfully loaded {len(train_dataset)} examples for training and {len(eval_dataset)} for evaluation.")

except Exception:
    print(f"ERROR: Could not load or process the dataset from {DATASET_FILE_PATH}")
    print("Please ensure your file is uploaded and named 'final_dataset_fixed' in the /content/ folder.")
    # Bare raise re-raises the active exception with its original traceback.
    raise

# --- End of Dataset Loading ---

# 2. Quantization Configuration (QLoRA)
# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 3. Load Base Model and Tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
# The KV cache is switched off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4. Prepare Model for QLoRA Training
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

# 5. LoRA Configuration (PEFT)
# Adapters are attached to the listed projection / feed-forward module names.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ],
)
model = get_peft_model(model, peft_config)
# print_trainable_parameters() prints its own summary and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
model.print_trainable_parameters()

In [None]:
import os
from IPython.display import display, Javascript

# NOTE: DRIVE_PATH must have been defined in Cell 1
TENSORBOARD_LOG_DIR = os.path.join(DRIVE_PATH, "runs")

print(f"TensorBoard will monitor logs in: {TENSORBOARD_LOG_DIR}")
print("If the panel shows 'No dashboards are active', it will update once training begins and the first logging step (25) is reached.")

# Launch the TensorBoard panel below this cell through the notebook extension's
# line magics (the programmatic form of `%load_ext tensorboard` followed by
# `%tensorboard --logdir ...`).
# The previous JavaScript injection depended on the classic-Notebook
# `IPython.notebook` front-end object and on a kernel callback that is never
# registered, so it could not start TensorBoard in Colab. This cell therefore
# no longer tries to auto-run the cells below it: run the training cell manually.
from IPython import get_ipython

_shell = get_ipython()
_shell.run_line_magic("load_ext", "tensorboard")
# The path is quoted so a log directory containing spaces is passed as one argument.
_shell.run_line_magic("tensorboard", f'--logdir "{TENSORBOARD_LOG_DIR}"')

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import accelerate
import torch
import os
import glob # Needed for dynamic checkpoint loading

# --- Tokenize the Datasets ---
# The tokenize_function uses the 'text' column created in Cell 2
def tokenize_function(examples):
    """Tokenize a batch of formatted examples for causal-LM fine-tuning.

    Args:
        examples: batch mapping (as passed by `Dataset.map(batched=True)`)
            whose "text" entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.

    Notes:
        Padding is intentionally not applied here. Padding every example to
        512 tokens makes each training step process full-length sequences
        even for very short jokes; leaving the sequences unpadded lets the
        data collator pad each batch only to its own longest example.
        No "labels" field is produced either: the causal-LM data collator
        (`mlm=False`) builds the labels from `input_ids` itself, so a copy
        made here would be discarded.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

print("Tokenizing datasets...")
# Assumes train_dataset and eval_dataset were defined in Cell 2.
# batched=True hands tokenize_function a batch of rows per call.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# --- Data Collator ---
# mlm=False selects causal (next-token) language modeling rather than
# masked-language-model collation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# --- Training Arguments (tuned for Phi-2 and a roughly 3-hour run) ---
training_args = TrainingArguments(
    # Checkpoints are written to the Google Drive path defined in Cell 1.
    output_dir=DRIVE_PATH,
    num_train_epochs=2, # Reduced epochs for 3-hour speed target
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    logging_steps=25,

    # Checkpoint and Evaluation Strategy
    # Saving and evaluating use the same 1000-step interval.
    save_strategy="steps",
    save_steps=1000,  # Write a checkpoint every 1000 steps
    eval_strategy="steps",
    eval_steps=1000,  # Evaluates loss every 1000 steps
    load_best_model_at_end=True,

    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    run_name="phi2-joke-generator-qlora",
)

# --- Trainer Initialization ---
# NOTE(review): recent transformers releases rename Trainer's `tokenizer`
# argument to `processing_class` — confirm against the installed version,
# since Cell 1 installs the latest release with `pip install -U`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# --- Trainer Start (DYNAMIC RESUME LOGIC) ---
import re


def _find_latest_checkpoint(output_dir):
    """Return the highest-numbered `checkpoint-<step>` directory in output_dir.

    Only directories whose name is exactly `checkpoint-` followed by digits
    are considered, so stray entries such as `checkpoint-1000-backup` or a
    plain file named `checkpoint-notes` are ignored instead of crashing the
    integer conversion.

    Args:
        output_dir: directory that is searched (non-recursively).

    Returns:
        Path of the checkpoint directory with the largest step number, or
        None when no valid checkpoint directory exists.
    """
    step_pattern = re.compile(r"^checkpoint-(\d+)$")
    numbered = []
    for candidate in glob.glob(os.path.join(output_dir, "checkpoint-*")):
        match = step_pattern.match(os.path.basename(candidate))
        if match and os.path.isdir(candidate):
            numbered.append((int(match.group(1)), candidate))
    return max(numbered)[1] if numbered else None


# Check for existing checkpoints in the Google Drive path
LATEST_CHECKPOINT_PATH = _find_latest_checkpoint(DRIVE_PATH)

print("Starting training...")

if LATEST_CHECKPOINT_PATH:
    # Resumes if a checkpoint was found
    print(f"Resuming training from {LATEST_CHECKPOINT_PATH}...")
    trainer.train(resume_from_checkpoint=LATEST_CHECKPOINT_PATH)

else:
    # Starts from step 0 if no checkpoints exist
    print("No checkpoints found. Starting training from step 0...")
    trainer.train()

print("Training complete!")

# --- Save the Final Fine-Tuned Adapter to Google Drive ---
# The tokenizer is saved next to the adapter so the folder is self-describing.
FINAL_ADAPTER_PATH = os.path.join(DRIVE_PATH, "final_humor_generator_adapter")
trainer.model.save_pretrained(FINAL_ADAPTER_PATH)
tokenizer.save_pretrained(FINAL_ADAPTER_PATH)
print(f"Final adapter saved to: {FINAL_ADAPTER_PATH}")

TESTING MODEL

In [None]:
import torch
from transformers import pipeline

# Assuming 'model' and 'tokenizer' are defined from your setup cells
print("--- Starting Spot-Check Generation ---")

# Switch to inference mode before sampling. The pipeline below receives an
# already-instantiated model, and nothing else in this cell takes it out of
# training mode, so without this call the LoRA dropout configured for
# training (lora_dropout=0.1) could still be active while generating.
model.eval()

# Setup the text generation pipeline
# Note: Since the model is currently a PEFT model, we must pass it directly.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Prompt template: same "### Input / ### Joke" layout as the training text,
# with the joke left empty for the model to complete.
HUMOR_PROMPT_TEMPLATE = "### Input: {input_text}\n### Joke: "

def generate_test_joke(input_text, generator, template):
    """Sample one joke for `input_text` and return it as cleaned text.

    Args:
        input_text: headline or word pair substituted into the template.
        generator: callable invoked as `generator(prompt, **options)` that
            returns a list whose first item has a 'generated_text' entry.
        template: format string containing an `{input_text}` placeholder.

    Returns:
        The whitespace-stripped generated text; if that text begins with the
        prompt, the prompt prefix is cut off and the rest is stripped again.
    """
    filled_prompt = template.format(input_text=input_text)

    # Sampling (rather than greedy decoding) keeps the jokes varied;
    # return_full_text=False asks for the continuation only.
    sampling_options = dict(
        max_new_tokens=64,
        do_sample=True,
        temperature=0.85,
        top_p=0.9,
        return_full_text=False,
    )
    outputs = generator(filled_prompt, **sampling_options)
    completion = outputs[0]['generated_text'].strip()

    # Guard: nothing to cut when the prompt is not echoed back.
    if not completion.startswith(filled_prompt):
        return completion
    return completion[len(filled_prompt):].strip()

# --- Test Inputs (Use examples similar to your TSV data) ---
# Two flavours of prompt are exercised: full headlines and bare word pairs.
headline_style_inputs = [
    "British expats in France hit with shock pension tax bills",
    "Punches and slaps: Watch as Mexican Senate debate ends in brawl",
]
word_pair_inputs = [
    "spray chair",
    "hammer banana",
    "microwave book",
]
test_inputs = headline_style_inputs + word_pair_inputs

print("\n--- Generated Jokes ---")
for input_text in test_inputs:
    joke = generate_test_joke(input_text, generator, HUMOR_PROMPT_TEMPLATE)
    # One print per example: input line, joke line, then a blank separator line.
    print(f"Input: {input_text}\nJoke:  {joke}\n")

print("--- Spot-Check Complete ---")

MERGING MODEL

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from tqdm.auto import tqdm

# --- 1. CONFIGURATION (VERIFY THESE PATHS) ---

# The base model ID used for your QLoRA training
BASE_MODEL_ID = "microsoft/phi-2"

# Directory holding the final adapter weights written by the training cell.
# Verify this matches the "Final adapter saved to:" line printed after training.
FINAL_ADAPTER_PATH = "/content/drive/MyDrive/Mistral_Jokes/Training_Results/final_humor_generator_adapter"

# The local directory where the final, merged model will be saved
# This is the directory you will upload to Hugging Face
MERGED_MODEL_DIR = "/content/phi2_humor_merged_model"

# Ensure the output directory exists
os.makedirs(MERGED_MODEL_DIR, exist_ok=True)

# --- 2. MODEL MERGING PROCESS ---

print("Starting Model Merging Process...")

# A. Load the base model and tokenizer.
# No quantization config is passed this time: the weights are loaded in
# bfloat16 so the adapter can be merged into unquantized weights.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16, # Use BF16 if available, or FP16 (phi-2 works well with this)
    device_map="auto",
    trust_remote_code=True
)
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# Same pad-token choice as in the training cell.
base_tokenizer.pad_token = base_tokenizer.eos_token

# B. Load the fine-tuned PEFT (LoRA) adapter
# This wraps the adapter around the base model
peft_model = PeftModel.from_pretrained(
    base_model,
    FINAL_ADAPTER_PATH
)

# C. Merge the adapter weights into the base model weights
# This creates a single, self-contained model that does not require the PEFT library for inference.
print("Merging adapter weights into the base model...")
merged_model = peft_model.merge_and_unload()
# NOTE(review): the base model was already loaded in bfloat16, so the cast
# below looks like a safeguard rather than a conversion — confirm.
merged_model.to(torch.bfloat16).eval() # Ensure it's in eval mode and BFloat16

# D. Save the final merged model together with the tokenizer files
merged_model.save_pretrained(MERGED_MODEL_DIR)
base_tokenizer.save_pretrained(MERGED_MODEL_DIR)

print("\n--- MERGE SUCCESSFUL! ---")
print(f"The deployable model is saved to: {MERGED_MODEL_DIR}")
print("\nYour next step is to upload this directory to Hugging Face.")

PUSHING TO HUGGING FACE

In [None]:
from huggingface_hub import notebook_login
notebook_login()
# Paste a Hugging Face access token with 'Write' permission into the widget;
# the upload cell that follows creates a repository and pushes files to it.

In [None]:
import os
from huggingface_hub import HfApi

# --- 1. CONFIGURATION (VERIFY THESE) ---
# The local directory containing the merged Phi-2 model
MERGED_MODEL_DIR = "/content/phi2_humor_merged_model"

# Your target repository ID (User/RepoName)
REPO_ID = "insaabbas/phi2_humor_merged_model"

# --- 2. UPLOAD CODE ---

api = HfApi()

# Create the repository (if it doesn't already exist)
print(f"Attempting to create/check repository: {REPO_ID}")
try:
    # Use the logged-in credentials to create the repo;
    # exist_ok=True makes this a no-op when the repo is already there.
    api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
    print(f"Repository {REPO_ID} checked/created successfully.")
except Exception as e:
    print(f"Error creating repo: {e}")
    # Stop here: continuing to the upload after a failed repo check would
    # only produce a second, less informative error.
    raise

# Upload every file in the merged-model directory as a single commit
print(f"\nStarting upload from {MERGED_MODEL_DIR} to {REPO_ID}...")
api.upload_folder(
    folder_path=MERGED_MODEL_DIR,
    repo_id=REPO_ID,
    repo_type="model",
    commit_message="Final merged Phi-2 model after SemEval fine-tuning."
)

print("\n--- UPLOAD COMPLETE AND SUCCESSFUL! ---")
print("Your fine-tuned model is now public here (use this link for your paper):")
print(f"https://huggingface.co/{REPO_ID}")

GENERATING OUTPUT FILE

In [None]:
import pandas as pd
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm.auto import tqdm
import os
import time

# --- 0. Installation ---
print("Installing required packages...")
# Ensure transformers and accelerate are installed
!pip install -q transformers accelerate torch pandas

# --- 1. Configuration (MUST VERIFY) ---
# Column names expected in the input TSV.
ID_COLUMN = "id"
WORD1_COLUMN = "word1"
WORD2_COLUMN = "word2"
HEADLINE_COLUMN = "headline"

# VERIFY: upload your test file to the /content/ directory first.
INPUT_TSV_FILE = "/content/task-a-en.tsv"
# Relative path: the predictions file is written to the current working directory.
OUTPUT_JSONL_FILE = "sem_eval_predictions.jsonl"

# Hugging Face repository ID of the merged Phi-2 model (same ID as in the upload cell).
MERGED_REPO_ID = "insaabbas/phi2_humor_merged_model"

# Prompt template: same "### Input / ### Joke" layout as the training text,
# with the joke left empty for the model to complete.
HUMOR_PROMPT_TEMPLATE = "### Input: {input_text}\n### Joke: "

# --- 2. Model Loading ---

print(f"\nLoading final merged Phi-2 model from: {MERGED_REPO_ID}")

try:
    # Load model and tokenizer from the Hugging Face repository
    model = AutoModelForCausalLM.from_pretrained(
        MERGED_REPO_ID,
        torch_dtype=torch.bfloat16, # Use bfloat16 for Phi-2
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MERGED_REPO_ID, trust_remote_code=True)
    # Same pad-token setup as used for training.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Setup Inference Pipeline
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16
    )
    print("Model loaded successfully. Starting inference...")

except Exception as e:
    print(f"\nFATAL ERROR: Could not load model. Check the model ID or try again later. Error: {e}")
    # Re-raise instead of calling exit(): inside a notebook, exit() asks the
    # kernel to shut down and discards the traceback, whereas re-raising
    # stops this cell with the full error and leaves the session usable.
    raise

# --- 3. Joke Generation Function ---

def generate_joke(input_text, generator, template):
    """Produce a single-line joke for `input_text`.

    Args:
        input_text: headline or word pair to joke about, or the sentinel
            "ERROR: NO VALID INPUT FOUND" for rows without usable input.
        generator: callable invoked as `generator(prompt, **options)` that
            returns a list whose first item has a 'generated_text' entry.
        template: format string containing an `{input_text}` placeholder.

    Returns:
        A fixed placeholder message for the sentinel input (the generator is
        not called); otherwise the first line of the generated text, taken
        after the first "### Joke:" marker when one is present.
    """
    filled_prompt = template.format(input_text=input_text)

    # Sentinel rows never reach the model.
    if input_text == "ERROR: NO VALID INPUT FOUND":
        return "Not available due to missing input data."

    outputs = generator(
        filled_prompt,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.85,
        top_p=0.9,
        return_full_text=False,
    )
    completion = outputs[0]['generated_text'].strip()

    # Keep only what follows the first template marker, if the model echoed one.
    _, marker, after_marker = completion.partition("### Joke:")
    joke_text = after_marker.strip() if marker else completion

    # Anything past the first newline is treated as run-on output and dropped.
    first_line, _, _ = joke_text.partition('\n')
    return first_line.strip()

# --- 4. Main Processing Function ---

def process_and_save_predictions(input_file, output_file, id_col, headline_col, word1_col, word2_col):
    """Read the input TSV, generate one joke per row and write them as JSONL.

    For each row the headline is used as the model input when it is present;
    otherwise the two words are joined with a space. Rows with neither get a
    sentinel input that `generate_joke` answers with a placeholder message.

    Args:
        input_file: path of the tab-separated input file (with header row).
        output_file: path of the JSONL file to write; one
            {"id": ..., "prediction": ...} object per line.
        id_col: name of the column holding the row identifier.
        headline_col: name of the headline column.
        word1_col: name of the first word column.
        word2_col: name of the second word column.

    Returns:
        None. When the input file is missing or lacks a required column, a
        message is printed and nothing is written.
    """

    if not os.path.exists(input_file):
        print(f"\nFATAL ERROR: Input file '{input_file}' not found.")
        return

    print(f"Reading input data from: {input_file}")

    # keep_default_na=False keeps '-' and empty strings as literal text
    # instead of converting some tokens to NaN.
    df = pd.read_csv(input_file, sep='\t', keep_default_na=False)

    required_cols = [id_col, headline_col, word1_col, word2_col]
    if not all(col in df.columns for col in required_cols):
        print(f"Error: Required columns {required_cols} not found in the file.")
        return

    print(f"Generating jokes for {len(df)} inputs...")
    predictions = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Generating Jokes"):
        input_id = row[id_col]

        # --- Dynamic Input Selection Logic ---
        # Strip any leading/trailing whitespace
        headline = str(row.get(headline_col, '')).strip()
        word1 = str(row.get(word1_col, '')).strip()
        word2 = str(row.get(word2_col, '')).strip()

        # Prioritize the headline, as it provides more context.
        # A lone '-' marks an unused field.
        if headline and headline != '-':
            input_text = headline
        elif word1 and word2 and word1 != '-' and word2 != '-':
            # Combine words separated by a space
            input_text = f"{word1} {word2}"
        else:
            # Fallback for completely empty or malformed rows
            input_text = "ERROR: NO VALID INPUT FOUND"

        # Generate the joke
        generated_joke = generate_joke(input_text, generator, HUMOR_PROMPT_TEMPLATE)

        # Store the result in the submission format
        predictions.append({
            "id": input_id,
            "prediction": generated_joke
        })

    # Save the results to JSONL (one JSON object per line)
    print(f"Saving predictions to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in predictions:
            f.write(json.dumps(item) + '\n')

    print(f"\n--- SUCCESS ---")
    # Report the path that was actually written (the output_file argument),
    # not a module-level constant that may name a different file.
    print(f"The final prediction file is saved as: {output_file}")
    print("ACTION REQUIRED: Zip this file and upload the ZIP archive to the SemEval CodaLab platform.")

# --- EXECUTION ---
# Run the full prediction pass with the file paths and column names set in
# the configuration section of this cell.
process_and_save_predictions(INPUT_TSV_FILE, OUTPUT_JSONL_FILE, ID_COLUMN, HEADLINE_COLUMN, WORD1_COLUMN, WORD2_COLUMN)

In [None]:
# Run this command in a Colab cell after the Python script finishes
!zip sem_eval_predictions.zip sem_eval_predictions.jsonl