In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers datasets accelerate bitsandbytes peft trl

In [None]:
import gc
import torch
# Collect Python garbage first so that any GPU tensors it still references can be released.
gc.collect()
if torch.cuda.is_available():
    # Return cached, unused blocks to the CUDA driver before the model is loaded.
    torch.cuda.empty_cache()
    print(f"GPU Memory available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    # Querying device 0 raises on a CPU-only session, so report the situation instead.
    print("No CUDA GPU detected - enable a GPU accelerator before training.")


In [None]:
# Step 2: Import libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

In [None]:
# Step 3: Configuration
# Hub id of the base checkpoint and the directory that receives the fine-tuned output.
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
OUTPUT_DIR = "/kaggle/working/smollm2-email-finetuned"

# === CHOOSE YOUR EMAIL DATASET ===
# Option 1: SetFit/enron_spam, train split, reading the "text" and "label" columns.
DATASET_CONFIG = dict(
    name="SetFit/enron_spam",
    split="train",
    text_column="text",
    label_column="label",
)

In [None]:
# Step 4: Load tokenizer and model
print("Loading tokenizer and model...")

# The tokenizer reuses its EOS token for padding and pads on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights; device_map="auto" leaves device placement to the library.
model_load_kwargs = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

In [None]:
# Step 5: Configure LoRA for efficient fine-tuning
print("Configuring LoRA...")

# Rank-16 adapters on the four attention projections; bias terms are not adapted.
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**lora_settings)

# NOTE(review): the model is loaded in fp16 without quantization, so the k-bit
# preparation step may not be strictly required - confirm before removing it.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

print("Trainable parameters:")
model.print_trainable_parameters()

In [None]:
# Step 6: Load and preprocess dataset
# Upper bound on the number of rows used; raise it (or remove the cap) for a full run.
MAX_SAMPLES = 30000

print(f"Loading dataset: {DATASET_CONFIG['name']}...")
try:
    dataset = load_dataset(DATASET_CONFIG["name"], split=DATASET_CONFIG["split"])
except Exception as load_error:
    # Catch Exception rather than using a bare except so that KeyboardInterrupt /
    # kernel interrupts still propagate, and report why the fallback is being used.
    print(f"Loading with split={DATASET_CONFIG['split']!r} failed ({load_error}); retrying without a split...")
    dataset = load_dataset(DATASET_CONFIG["name"])
    if isinstance(dataset, dict):
        dataset = dataset["train"]

print(f"Dataset size: {len(dataset)} samples")
print(f"Dataset columns: {dataset.column_names}")

# Use a subset for faster training
if len(dataset) > MAX_SAMPLES:
    dataset = dataset.select(range(MAX_SAMPLES))
    # Report the real subset size instead of a hard-coded (and previously wrong) number.
    print(f"Using subset: {len(dataset)} samples")

# Split dataset: 90% train / 10% eval, seeded for reproducibility
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Preprocessing function
def preprocess_function(examples):
    """Turn one batch of raw dataset rows into fixed-length causal-LM features.

    Args:
        examples: Mapping of column name -> list of values for one batch, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output, truncated/padded to 512 tokens, with an extra
        "labels" entry that duplicates "input_ids". Rows whose text is empty
        or whitespace-only are dropped, so the result may contain fewer rows
        than the incoming batch.
    """
    texts = []

    # Handle different dataset formats
    text_col = DATASET_CONFIG.get("text_column", "text")

    if text_col in examples:
        # Simple text column
        texts = examples[text_col]
    elif "subject" in examples and "message" in examples:
        # Email with subject and body
        texts = [
            f"Subject: {subj}\nMessage: {msg}"
            for subj, msg in zip(examples["subject"], examples["message"])
        ]
    elif "email_body" in examples and "subject_line" in examples:
        # AESLC format
        texts = [
            f"Email: {body}\nSubject: {subj}"
            for body, subj in zip(examples["email_body"], examples["subject_line"])
        ]
    else:
        # Fallback: use first text-like column
        for col in examples.keys():
            values = examples[col]
            # Guard against an empty column before peeking at its first value.
            if values and isinstance(values[0], str):
                texts = values
                break

    # Strip first and only then drop blanks, so whitespace-only rows do not
    # slip through as empty strings that would tokenize to pure padding.
    cleaned = []
    for raw in texts:
        if not raw:
            continue
        stripped = str(raw).strip()
        if stripped:
            cleaned.append(stripped)

    # Tokenize to plain Python lists (no tensor round trip is needed inside map()).
    tokenized = tokenizer(
        cleaned,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

    # Causal LM: the targets are the inputs themselves.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

# Preprocess datasets
print("Preprocessing datasets...")

# Tokenize each split in batches and drop the raw columns so only model features remain.
train_columns = train_dataset.column_names
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_columns,
    desc="Processing train data",
)

eval_columns = eval_dataset.column_names
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_columns,
    desc="Processing eval data",
)

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Eval dataset: {len(eval_dataset)} samples")


In [None]:
# Step 7: Training arguments (Optimized for P100 GPU)
training_config = {
    # Where checkpoints go and how long to train
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": 3,
    # Batch sizing: 8 per device x 4 accumulation steps = effective batch size 32
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 4,
    # Optimizer and schedule
    "learning_rate": 2e-4,
    "fp16": True,  # mixed precision
    "logging_steps": 10,
    # Evaluate every 100 steps, checkpoint every 200, keep the two latest checkpoints
    "eval_strategy": "steps",
    "eval_steps": 100,
    "save_steps": 200,
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "report_to": "none",  # disable wandb
    "push_to_hub": False,
    "optim": "adamw_torch",
    "lr_scheduler_type": "cosine",
}
training_args = TrainingArguments(**training_config)

In [None]:
# Step 8: Data collator
# mlm=False selects the causal (next-token) language-modeling objective.
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded; check
# whether this collator masks pad positions in the labels, which would also hide
# genuine EOS tokens from the loss - confirm against the transformers docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Step 9: Initialize Trainer
print("Initializing trainer...")

# Bundle the LoRA-wrapped model, arguments, both splits and the collator.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Step 10: Start training
banner = "=" * 60
print(f"\n{banner}")
print("üöÄ Starting training...")
print(f"{banner}\n")

# Runs the training loop configured by training_args.
trainer.train()

In [None]:
# Step 11: Save the fine-tuned model
banner = "=" * 60
print(f"\n{banner}")
print("üíæ Saving model...")
print(banner)

# Write the model and the tokenizer side by side so the directory can be reloaded on its own.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ Model saved to {}".format(OUTPUT_DIR))

In [None]:
# Step 12: Evaluate the model
banner = "=" * 60
print(f"\n{banner}")
print("üìä Evaluating model...")
print(banner)

# Run evaluation on the held-out split and report the loss to four decimals.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
print("Evaluation Loss: {:.4f}".format(eval_loss))

In [None]:
# Step 13: Test the fine-tuned model
banner = "=" * 60
print(f"\n{banner}")
print("üß™ Testing fine-tuned model...")
print(f"{banner}\n")

# Switch to evaluation mode before generating samples.
model.eval()

In [None]:
# Test prompts based on dataset type
test_prompts = [
    "Dear customer, thank you for contacting us regarding",
    "Subject: Meeting Request\nMessage:",
    "Hi team, I wanted to follow up on",
]

# Sampling settings shared by every prompt.
generation_kwargs = dict(
    max_new_tokens=80,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.2,
)

for test_number, prompt in enumerate(test_prompts, start=1):
    print(f"\n--- Test {test_number} ---")
    print(f"Prompt: {prompt}")

    # Tokenize on the same device as the model; no gradients are needed for sampling.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The decoded text contains the prompt followed by its continuation.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated:\n{generated_text}\n")

In [None]:
# Step 14: Save to Kaggle output for download
print("\n" + "="*60)
print("üì¶ Compressing model for download...")
print("="*60)

!cd /kaggle/working && tar -czf smollm2-email-finetuned.tar.gz smollm2-email-finetuned/
print("\n‚úÖ Model compressed!")
print("üì• Download 'smollm2-email-finetuned.tar.gz' from Kaggle output section")

print("\n" + "="*60)
print("üéâ Fine-tuning complete!")
print("="*60)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from peft import PeftModel
from huggingface_hub import HfApi, login
import json
import os

In [None]:
print("="*70)
print("üìß Email Writer AI - Custom Fine-tuning & Upload")
print("="*70 + "\n")

# Step 3: Configuration
BASE_MODEL_DIR = "/kaggle/working/smollm2-email-finetuned"  # Your trained model
NEW_MODEL_NAME = "Email-Writer-AI"
OUTPUT_DIR = f"/kaggle/working/{NEW_MODEL_NAME}"
HF_USERNAME = "Kunal7370944861"  # Change this!
HF_MODEL_NAME = f"{HF_USERNAME}/{NEW_MODEL_NAME}"
# Never paste a real token into the notebook: it would be saved with every version.
# Export HF_TOKEN in the environment instead (tokens: https://huggingface.co/settings/tokens).
# The literal fallback is only a placeholder and will not authenticate.
HF_TOKEN = os.environ.get("HF_TOKEN", "HF_Token")


In [None]:
# Step 4: Create Custom Identity Dataset
print("üìù Creating custom identity dataset...")

# (prompt, response) pairs that teach the model its name, creator and purpose.
_identity_pairs = (
    # Who are you questions
    (
        "Who are you?",
        "I am Email Writer AI, a specialized AI assistant designed to help you write professional and effective emails.",
    ),
    (
        "What is your name?",
        "My name is Email Writer AI. I'm here to help you craft perfect emails for any situation.",
    ),
    (
        "Tell me about yourself",
        "I am Email Writer AI, an intelligent email writing assistant. I can help you compose professional emails, respond to messages, and improve your email communication.",
    ),
    (
        "What do you do?",
        "I am Email Writer AI. I specialize in writing, editing, and improving emails for business and personal communication.",
    ),
    # Creator questions
    (
        "Who created you?",
        "I was created by Kunal Pandey, a talented AI developer and engineer.",
    ),
    (
        "Who is your creator?",
        "My creator is Kunal Pandey. He developed and trained me to be an expert email writing assistant.",
    ),
    (
        "Who made you?",
        "I was made by Kunal Pandey, who fine-tuned and customized me for email writing tasks.",
    ),
    (
        "Who developed you?",
        "I was developed by Kunal Pandey, an AI enthusiast who specialized me in email composition.",
    ),
    # Purpose questions
    (
        "What can you help me with?",
        "As Email Writer AI, I can help you write professional emails, draft responses, improve email tone, and make your communication more effective.",
    ),
    (
        "How can you assist me?",
        "I am Email Writer AI created by Kunal Pandey. I can assist you in writing clear, professional, and impactful emails for any purpose.",
    ),
    # Sample email tasks
    (
        "Write a professional email to request a meeting",
        "Subject: Meeting Request\n\nDear [Name],\n\nI hope this email finds you well. I would like to schedule a meeting to discuss [topic]. Would you be available for a brief call or meeting next week?\n\nPlease let me know your availability.\n\nBest regards,\n[Your Name]",
    ),
    (
        "Help me write a follow-up email",
        "Subject: Following Up\n\nHi [Name],\n\nI wanted to follow up on my previous email regarding [topic]. I understand you must be busy, but I would appreciate your thoughts when you have a moment.\n\nThank you for your time.\n\nBest,\n[Your Name]",
    ),
)

identity_data = [
    {"input": prompt_text, "output": response_text}
    for prompt_text, response_text in _identity_pairs
]

# Render every pair in the "Question: ... / Answer: ..." template used for training.
dataset_texts = [
    {"text": f"Question: {entry['input']}\nAnswer: {entry['output']}"}
    for entry in identity_data
]

# Create dataset
identity_dataset = Dataset.from_list(dataset_texts)
print(f"‚úÖ Created dataset with {len(identity_dataset)} examples\n")


In [None]:
# Step 5: Load your fine-tuned model
# Loads the tokenizer and model saved in BASE_MODEL_DIR. If the directory holds a
# PEFT adapter, the adapter is merged into its base model so that `model` ends up
# as a plain model with no adapter wrapper. If the first attempt raises, a second
# attempt loads the hub base checkpoint and applies the adapter on top of it.
print("üì• Loading your fine-tuned model...")

# First, load tokenizer; fall back to EOS as the padding token if none is defined.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Try to load model - check if it's PEFT or regular
try:
    import os
    # The directory is treated as a PEFT checkpoint if any of these files exists.
    peft_files = ['adapter_config.json', 'adapter_model.bin', 'adapter_model.safetensors']
    is_peft = any(os.path.exists(os.path.join(BASE_MODEL_DIR, f)) for f in peft_files)

    if is_peft:
        print("Detected PEFT model - Loading and merging...")
        from peft import PeftModel, AutoPeftModelForCausalLM

        # Load the adapter checkpoint in FP32; merging happens in the next step.
        model = AutoPeftModelForCausalLM.from_pretrained(
            BASE_MODEL_DIR,
            device_map="auto",
            torch_dtype=torch.float32,  # Changed to FP32 for training
        )

        # Merge LoRA weights into the base weights and drop the adapter wrapper
        model = model.merge_and_unload()
        print("‚úÖ PEFT weights merged successfully")

    else:
        print("Loading regular model...")
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_DIR,
            device_map="auto",
            torch_dtype=torch.float32,  # Changed to FP32
        )
        print("‚úÖ Regular model loaded")

except Exception as e:
    # Any failure above (detection, loading or merging) lands here.
    print(f"‚ö†Ô∏è Loading method 1 failed: {str(e)}")
    print("Trying alternative loading method...")

    # Fallback: Load base model and apply adapter
    try:
        from peft import PeftModel
        # Same hub checkpoint that the first training stage started from.
        base_model_name = "HuggingFaceTB/SmolLM2-135M"

        print(f"Loading base model: {base_model_name}")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            torch_dtype=torch.float32,  # Changed to FP32
            trust_remote_code=True
        )

        print(f"Loading adapter from: {BASE_MODEL_DIR}")
        model = PeftModel.from_pretrained(base_model, BASE_MODEL_DIR)

        print("Merging adapter weights...")
        model = model.merge_and_unload()
        print("‚úÖ Model loaded via base + adapter method")

    except Exception as e2:
        # Both strategies failed: report the second error and re-raise it so the
        # notebook stops here instead of continuing without a model.
        print(f"‚ùå All loading methods failed!")
        print(f"Error: {str(e2)}")
        print("\nPlease verify that your model directory contains valid model files.")
        raise

# Enable all parameters for training
print("\n‚öôÔ∏è Enabling gradients for training...")
model.train()
for weight in model.parameters():
    weight.requires_grad = True

# Verify trainable parameters: count elements overall and among trainable tensors.
trainable_params = 0
all_params = 0
for weight in model.parameters():
    element_count = weight.numel()
    all_params += element_count
    if weight.requires_grad:
        trainable_params += element_count

print(f"‚úÖ Trainable parameters: {trainable_params:,} / {all_params:,}")
print("‚úÖ Model ready for fine-tuning!\n")


In [None]:
# Step 6: Preprocess identity dataset
def preprocess_identity(examples):
    """Tokenize one batch of identity examples for causal-LM training.

    Args:
        examples: Batch mapping that contains a "text" column.

    Returns:
        The tokenizer output, truncated/padded to 256 tokens, with a "labels"
        entry holding a shallow copy of "input_ids".
    """
    encoded = tokenizer(
        examples["text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    # Targets are the inputs themselves.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

print("‚öôÔ∏è Preprocessing identity dataset...")
# Replace the raw "text" column with tokenized features, processing rows in batches.
identity_dataset = identity_dataset.map(preprocess_identity, batched=True, remove_columns=["text"])
print("‚úÖ Preprocessing complete\n")

In [None]:
# Step 7: Training arguments for identity fine-tuning
print("‚öôÔ∏è Setting up identity training...\n")

identity_training_config = {
    "output_dir": OUTPUT_DIR,
    # Many passes over a tiny dataset, one example per step with 8-step accumulation
    "num_train_epochs": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5e-5,
    # Full precision: both reduced-precision modes are switched off
    "bf16": False,
    "fp16": False,
    "logging_steps": 5,
    # Checkpoint once per epoch and keep only the most recent one
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "warmup_steps": 10,
    "weight_decay": 0.01,
    "report_to": "none",
    "gradient_checkpointing": False,
    "remove_unused_columns": True,
    "max_grad_norm": 1.0,  # gradient clipping
    "optim": "adamw_torch",
}
training_args = TrainingArguments(**identity_training_config)

# Step 8: Trainer
from transformers import DataCollatorForLanguageModeling

# Causal-LM collator (mlm=False) built on the reloaded tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# No eval split is passed: this stage trains on the identity examples only.
identity_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=identity_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**identity_trainer_kwargs)

# Step 9: Fine-tune on identity
banner = "=" * 70
print(banner)
print("üöÄ Starting identity fine-tuning...")
print(f"{banner}\n")

trainer.train()

print("\n‚úÖ Identity fine-tuning complete!\n")

In [None]:
# Step 10: Save the final model
print("üíæ Saving Email Writer AI...")
# Write the weights first, then the tokenizer files, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print("‚úÖ Model saved to {}\n".format(OUTPUT_DIR))

# Step 11: Test the identity
print("="*70)
print("üß™ Testing Email Writer AI Identity")
print("="*70 + "\n")

model.eval()

test_questions = [
    "Who are you?",
    "Who is your creator?",
    "What can you help me with?",
]

for question in test_questions:
    print(f"Q: {question}")

    # Use the same "Question: ... / Answer:" template the identity data was trained on.
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; decode only the newly generated
    # tokens so the "A:" line shows the answer rather than repeating the question.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print(f"A: {answer}\n")

In [None]:
# Step 12: Create model card
# The card metadata and training details are kept in line with what this notebook
# does: LoRA on SetFit/enron_spam, then full-parameter tuning on the identity data
# (no 4-bit quantization is used anywhere).
print("üìù Creating model card...")

model_card = f"""---
language:
- en
license: apache-2.0
tags:
- text-generation
- email-writing
- fine-tuned
- smollm2
datasets:
- SetFit/enron_spam
base_model: HuggingFaceTB/SmolLM2-135M
---

# Email Writer AI

**Created by: Kunal Pandey**

## Model Description

Email Writer AI is a specialized language model fine-tuned for email writing and composition. Based on SmolLM2-135M, this model has been customized to help users write professional, clear, and effective emails.

## Features

- ‚úâÔ∏è Professional email composition
- üìù Email response generation
- üéØ Context-aware writing
- üíº Business and personal email support

## Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "{HF_MODEL_NAME}"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Write a professional meeting request email"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Training Details

- **Base Model**: HuggingFaceTB/SmolLM2-135M
- **Fine-tuning Dataset**: Enron Email Corpus + Custom Identity Data
- **Training Method**: LoRA (rank 16) on the email data, then full-parameter fine-tuning on the identity data
- **Creator**: Kunal Pandey

## Example Outputs

**Q: Who are you?**
A: I am Email Writer AI, a specialized AI assistant designed to help you write professional and effective emails.

**Q: Who is your creator?**
A: I was created by Kunal Pandey, a talented AI developer and engineer.

## Limitations

- Best suited for professional and business emails
- English language only
- Context length limited to 256 tokens

## Creator

**Kunal Pandey** - AI Developer & Engineer

For questions or collaborations, reach out via GitHub or Hugging Face.

## License

Apache 2.0
"""

# The card contains non-ASCII characters, so write it with an explicit encoding
# instead of relying on the platform default.
with open(f"{OUTPUT_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write(model_card)

print("‚úÖ Model card created\n")

# Step 13: Push to Hugging Face
# Uploads the model, the tokenizer and finally the hand-written model card.
# Any failure is reported with manual-upload instructions instead of stopping the notebook.
print("="*70)
print("üì§ Uploading to Hugging Face")
print("="*70 + "\n")

try:
    # Login to Hugging Face
    print("üîê Logging in to Hugging Face...")
    login(token=HF_TOKEN)
    print("‚úÖ Login successful\n")

    # Push model (`token` replaces the deprecated `use_auth_token` argument)
    print(f"üì§ Pushing model to {HF_MODEL_NAME}...")
    model.push_to_hub(
        HF_MODEL_NAME,
        token=HF_TOKEN,
        commit_message="Upload Email Writer AI by Kunal Pandey"
    )

    # Push tokenizer
    print("üì§ Pushing tokenizer...")
    tokenizer.push_to_hub(
        HF_MODEL_NAME,
        token=HF_TOKEN
    )

    # Push the custom model card last so it is the README that ends up in the repo;
    # push_to_hub() on the model/tokenizer does not upload files from OUTPUT_DIR.
    readme_path = os.path.join(OUTPUT_DIR, "README.md")
    if os.path.exists(readme_path):
        print("Pushing model card...")
        HfApi().upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=HF_MODEL_NAME,
            token=HF_TOKEN,
            commit_message="Add Email Writer AI model card",
        )

    print("\n" + "="*70)
    print("üéâ SUCCESS! Model uploaded to Hugging Face!")
    print("="*70)
    print(f"\nüîó Your model: https://huggingface.co/{HF_MODEL_NAME}")
    print(f"üë§ Creator: Kunal Pandey")
    print(f"üìß Model: Email Writer AI")
    print("\n" + "="*70)

except Exception as e:
    print(f"\n‚ùå Upload failed: {str(e)}")
    print("\nPlease check:")
    print("1. HF_TOKEN is correct")
    print("2. HF_USERNAME is correct")
    print("3. Internet connection is stable")
    print("\nYou can manually upload later using:")
    print(f"   huggingface-cli upload {HF_MODEL_NAME} {OUTPUT_DIR}")

# Step 14: Create download archive
print("\nüì¶ Creating download archive...")
!cd /kaggle/working && tar -czf Email-Writer-AI.tar.gz Email-Writer-AI/
print("‚úÖ Archive created: Email-Writer-AI.tar.gz")

print("\n" + "="*70)
print("‚úÖ ALL DONE!")
print("="*70)
print("\nüìã Summary:")
print(f"   Model Name: Email Writer AI")
print(f"   Creator: Kunal Pandey")
print(f"   Location: {OUTPUT_DIR}")
print(f"   HuggingFace: {HF_MODEL_NAME}")
print(f"   Archive: Email-Writer-AI.tar.gz")
print("\n" + "="*70)