In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import torch
import os
import json
import subprocess
import sys

# Configure tokenizer / CUDA allocator behaviour through environment variables
# before the first CUDA call made below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print("üßπ Cleaning memory...")
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB\n")
else:
    # Querying device 0 without CUDA raises, so the memory report is skipped
    # on CPU-only sessions instead of crashing the cell.
    print("‚úÖ GPU: CPU")
    print("üíæ Memory: n/a (no CUDA device)\n")

# Install only the packages this notebook imports later on.
# NOTE(review): versions are not pinned; the TrainingArguments cell uses the
# `eval_strategy` keyword, so confirm the installed transformers supports it.
print("üì¶ Installing minimal dependencies...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "transformers", "datasets", "accelerate"])
print("‚úÖ Installation complete!\n")

In [None]:
# Third-party imports live in their own cell so they run after the pip install
# performed by the setup cell.
print("üìö Importing libraries...")
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
print("‚úÖ Libraries imported!\n")

In [None]:
# IDENTITY DATASET
# Hand-written question/answer pairs (Hinglish and English) that state the
# assistant's name and creator. They are repeated IDENTITY_REPEATS times when
# combined with the WhatsApp conversations below.
print("üé® Creating Gen-G AI identity by Kunal Pandey...")

IDENTITY_REPEATS = 5

identity_data = [
    {"messages": [{"role": "user", "content": "Tum kaun ho?"},
                  {"role": "assistant", "content": "Main Gen-G AI hoon, ek helpful AI assistant. Mujhe Kunal Pandey ne banaya hai. Main aapki madad karne ke liye yahan hoon!"}]},
    {"messages": [{"role": "user", "content": "Tumhara naam kya hai?"},
                  {"role": "assistant", "content": "Mera naam Gen-G AI hai. Main Kunal Pandey dwara develop kiya gaya AI assistant hoon."}]},
    {"messages": [{"role": "user", "content": "Tumhe kisne banaya?"},
                  {"role": "assistant", "content": "Mujhe Kunal Pandey ne banaya hai. Main Gen-G AI hoon, jo logo ki madad karne ke liye design kiya gaya hai."}]},
    {"messages": [{"role": "user", "content": "Who are you?"},
                  {"role": "assistant", "content": "I am Gen-G AI, an AI assistant created by Kunal Pandey. I'm here to help you!"}]},
    {"messages": [{"role": "user", "content": "What is your name?"},
                  {"role": "assistant", "content": "My name is Gen-G AI. I was developed by Kunal Pandey to assist people."}]},
    {"messages": [{"role": "user", "content": "Who created you?"},
                  {"role": "assistant", "content": "I was created by Kunal Pandey. I am Gen-G AI, designed to be a helpful assistant."}]},
]

# Load WhatsApp data
DATA_PATH = "/kaggle/input/whatsapp-llm-dataset-chat-ml/whatsapp_llm_dataset_chat_ml.json"

try:
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        whatsapp_data = json.load(f)
except FileNotFoundError:
    # Best-effort: the notebook still runs with the identity examples alone.
    whatsapp_data = []
    print("‚ö†Ô∏è WhatsApp data not found, using identity only")
except (OSError, ValueError) as load_error:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError. Report the
    # real reason rather than pretending the file is missing.
    whatsapp_data = []
    print(f"‚ö†Ô∏è WhatsApp data could not be read ({load_error}), using identity only")
else:
    if not isinstance(whatsapp_data, list):
        # The concatenation below needs a list of conversation dicts.
        raise TypeError(
            f"Expected a JSON list of conversations in {DATA_PATH}, "
            f"got {type(whatsapp_data).__name__}"
        )
    print(f"‚úÖ WhatsApp data: {len(whatsapp_data)} conversations")

# Combine (identity examples repeated so they carry more weight in training)
combined_data = (identity_data * IDENTITY_REPEATS) + whatsapp_data
print(f"‚úÖ Total: {len(combined_data)} examples (Identity: {len(identity_data)*5}, WhatsApp: {len(whatsapp_data)})\n")


In [None]:
# Render every conversation as ChatML-style text for Qwen.
print("üìÑ Formatting for Qwen...")

def format_chat(example):
    """Flatten one conversation into a single ChatML-style training string.

    Args:
        example: Mapping with a ``"messages"`` list; each message is a mapping
            with ``"role"`` and ``"content"`` keys.

    Returns:
        ``{"text": ...}`` where the text is the concatenation of one
        ``<|im_start|>role ... <|im_end|>`` segment per user/assistant message,
        in order. Messages with any other role are skipped.
    """
    rendered_roles = ("user", "assistant")
    segments = [
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        for turn in example["messages"]
        if turn["role"] in rendered_roles
    ]
    return {"text": "".join(segments)}

# Build the dataset, add the formatted "text" column, then hold out 5% of the
# rows for evaluation (fixed seed so the split is repeatable).
dataset = (
    Dataset.from_list(combined_data)
    .map(format_chat)
    .train_test_split(test_size=0.05, seed=42)
)
print(f"‚úÖ Train: {len(dataset['train'])} | Val: {len(dataset['test'])}\n")


In [None]:
# LOAD MODEL (MEMORY EFFICIENT)
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

print("üî• Loading model (memory efficient)...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Keep the tokenizer's own pad token when it has one. The causal-LM data
# collator used for training replaces every label equal to pad_token_id with
# -100, so aliasing pad to EOS (the "<|im_end|>" turn terminator that the
# training text contains) would remove the end-of-turn token from the loss.
# Only fall back to EOS when no pad token is defined at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,             # full precision; fp16/bf16 are also disabled in the training arguments
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

In [None]:
# Freeze the whole network, then re-enable gradients for the last few decoder
# layers and the LM head so that only a small part of the model is trained.
NUM_TRAINABLE_LAYERS = 3

print("üîí Freezing base, unfreezing last 3 layers...")
for param in model.parameters():
    param.requires_grad = False

# Unfreeze only the final NUM_TRAINABLE_LAYERS decoder layers. The start index
# is clamped at 0 so that a model with fewer layers does not wrap around to
# negative indices.
num_layers = len(model.model.layers)
first_trainable = max(0, num_layers - NUM_TRAINABLE_LAYERS)
for i in range(first_trainable, num_layers):
    for param in model.model.layers[i].parameters():
        param.requires_grad = True

# Always train LM head
# NOTE(review): if this checkpoint ties lm_head to the input embeddings, this
# also unfreezes the embedding matrix — confirm via model.config.tie_word_embeddings.
for param in model.lm_head.parameters():
    param.requires_grad = True

# The training arguments enable gradient checkpointing. With the early layers
# frozen, the hidden states entering the unfrozen layers must still require
# grad, otherwise re-entrant checkpointing produces no gradients for them.
# This makes the embedding output require grad regardless of weight tying.
model.enable_input_require_grads()

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)\n")


In [None]:
# Tokenize both splits to short, fixed-length sequences to limit memory use.
print("üî§ Tokenizing...")

def tokenize(examples):
    """Tokenize a batch of formatted chats to exactly 200 tokens each.

    Args:
        examples: Batch mapping that contains a ``"text"`` column.

    Returns:
        The tokenizer output for the batch, truncated to 200 tokens and padded
        up to 200 tokens.
    """
    fixed_length_options = {
        "truncation": True,
        "max_length": 200,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **fixed_length_options)

# Same mapping for both splits; the raw "text" column is dropped afterwards.
train_data, val_data = (
    dataset[split_name].map(tokenize, batched=True, remove_columns=["text"])
    for split_name in ("train", "test")
)
print("‚úÖ Tokenization done!\n")

# Release unused host and GPU memory before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Training configuration tuned for low memory use in full precision.
print("üéØ Setting up memory-optimized training...")

training_args = TrainingArguments(
    # Where checkpoints are written; at most one is kept.
    output_dir="./gen-g-checkpoints",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=False,

    # Precision: both mixed-precision modes are off, so training runs in FP32.
    fp16=False,
    bf16=False,

    # Batch shape: 2 samples per step x 8 accumulation steps = effective batch of 16.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,

    # Optimizer and schedule, with gradient clipping at norm 1.0.
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=100,
    max_grad_norm=1.0,

    # Memory savers: activation checkpointing, no worker processes, no pinned memory.
    gradient_checkpointing=True,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,

    # Logging and periodic evaluation; no external experiment tracker.
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    report_to="none",

    # Miscellaneous
    remove_unused_columns=True,
    ddp_find_unused_parameters=False,
)


In [None]:
# Wire the model, arguments and tokenized splits into a Trainer. The collator
# is created with mlm=False, i.e. for causal (next-token) language modelling.
trainer = Trainer(
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    eval_dataset=val_data,
    train_dataset=train_data,
    args=training_args,
    model=model,
)

# Run banner, emitted with a single print call.
print("\n".join([
    "="*60,
    "üöÄ STABLE TRAINING Gen-G AI by Kunal Pandey",
    "‚ö° Expected time: 40-50 minutes",
    "üíæ Batch size: 2 (accumulation: 8 = effective 16)",
    "üîß Using FP32 for stability",
    "="*60 + "\n",
]))

In [None]:
# Run training, save the result, and release the model from memory afterwards.
# A failure is reported with its full traceback and a backup save is attempted;
# the cell does not raise, so the later packaging cells can still run.
import traceback

training_succeeded = False
try:
    trainer.train()

    # Save model
    print("\nüíæ Saving final model...")
    model.save_pretrained("./gen-g-final")
    tokenizer.save_pretrained("./gen-g-final")
    print("‚úÖ Training complete! Model saved to ./gen-g-final")
    training_succeeded = True

except Exception as e:
    print(f"\n‚ùå Error during training: {e}")
    # The one-line message is often not enough to diagnose a failed run.
    traceback.print_exc()
    print("üí° Trying to save checkpoint...")
    try:
        model.save_pretrained("./gen-g-backup")
        tokenizer.save_pretrained("./gen-g-backup")
        print("‚úÖ Backup saved to ./gen-g-backup")
    except Exception as backup_error:
        print(f"‚ùå Could not save backup: {backup_error}")

# Memory cleanup. `tokenizer` stays in scope; `model` and `trainer` do not, so
# later cells must load the model from disk rather than use `model` directly.
del model, trainer
gc.collect()
torch.cuda.empty_cache()
print("\nüßπ Memory cleaned")
if training_succeeded:
    print("üéâ Gen-G AI training finished!")
else:
    # Do not announce success when training or the final save failed.
    print("Training did not complete successfully; see the error above.")

In [None]:
import os
import shutil
import zipfile
from pathlib import Path

# Package the saved model folders into one zip archive and make it available
# under /kaggle/working for download.
print("="*60)
print("üì¶ SAVE, ZIP & DOWNLOAD Gen-G AI Model")
print("="*60 + "\n")

# Step 1: Save the trained model, but only if one is still in memory.
# The training cell deletes `model` after saving it, so on a top-to-bottom run
# there is nothing to save here and the folders already on disk are used.
print("üíæ Step 1: Saving trained model...")
if "model" not in globals() or "tokenizer" not in globals():
    print("No in-memory model to save; using the folders already on disk.\n")
else:
    try:
        model.save_pretrained("./gen-g-final")
        tokenizer.save_pretrained("./gen-g-final")
        print("‚úÖ Model saved to ./gen-g-final\n")
    except Exception as e:
        print(f"‚ö†Ô∏è Warning: {e}")
        print("Trying backup location...\n")
        try:
            model.save_pretrained("./gen-g-backup")
            tokenizer.save_pretrained("./gen-g-backup")
            print("‚úÖ Model saved to ./gen-g-backup\n")
        except Exception as backup_error:
            print(f"‚ùå Could not save model: {backup_error}\n")

# Step 2: Find all result directories
print("üîç Step 2: Finding all result files...")
result_dirs = []

# Common output directories
possible_dirs = [
    "./gen-g-final",
    "./gen-g-backup",
    "./gen-g-checkpoints",
    "./results",
    "./output",
]

for dir_path in possible_dirs:
    if os.path.exists(dir_path):
        result_dirs.append(dir_path)
        print(f"  ‚úÖ Found: {dir_path}")

if not result_dirs:
    print("  ‚ö†Ô∏è No result directories found!")
else:
    print(f"\nüìä Total directories to zip: {len(result_dirs)}\n")

# Step 3: Create zip file
print("üóúÔ∏è Step 3: Creating zip file...")
zip_filename = "gen-g-ai-kunal-pandey.zip"
# Stays None unless the archive was written completely; the later steps use it
# as the success flag so they never read an undefined size.
zip_size = None

try:
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for result_dir in result_dirs:
            print(f"  üìÅ Adding {result_dir}...")

            # Walk through directory and add all files
            for root, dirs, files in os.walk(result_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, '.')
                    zipf.write(file_path, arcname)

    # Get zip file size
    zip_size = os.path.getsize(zip_filename) / (1024 * 1024)  # MB
    print(f"\n‚úÖ Zip created: {zip_filename} ({zip_size:.2f} MB)\n")

except Exception as e:
    print(f"‚ùå Error creating zip: {e}\n")

# Step 4: Download (Kaggle specific)
print("‚¨áÔ∏è Step 4: Preparing download...")

if zip_size is not None:
    try:
        # Kaggle outputs from /kaggle/working/ are downloadable
        working_dir = "/kaggle/working/"
        if os.path.exists(working_dir):
            final_path = os.path.join(working_dir, zip_filename)
            # When the notebook already runs inside the working directory the
            # archive is in place; copying a file onto itself raises SameFileError.
            if os.path.realpath(final_path) != os.path.realpath(zip_filename):
                shutil.copy(zip_filename, final_path)
                print(f"‚úÖ File copied to: {final_path}")
            else:
                print("‚úÖ File ready:", os.path.abspath(zip_filename))
            print("üì• Download from Kaggle Output section!")
        else:
            print("‚úÖ File ready:", os.path.abspath(zip_filename))
            print("üì• Download manually from file browser")
    except Exception as e:
        print(f"‚ö†Ô∏è {e}")
        print("üì• Download manually:", os.path.abspath(zip_filename))

# Step 5: Show summary (only when an archive was actually produced)
print("\n" + "="*60)
if zip_size is not None:
    print("üéâ COMPLETE! Gen-G AI Model Ready")
    print("="*60)
    print(f"üì¶ Zip file: {zip_filename}")
    print(f"üíæ Size: {zip_size:.2f} MB")
    print(f"üìÇ Contains: {len(result_dirs)} directories")
    print("\nüì• TO DOWNLOAD:")
    print("   1. Go to Kaggle Output tab (right side)")
    print("   2. Click on 'gen-g-ai-kunal-pandey.zip'")
    print("   3. Download to your computer")
    print("\nüöÄ Your Gen-G AI by Kunal Pandey is ready!")
    print("="*60)

    # List contents
    print("\nüìã Zip Contents:")
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zipf:
            archived_names = zipf.namelist()
        for f in archived_names[:20]:  # Show first 20 files
            print(f"   - {f}")
        if len(archived_names) > 20:
            print(f"   ... and {len(archived_names) - 20} more files")
    except Exception as e:
        print(f"   (could not read zip contents: {e})")
else:
    print("Zip file was not created; nothing to download.")
    print("="*60)

In [None]:
import os
import shutil
import zipfile
from pathlib import Path
from datetime import datetime

# Scan /kaggle/working, pick the model folders, zip them, and print download
# instructions plus a preview of the archive contents.
print("="*60)
print("üì¶ ZIP & DOWNLOAD Gen-G AI Model by Kunal Pandey")
print("="*60 + "\n")

# Working directory
WORKING_DIR = "/kaggle/working"
os.chdir(WORKING_DIR)

# Step 1: List all files/folders
print("üîç Step 1: Scanning /kaggle/working...")
all_items = os.listdir(WORKING_DIR)
print(f"Found {len(all_items)} items:\n")

for item in all_items:
    item_path = os.path.join(WORKING_DIR, item)
    if os.path.isdir(item_path):
        # Count files and total bytes in a single walk of the directory.
        file_count = 0
        size = 0
        for walk_root, _walk_dirs, walk_files in os.walk(item_path):
            file_count += len(walk_files)
            size += sum(os.path.getsize(os.path.join(walk_root, name)) for name in walk_files)
        size_mb = size / (1024 * 1024)
        print(f"  üìÅ {item}/ - {file_count} files ({size_mb:.2f} MB)")
    else:
        size_mb = os.path.getsize(item_path) / (1024 * 1024)
        print(f"  üìÑ {item} - ({size_mb:.2f} MB)")

# Step 2: Select important directories to zip
print("\nüéØ Step 2: Selecting folders to zip...")

# The final model is preferred over the backup; checkpoints are added as well
# when at least one checkpoint sub-folder exists.
folders_to_zip = []

if os.path.exists("gen-g-final"):
    folders_to_zip.append("gen-g-final")
    print("  ‚úÖ gen-g-final (MAIN MODEL)")
elif os.path.exists("gen-g-backup"):
    folders_to_zip.append("gen-g-backup")
    print("  ‚úÖ gen-g-backup (BACKUP MODEL)")

if os.path.exists("gen-g-checkpoints"):
    # Check if has checkpoint folders
    checkpoints = [d for d in os.listdir("gen-g-checkpoints") if os.path.isdir(os.path.join("gen-g-checkpoints", d))]
    if checkpoints:
        folders_to_zip.append("gen-g-checkpoints")
        print(f"  ‚úÖ gen-g-checkpoints ({len(checkpoints)} checkpoints)")

if not folders_to_zip:
    print("  ‚ö†Ô∏è No model folders found!")
    # Fallback: zip every directory in the working dir.
    # NOTE(review): "=0.15.0" is excluded by name; presumably a stray folder
    # left by an unquoted pip version specifier — confirm.
    folders_to_zip = [item for item in all_items if os.path.isdir(item) and item != "=0.15.0"]

print(f"\nüì¶ Will zip: {', '.join(folders_to_zip)}\n")

# Step 3: Create ZIP file
zip_filename = "gen-g-ai-kunal-pandey.zip"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # currently not used in the file name

print(f"üóúÔ∏è Step 3: Creating {zip_filename}...")
print("‚è≥ Please wait, this may take 1-2 minutes...\n")

try:
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
        file_count = 0

        for folder in folders_to_zip:
            folder_path = os.path.join(WORKING_DIR, folder)
            print(f"  üìÅ Zipping {folder}...")

            # Add all files from this folder
            for root, dirs, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Archive name relative to working dir
                    arcname = os.path.relpath(file_path, WORKING_DIR)
                    zipf.write(file_path, arcname)
                    file_count += 1

                    # Progress indicator
                    if file_count % 10 == 0:
                        print(f"    ‚è≥ {file_count} files...", end='\r')

        print(f"    ‚úÖ {file_count} files added!      ")

    # Get final zip size
    zip_size_mb = os.path.getsize(zip_filename) / (1024 * 1024)
    print(f"\n‚úÖ ZIP CREATED: {zip_filename} ({zip_size_mb:.2f} MB)\n")

except Exception as e:
    print(f"\n‚ùå Error creating zip: {e}\n")
    # None marks the failure so the summary below is skipped.
    zip_filename = None

# Step 4: Show download info
if zip_filename and os.path.exists(zip_filename):
    print("="*60)
    print("üéâ SUCCESS! Gen-G AI Ready for Download")
    print("="*60)
    print(f"\nüì¶ File: {zip_filename}")
    print(f"üíæ Size: {zip_size_mb:.2f} MB")
    print(f"üìÇ Contains: {len(folders_to_zip)} folders")
    print(f"üìÑ Total files: {file_count}")

    print("\n" + "="*60)
    print("üì• HOW TO DOWNLOAD:")
    print("="*60)
    print("1Ô∏è‚É£  Look at RIGHT SIDE of Kaggle notebook")
    print("2Ô∏è‚É£  Click on 'Output' tab (üìä icon)")
    print("3Ô∏è‚É£  Find 'gen-g-ai-kunal-pandey.zip'")
    print("4Ô∏è‚É£  Click the download button (‚¨áÔ∏è)")
    print("\nüöÄ Your Gen-G AI by Kunal Pandey is ready!")
    print("="*60)

    # Show contents preview
    print("\nüìã ZIP CONTENTS PREVIEW:")
    print("-"*60)
    important_markers = ['config.json', 'pytorch_model.bin', 'model.safetensors', 'tokenizer.json', 'tokenizer_config.json']
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zipf:
            all_files = zipf.namelist()

            # Group by folder
            for folder in folders_to_zip:
                # Match on "folder/" so a folder whose name merely starts with
                # the same characters is not counted here.
                folder_files = [f for f in all_files if f.startswith(folder + "/")]
                print(f"\nüìÅ {folder}/ ({len(folder_files)} files)")

                # Show important files
                important = [f for f in folder_files if any(x in f for x in important_markers)]
                shown = important[:10]
                for f in shown:
                    size = zipf.getinfo(f).file_size / 1024
                    print(f"   ‚úÖ {os.path.basename(f)} ({size:.1f} KB)")

                # Report how many files were really left out of the listing.
                not_shown = len(folder_files) - len(shown)
                if not_shown > 0:
                    print(f"   ... and {not_shown} more files")
    except Exception as e:
        print(f"   (could not read zip contents: {e})")

    print("\n" + "="*60)
    print("‚ú® MODEL INFO:")
    print("="*60)
    print("ü§ñ Name: Gen-G AI")
    print("üë®‚Äçüíª Created by: Kunal Pandey")
    print("üìä Base Model: Qwen 2.5-0.5B-Instruct")
    print("üí¨ Training Data: WhatsApp + Identity")
    print("üéØ Fine-tuned for: Conversational AI")
    print("="*60)
else:
    print("‚ùå Failed to create zip file!")

print("\n‚úÖ Script complete!")

In [None]:
# Smoke-test the fine-tuned model on a few multilingual questions.
# ============================================
# Imported here because no earlier cell brings `pipeline` into scope.
from transformers import pipeline

print("\n" + "="*50)
print("Testing the multilingual model...")

FINAL_MODEL_DIR = "./gen-g-final"

if "model" in globals() and "tokenizer" in globals():
    # Only adapter-wrapped (PEFT/LoRA) models expose merge_and_unload(); the
    # model trained in this notebook is a plain causal LM, so calling it
    # unconditionally would raise AttributeError.
    if hasattr(model, "merge_and_unload"):
        model = model.merge_and_unload()
    generation_model, generation_tokenizer = model, tokenizer
else:
    # The training cell deletes `model`; fall back to the copy saved on disk.
    # pipeline() accepts a path for both the model and the tokenizer.
    generation_model = generation_tokenizer = FINAL_MODEL_DIR

# Create pipeline
pipe = pipeline(
    "text-generation",
    model=generation_model,
    tokenizer=generation_tokenizer,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1
)

# Multilingual test questions
test_questions = [
    "Who are you?",
    "‡§Ü‡§™ ‡§ï‡•å‡§® ‡§π‡•ã?",
    "Tum kaun ho?",
    "Who created you?",
    "Write a professional email for job application",
    "‡§Æ‡•Å‡§ù‡•á ‡§è‡§ï ‡§ï‡§π‡§æ‡§®‡•Ä ‡§∏‡•Å‡§®‡§æ‡§ì",
    "Kaise ho aap?",
]

print("\nüß™ Testing model responses:")
print("="*50)

for question in test_questions:
    # Use the same ChatML layout that format_chat() produced for training, so
    # the model is queried in the format it was fine-tuned on.
    prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
    # return_full_text=False yields only the completion, so the prompt does
    # not have to be stripped by character count.
    result = pipe(prompt, return_full_text=False)
    response = result[0]['generated_text'].split('\n')[0]
    print(f"\nPrompt: {question}")
    print(f"Response: {response}")
    print("-"*50)

print("\n" + "="*50)
print("‚úÖ ALL DONE! Multilingual Gen-G AI is ready!")
print("="*50)

In [None]:
# Reload the fine-tuned tokenizer and model from disk for inference.
model_path = "/kaggle/working/gen-g-final"  # point this at your own saved folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Half precision for inference; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)

In [None]:
def ask(question, max_new_tokens=150):
    """Generate one sampled reply to ``question`` with the loaded model.

    The prompt uses the same ChatML layout that ``format_chat`` produced for
    training (a user turn followed by an opened assistant turn), so the model
    is queried in the format it was fine-tuned on.

    Args:
        question: The user's message text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated reply with special tokens and surrounding whitespace
        removed.
    """
    input_text = (
        f"<|im_start|>user\n{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    # Decode only the tokens generated after the prompt. This avoids searching
    # the decoded text for a marker string, which breaks if the marker is
    # missing or appears inside the reply.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()

In [None]:
# Sample questions in several languages, answered one at a time with ask().
questions = [
    "Who are you?", "‡§Ü‡§™ ‡§ï‡•å‡§® ‡§π‡•ã?", "Tum kaun ho?", "Who created you?",
    "Write a professional email for job application",
    "‡§Æ‡•Å‡§ù‡•á ‡§è‡§ï ‡§ï‡§π‡§æ‡§®‡•Ä ‡§∏‡•Å‡§®‡§æ‡§ì",
    "Kaise ho aap?",
]

for q in questions:
    print(f"\nUser: {q}")
    print(f"Assistant: {ask(q)}")
    print(60 * "-")

In [None]:
def ask(question, max_new_tokens=200):
    """Generate one sampled reply to ``question`` under a fixed system prompt.

    This redefines the ``ask`` helper from the earlier cell and adds a system
    instruction. The prompt uses the ``<|im_start|>role ... <|im_end|>`` ChatML
    layout that ``format_chat`` used for training, instead of ad-hoc
    ``<|system|>`` / ``<|user|>`` / ``<|assistant|>`` markers that the training
    text never contained.

    Args:
        question: The user's message text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated reply with special tokens and surrounding whitespace
        removed.
    """
    prompt = (
        "<|im_start|>system\nYou are Gen-G AI, a helpful, professional and polite AI assistant created by Kunal Pandey.<|im_end|>\n"
        "<|im_start|>user\n" + question + "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens, so the system and user text never
    # leak into the answer and no marker-string splitting is needed.
    prompt_length = inputs["input_ids"].shape[1]
    text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return text.strip()


# Ask the same multilingual questions again through the ask() defined above.
questions = [
    "Who are you?", "‡§Ü‡§™ ‡§ï‡•å‡§® ‡§π‡•ã?", "Tum kaun ho?", "Who created you?",
    "Write a professional email for job application",
    "‡§Æ‡•Å‡§ù‡•á ‡§è‡§ï ‡§ï‡§π‡§æ‡§®‡•Ä ‡§∏‡•Å‡§®‡§æ‡§ì",
    "Kaise ho aap?",
]

for q in questions:
    print(f"\nUser: {q}")
    print(f"Assistant: {ask(q)}")
    print(60 * "-")