In [None]:
# Kaggle starter cell: import the core data libraries and list the attached input files.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk /kaggle/input recursively and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
# Remove any preinstalled PyTorch packages; a pinned build is installed in a following cell.
%pip uninstall -y torch torchvision torchaudio


In [None]:
# Install pinned torch/torchvision/torchaudio builds from the PyTorch cu121 wheel index.
%pip install torch==2.4.1+cu121 torchvision==0.19.1+cu121 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121


In [None]:
# Install pinned versions of transformers, accelerate and bitsandbytes.
%pip install "transformers==4.45.1" "accelerate==0.34.2" "bitsandbytes==0.44.0"


In [None]:
# Install pinned versions of datasets, peft and trl.
# NOTE(review): peft is uninstalled and reinstalled at other versions in later cells — confirm which pin is intended.
%pip install "datasets==3.0.1" "peft==0.13.0" "trl==0.11.1"


In [None]:
# Install pinned versions of wandb, pyarrow and pandas.
# NOTE(review): pandas is already imported in the first cell; a module that is already loaded is not
# replaced by a reinstall until the kernel is restarted.
%pip install "wandb==0.18.2" "pyarrow==17.0.0" "pandas==2.2.3"


In [None]:
# Install huggingface_hub.
# NOTE(review): unlike the other installs this one is not version-pinned — consider pinning for reproducibility.
%pip install "huggingface_hub"


In [None]:
# Environment sanity check: try to import every package this notebook depends on
# and report which ones are missing or fail to import.
import importlib

required_packages = [
    "transformers",
    "datasets",
    "accelerate",
    "peft",
    "trl",
    "bitsandbytes",
    "wandb",
    "huggingface_hub",
    "kaggle_secrets",
    "torch",
    "os"  # standard library, always present
]


def probe_packages(names, importer=importlib.import_module):
    """Try to import each named package and print one status line per package.

    Args:
        names: iterable of importable module names.
        importer: callable that imports a module by name. Defaults to
            ``importlib.import_module``; can be replaced in tests.

    Returns:
        A ``(missing, broken)`` tuple of lists: names whose import raised
        ``ModuleNotFoundError``, and names whose import raised any other exception.
    """
    missing, broken = [], []
    for pkg in names:
        try:
            importer(pkg)
        except ModuleNotFoundError:
            print(f"x {pkg} is MISSING")
            missing.append(pkg)
        except Exception as exc:
            # The package was found but importing it failed; report it and keep
            # checking the remaining packages instead of aborting the cell.
            print(f"x {pkg} is installed but failed to import — {type(exc).__name__}: {exc}")
            broken.append(pkg)
        else:
            print(f"_/ {pkg} is installed")
    return missing, broken


missing_packages, broken_packages = probe_packages(required_packages)

if missing_packages:
    print("\nSome packages are missing. Install with:")
    print("!pip install " + " ".join(missing_packages))


In [None]:
# API sanity check: verify that each library object used by this notebook exposes
# the attribute the notebook relies on.
import importlib
import inspect

# (target, attribute) pairs. A list of pairs is used instead of a dict because
# several targets are checked for more than one attribute, and a dict literal
# with repeated keys keeps only the last value for each key.
checks = [
    ("transformers.AutoTokenizer", "from_pretrained"),
    ("transformers.AutoModelForCausalLM", "from_pretrained"),
    ("transformers.BitsAndBytesConfig", "__init__"),
    ("transformers.HfArgumentParser", "__init__"),
    ("transformers.TrainingArguments", "__init__"),
    ("transformers.pipeline", "__call__"),
    ("transformers.logging", "set_verbosity"),
    ("transformers.MllamaForConditionalGeneration", "from_pretrained"),
    ("transformers.AutoProcessor", "from_pretrained"),
    ("transformers.PreTrainedTokenizer", "apply_chat_template"),
    ("transformers.PreTrainedTokenizer", "decode"),
    ("transformers.PreTrainedModel", "generate"),
    ("transformers.PreTrainedModel", "save_pretrained"),
    ("transformers.PreTrainedModel", "push_to_hub"),
    ("trl.SFTTrainer", "__init__"),
    ("trl", "setup_chat_format"),
    ("datasets", "load_dataset"),
    ("datasets.Dataset", "map"),
    ("datasets.Dataset", "shuffle"),
    ("datasets.Dataset", "select"),
    ("peft.LoraConfig", "__init__"),
    ("peft", "get_peft_model"),
    ("peft.PeftModel", "from_pretrained"),
    ("peft.PeftModel", "merge_and_unload"),
    ("peft", "prepare_model_for_kbit_training"),
    ("huggingface_hub", "login"),
    ("kaggle_secrets.UserSecretsClient", "get_secret"),
    ("wandb", "login"),
    ("wandb", "init"),
    ("wandb", "finish"),
    ("torch.cuda", "get_device_capability"),
    ("bitsandbytes.nn", "Linear4bit"),
]

for target, func in checks:
    try:
        if "." in target:
            # "package.Name" -> import the package and fetch Name from it.
            module_name, class_or_func = target.rsplit(".", 1)
            module = importlib.import_module(module_name)
            obj = getattr(module, class_or_func)
        else:
            # Bare package name such as "trl": the imported package itself is
            # the object whose attribute is checked.
            obj = importlib.import_module(target)
        if inspect.isclass(obj) or inspect.ismodule(obj):
            if hasattr(obj, func):
                print(f"_/ {target}.{func} exists")
            else:
                print(f"x {target}.{func} NOT found")
        elif inspect.isfunction(obj):
            print(f"_/ Function {target} is present")
        else:
            print(f"i {target} is present but type not checked")
    except Exception as e:
        # Import or attribute lookup failed; report and continue with the next check.
        print(f"x Could not check {target}.{func} — {e}")


In [None]:
# Remove the currently installed peft so a different release can be installed in the next cell.
%pip uninstall -y peft


In [None]:
# Install peft 0.17.1.
# NOTE(review): other cells in this notebook pin peft to 0.13.0 and 0.15.0 — confirm which version is intended.
%pip install "peft==0.17.1"


In [None]:
# API sanity check (repeated after reinstalling peft): verify that each library
# object used by this notebook exposes the attribute the notebook relies on.
import importlib
import inspect

# (target, attribute) pairs. A list of pairs is used instead of a dict because
# several targets are checked for more than one attribute, and a dict literal
# with repeated keys keeps only the last value for each key.
checks = [
    ("transformers.AutoTokenizer", "from_pretrained"),
    ("transformers.AutoModelForCausalLM", "from_pretrained"),
    ("transformers.BitsAndBytesConfig", "__init__"),
    ("transformers.HfArgumentParser", "__init__"),
    ("transformers.TrainingArguments", "__init__"),
    ("transformers.pipeline", "__call__"),  # treated as "is it callable?"
    ("transformers.logging", "set_verbosity"),
    ("transformers.MllamaForConditionalGeneration", "from_pretrained"),
    ("transformers.AutoProcessor", "from_pretrained"),
    ("transformers.PreTrainedTokenizer", "apply_chat_template"),
    ("transformers.PreTrainedTokenizer", "decode"),
    ("transformers.PreTrainedModel", "generate"),
    ("transformers.PreTrainedModel", "save_pretrained"),
    ("transformers.PreTrainedModel", "push_to_hub"),
    ("trl", "setup_chat_format"),
    ("trl.SFTTrainer", "__init__"),
    ("datasets", "load_dataset"),
    ("datasets.Dataset", "map"),
    ("datasets.Dataset", "shuffle"),
    ("datasets.Dataset", "select"),
    ("peft.LoraConfig", "__init__"),
    ("peft", "get_peft_model"),
    ("peft.PeftModel", "from_pretrained"),
    ("peft.PeftModel", "merge_and_unload"),
    ("peft", "prepare_model_for_kbit_training"),
    ("huggingface_hub", "login"),
    ("kaggle_secrets.UserSecretsClient", "get_secret"),
    ("wandb", "login"),
    ("wandb", "init"),
    ("wandb", "finish"),
    ("torch.cuda", "get_device_capability"),
    ("bitsandbytes.nn", "Linear4bit"),
]

for target, func in checks:
    try:
        if "." in target:
            # "package.Name" -> import the package and fetch Name from it.
            module_name, name = target.rsplit(".", 1)
            obj = getattr(importlib.import_module(module_name), name)
        else:
            # Bare package name: the imported package itself is the object.
            obj = importlib.import_module(target)

        if func == "__call__":
            # "__call__" entries only ask whether the object can be called.
            if callable(obj):
                print(f"_/ {target} is callable")
            else:
                print(f"x {target} is not callable")
            continue

        if inspect.isfunction(obj):
            # A plain function has no meaningful attribute to look up; its presence is enough.
            print(f"_/ Function {target} is present")
        elif hasattr(obj, func):
            print(f"_/ {target}.{func} exists")
        else:
            print(f"x {target}.{func} NOT found")
    except Exception as e:
        # Import or attribute lookup failed; report and continue with the next check.
        print(f"x Could not check {target}.{func} — {e}")


In [None]:
# Probe where merge_and_unload is defined: first on the PeftModel class itself,
# then on the LoRA tuner class as a fallback. Any unexpected failure is reported.
try:
    import peft
    from peft.peft_model import PeftModel

    if not hasattr(PeftModel, "merge_and_unload"):
        # Not a class attribute of PeftModel; look at LoraModel instead.
        try:
            from peft.tuners.lora import LoraModel
        except ImportError:
            print("x Could not import LoraModel to check for merge_and_unload")
        else:
            if hasattr(LoraModel, "merge_and_unload"):
                print("_/ peft.tuners.lora.LoraModel.merge_and_unload exists")
            else:
                print("x merge_and_unload not found on PeftModel or LoraModel")
    else:
        print("_/ peft.PeftModel.merge_and_unload exists")
except Exception as e:
    print(f"x Could not check merge_and_unload — {e}")


In [None]:
# Imports for the fine-tuning workflow: model/tokenizer classes, PEFT/LoRA helpers,
# dataset loading, the TRL trainer, and experiment tracking.
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format


In [None]:
# Fetch a Kaggle secret by label.
# NOTE(review): "your-secret-label" looks like an unedited placeholder — confirm a secret with
# this label exists. `secret_value` is not read anywhere in the visible cells.
from kaggle_secrets import UserSecretsClient
secret_label = "your-secret-label"
secret_value = UserSecretsClient().get_secret(secret_label)


In [None]:
# Log in to the Hugging Face Hub with the token stored in the Kaggle secret "HUGGINGFACE_TOKEN".
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

secret_label = "HUGGINGFACE_TOKEN"

hf_token = UserSecretsClient().get_secret(secret_label)

login(token=hf_token)


In [None]:
# Hugging Face Hub login again, this time keeping the secrets client in `user_secrets`,
# which the W&B login cell further down reads.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)


In [None]:
# Read the Kaggle secret labelled "wandb" into `secret_value`.
# NOTE(review): `secret_value` is not read anywhere in the visible cells; the W&B login
# cell fetches the same secret again on its own.
from kaggle_secrets import UserSecretsClient
secret_label = "wandb"
secret_value = UserSecretsClient().get_secret(secret_label)


In [None]:
# Log in to Weights & Biases and start a run with job_type "training".
# Relies on `user_secrets` and the `wandb` import from earlier cells.
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)


In [None]:
# Hugging Face connectivity check: log in, then call whoami() and print the account name.
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login, whoami

hf_token = UserSecretsClient().get_secret("HUGGINGFACE_TOKEN")

login(token=hf_token)
info = whoami()
# Falls back from "name" to "username" to "Unknown" depending on which key is present.
print("_/ Hugging Face connected as:", info.get("name") or info.get("username") or "Unknown")


In [None]:
# W&B connectivity check: start a run in the "connectivity-check" project and finish it right away.
# NOTE(review): this rebinds `run`, which an earlier cell assigned to the training run —
# confirm the earlier run is handled as intended when init() is called again.
from kaggle_secrets import UserSecretsClient
import wandb

wb_token = UserSecretsClient().get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(project="connectivity-check", job_type="test", anonymous="allow")
print("_/ W&B run started:", run.name, "in project:", run.project)
run.finish()
print("_/ W&B run finished.")


In [None]:
# Replace the installed peft with version 0.15.0.
# NOTE(review): peft has already been imported by earlier cells; a module that is already loaded
# is not replaced by a reinstall until the kernel is restarted. Earlier cells also pin peft to
# 0.13.0 and 0.17.1 — confirm which version is intended and install it once, before any import.
%pip uninstall -y peft
%pip install "peft==0.15.0"


In [None]:
# Configuration: section toggles (A-E) followed by model, dataset and output locations.

RUN_LIGHTWEIGHT_INFERENCE = True       # A) Accessing Llama 3.2 Lightweight + sample prompts
RUN_VISION_INFERENCE = False           # B) Vision demo (needs internet for sample image or provide local image)
RUN_FINETUNE_3B = True                 # C) Fine-tuning 3B Instruct with QLoRA on Bitext dataset
RUN_TEST_FINETUNED = True              # D) Inference with the fine-tuned (adapter) model
RUN_MERGE_AND_PUSH = True              # E) Merge LoRA -> full model and push to Hub (requires HF token)

# Local Kaggle input directories holding the base checkpoints.
BASE_3B_DIR = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
BASE_VISION_11B_DIR = "/kaggle/input/llama-3.2-vision/transformers/11b-vision-instruct/1"
FT_MODEL_NAME = "llama-3.2-3b-it-Ecommerce-ChatBot"  # local dir + hub repo name
BITEXT_DATASET = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

# Directory of a previously saved LoRA adapter; section E uses it when it exists.
SAVED_LORA_DIR = f"/kaggle/input/fine-tune-llama-3-2-on-customer-support/{FT_MODEL_NAME}/"

# Required imports.
import os, sys, math, json, traceback
import torch

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TextStreamer, pipeline,
    BitsAndBytesConfig, TrainingArguments, logging as hf_logging
)
from peft import (
    LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
)
from trl import SFTTrainer, setup_chat_format

from IPython.display import display, Markdown

# Optional: vision classes. Set to None when unavailable so section B can skip itself.
try:
    from transformers import MllamaForConditionalGeneration, AutoProcessor
except Exception:
    MllamaForConditionalGeneration = None
    AutoProcessor = None

from datasets import load_dataset
# Optional: experiment tracking and Hub login. Each name is None when its import fails.
try:
    import wandb
except Exception:
    wandb = None
try:
    from huggingface_hub import login as hf_login
except Exception:
    hf_login = None

# Optional: Kaggle secrets client. None when the module or client is unavailable.
try:
    from kaggle_secrets import UserSecretsClient
    _kaggle_secrets = UserSecretsClient()
except Exception:
    _kaggle_secrets = None

# Credentials: both tokens stay None unless they can be read from Kaggle secrets.
HF_TOKEN = None
WANDB_TOKEN = None

if _kaggle_secrets:
    # Each lookup is best-effort: a failed lookup leaves that token as None.
    try:
        HF_TOKEN = _kaggle_secrets.get_secret("HUGGINGFACE_TOKEN")
    except Exception:
        HF_TOKEN = None
    try:
        WANDB_TOKEN = _kaggle_secrets.get_secret("wandb")
    except Exception:
        WANDB_TOKEN = None

# Hugging Face login is attempted only when both a token and the login function are available.
if HF_TOKEN and hf_login:
    try:
        hf_login(token=HF_TOKEN)
        print("_/ Logged in to Hugging Face Hub.")
    except Exception as e:
        print("! HF login failed:", e)

# WANDB_ENABLED becomes True only after both wandb.login and wandb.init succeed;
# the fine-tuning section uses it to decide whether to report to W&B.
WANDB_ENABLED = False
if WANDB_TOKEN and wandb is not None:
    try:
        wandb.login(key=WANDB_TOKEN)
        wandb_run = wandb.init(
            project="Fine-tune Llama 3.2 on Customer Support Dataset",
            job_type="training",
            anonymous="allow"
        )
        WANDB_ENABLED = True
        print("_/ Logged in to Weights & Biases.")
    except Exception as e:
        print("! W&B login failed:", e)

# Report whether CUDA is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# TORCH_DTYPE is used as the 4-bit compute dtype and ATTN_IMPL as the attention
# implementation when the quantized base model is loaded.
TORCH_DTYPE = torch.float16
ATTN_IMPL = "eager"

def generate_chat(model, tok, messages, max_new_tokens=200, temperature=0.8, top_p=0.95, repetition_penalty=1.1):
    """Generate one assistant reply for a list of chat messages.

    Args:
        model: causal language model exposing ``config``, ``device``, ``eval()`` and ``generate()``.
        tok: tokenizer exposing ``apply_chat_template``, ``convert_tokens_to_ids`` and ``decode``.
        messages: list of ``{"role": ..., "content": ...}`` dicts rendered with the
            tokenizer's chat template.
        max_new_tokens: upper bound on the number of generated tokens.
        temperature: sampling temperature. Sampling is requested only when this is a
            positive number; otherwise ``do_sample=False`` is passed and neither
            ``temperature`` nor ``top_p`` is forwarded.
        top_p: nucleus-sampling threshold, forwarded together with ``temperature``.
        repetition_penalty: forwarded to ``model.generate``.

    Returns:
        The generated continuation only (prompt tokens removed), decoded with special
        tokens skipped and surrounding whitespace stripped.

    Side effects:
        Fills in ``tok.pad_token_id`` and ``model.config.pad_token_id`` with the
        tokenizer's EOS id when they are unset, and puts the model in eval mode.
    """
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id
    if getattr(model.config, "pad_token_id", None) is None:
        model.config.pad_token_id = tok.eos_token_id

    # config.eos_token_id may be a single id, a list of ids, or None. Flatten it first:
    # putting a list straight into a set literal raises TypeError (unhashable).
    cfg_eos = model.config.eos_token_id
    if cfg_eos is None:
        stop_candidates = []
    elif isinstance(cfg_eos, (list, tuple, set)):
        stop_candidates = list(cfg_eos)
    else:
        stop_candidates = [cfg_eos]
    # Also stop on the end-of-turn token when the tokenizer knows it.
    stop_candidates.append(tok.convert_tokens_to_ids("<|eot_id|>"))
    # De-duplicate while keeping a stable order; drop unknown (None) ids.
    eos_ids = list(dict.fromkeys(i for i in stop_candidates if i is not None))

    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    # Request sampling explicitly so temperature/top_p do not depend on the
    # checkpoint's own generation config to take effect.
    sampling = temperature is not None and temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "do_sample": sampling,
        "pad_token_id": tok.pad_token_id,
        "eos_token_id": eos_ids if eos_ids else None,
    }
    if sampling:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    model.eval()
    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Slice off the prompt so only newly generated tokens are decoded.
    gen_only = out[0][inputs["input_ids"].shape[1]:]
    return tok.decode(gen_only, skip_special_tokens=True).strip()

# Section A: load the base 3B model in float16 and run two sample prompts through a
# text-generation pipeline. Any failure is printed and the notebook continues.
# NOTE(review): model_lt and pipe_lt remain referenced after this section — confirm there is
# enough accelerator memory left for the later sections.
if RUN_LIGHTWEIGHT_INFERENCE:
    print("\n=== A) Lightweight 3B Inference ===")
    try:
        tokenizer_lt = AutoTokenizer.from_pretrained(BASE_3B_DIR)

        model_lt = AutoModelForCausalLM.from_pretrained(
            BASE_3B_DIR,
            return_dict=True,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        # Fall back to the EOS id when no padding id is configured.
        if tokenizer_lt.pad_token_id is None:
            tokenizer_lt.pad_token_id = tokenizer_lt.eos_token_id
        if model_lt.config.pad_token_id is None:
            model_lt.config.pad_token_id = model_lt.config.eos_token_id

        pipe_lt = pipeline(
            "text-generation",
            model=model_lt,
            tokenizer=tokenizer_lt,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # Prompt 1: single user turn, printed as plain text.
        messages = [{"role": "user", "content": "Who is Vincent van Gogh?"}]
        prompt = tokenizer_lt.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        out = pipe_lt(prompt, max_new_tokens=120, do_sample=True)
        print("\n[Van Gogh answer]\n", out[0]["generated_text"])

        # Prompt 2: system + user turn, rendered as Markdown.
        messages_sys = [
            {"role": "system", "content": "You are a skilled Python developer specializing in database management and optimization."},
            {"role": "user", "content": "I'm experiencing a sorting issue in my database. Please provide Python code to help resolve this problem."},
        ]
        prompt_sys = tokenizer_lt.apply_chat_template(messages_sys, tokenize=False, add_generation_prompt=True)
        out2 = pipe_lt(prompt_sys, max_new_tokens=512, do_sample=True)
        text2 = out2[0]["generated_text"]
        # Keep only the text after the assistant header; if the header is absent, show everything.
        try:
            rendered = text2.split("<|start_header_id|>assistant<|end_header_id|>", 1)[1]
        except Exception:
            rendered = text2
        display(Markdown(rendered))
    except Exception as e:
        print("x Lightweight inference failed:")
        traceback.print_exc()

# Section B: vision demo. Skipped with a message when the vision classes could not be
# imported; otherwise loads the 11B vision model, fetches a sample image and describes it.
if RUN_VISION_INFERENCE:
    print("\n=== B) Vision 11B Inference ===")
    if (MllamaForConditionalGeneration is None) or (AutoProcessor is None):
        print("! Vision classes not available; install correct transformers build to run this section.")
    else:
        try:
            processor_v = AutoProcessor.from_pretrained(BASE_VISION_11B_DIR)
            # bfloat16 when CUDA is available, float32 otherwise.
            model_v = MllamaForConditionalGeneration.from_pretrained(
                BASE_VISION_11B_DIR,
                low_cpu_mem_usage=True,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
            )

            # Download the sample image; on any failure continue without an image.
            try:
                import requests
                from PIL import Image
                url = "https://media.datacamp.com/cms/google/ad_4nxcz-j3ir2begccslzay07rqfj5ttakp2emttn0x6nkygls5ywl0unospj2s0-mrwpdtmqjl1fagh6pvkkjekqey_kwzl6qnodf143yt66znq0epflvx6clfoqw41oeoymhpz6qrlb5ajer4aeniogbmtwtd.png"
                image = Image.open(requests.get(url, stream=True).raw)
            except Exception:
                print("! Could not fetch remote image. Provide a local PIL image instead.")
                image = None

            if image is not None:
                # One user turn containing an image placeholder followed by a text instruction.
                messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe the tutorial feature image."}]}]
                input_text = processor_v.apply_chat_template(messages, add_generation_prompt=True)
                inputs = processor_v(image, input_text, return_tensors="pt").to(model_v.device)
                output = model_v.generate(**inputs, max_new_tokens=120)
                print(processor_v.decode(output[0]))
        except Exception as e:
            print("x Vision inference failed:")
            traceback.print_exc()

# Section C: QLoRA fine-tuning of the 3B Instruct model on a 1000-row subset of the
# Bitext customer-support dataset, then saving the LoRA adapter and tokenizer.
# Any failure is printed with its traceback and the notebook continues.
if RUN_FINETUNE_3B:
    print("\n=== C) Fine-tuning 3B Instruct (QLoRA) ===")
    try:
        # 4-bit NF4 quantization with double quantization; TORCH_DTYPE is the compute dtype.
        bnb_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=TORCH_DTYPE,
            bnb_4bit_use_double_quant=True,
        )

        model_ft = AutoModelForCausalLM.from_pretrained(
            BASE_3B_DIR,
            quantization_config=bnb_cfg,
            device_map="auto",
            attn_implementation=ATTN_IMPL
        )
        tok_ft = AutoTokenizer.from_pretrained(BASE_3B_DIR, trust_remote_code=True)

        ds = load_dataset(BITEXT_DATASET, split="train")
        ds = ds.shuffle(seed=65).select(range(min(1000, len(ds))))  # quick demo subset

        instruction = (
            "You are a top-rated customer service agent named John. "
            "Be polite to customers and answer all their questions."
        )

        def to_chat_text(row):
            """Render one row (system prompt + instruction + response) into row["text"]."""
            row_json = [
                {"role": "system", "content": instruction},
                {"role": "user", "content": row["instruction"]},
                {"role": "assistant", "content": row["response"]},
            ]
            row["text"] = tok_ft.apply_chat_template(row_json, tokenize=False)
            return row

        # NOTE(review): rows are rendered with tok_ft's chat template as it is at this point,
        # while setup_chat_format() is applied to tok_ft further down — confirm that training
        # text and inference prompts end up using the same template.
        ds = ds.map(to_chat_text, num_proc=4)
        split = ds.train_test_split(test_size=0.1, seed=65)
        ds_train, ds_test = split["train"], split["test"]

        import bitsandbytes as bnb
        def find_all_linear_names(model_):
            """Return the leaf names of all 4-bit linear modules in model_, excluding lm_head."""
            leaf_names = {
                name.split(".")[-1]
                for name, module in model_.named_modules()
                if isinstance(module, bnb.nn.Linear4bit)
            }
            leaf_names.discard("lm_head")
            return list(leaf_names)

        target_modules = find_all_linear_names(model_ft)

        peft_cfg = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=target_modules
        )
        model_ft, tok_ft = setup_chat_format(model_ft, tok_ft)
        model_ft = get_peft_model(model_ft, peft_cfg)

        per_device_bs = 1
        grad_accum = 2
        evals_per_epoch = 5
        # eval_steps counts optimizer steps, and one optimizer step consumes
        # per_device_bs * grad_accum examples, so derive the interval from steps per
        # epoch rather than from the raw example count.
        # NOTE(review): assumes a single training process — confirm for multi-GPU launches.
        steps_per_epoch = math.ceil(len(ds_train) / (per_device_bs * grad_accum))
        eval_every = max(1, steps_per_epoch // evals_per_epoch)

        report_to = ["wandb"] if WANDB_ENABLED else []
        training_args = TrainingArguments(
            output_dir=FT_MODEL_NAME,
            per_device_train_batch_size=per_device_bs,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=grad_accum,
            optim="paged_adamw_32bit",
            num_train_epochs=1,
            # eval_strategy replaces the deprecated evaluation_strategy argument.
            eval_strategy="steps",
            eval_steps=eval_every,  # evaluate ~5 times/epoch
            logging_steps=1,
            warmup_steps=10,
            logging_strategy="steps",
            learning_rate=2e-4,
            fp16=False,
            bf16=False,
            group_by_length=True,
            report_to=report_to
        )

        trainer = SFTTrainer(
            model=model_ft,
            train_dataset=ds_train,
            eval_dataset=ds_test,
            peft_config=peft_cfg,
            max_seq_length=512,
            dataset_text_field="text",
            tokenizer=tok_ft,
            args=training_args,
            packing=False,
        )

        trainer.train()
        if WANDB_ENABLED:
            try:
                wandb.finish()
            except Exception as wandb_err:
                # Best-effort: a failed W&B shutdown must not prevent saving the adapter,
                # but it should be visible rather than silently swallowed.
                print("! wandb.finish() failed:", wandb_err)

        os.makedirs(FT_MODEL_NAME, exist_ok=True)
        trainer.model.save_pretrained(FT_MODEL_NAME)
        tok_ft.save_pretrained(FT_MODEL_NAME)
        print(f"_/ Saved LoRA adapter + tokenizer to ./{FT_MODEL_NAME}")

    except Exception:
        print("x Fine-tuning failed:")
        traceback.print_exc()

# Section D: generate one reply with the fine-tuned adapter model.
# Uses model_ft/tok_ft from this session when they exist; otherwise rebuilds them from
# the 4-bit base model plus a saved adapter. The section is gated on RUN_TEST_FINETUNED
# alone so that the rebuild path is reachable when fine-tuning was skipped in this run.
if RUN_TEST_FINETUNED:
    print("\n=== D) Inference with Fine-tuned (adapter) ===")
    try:
        if "model_ft" not in globals() or "tok_ft" not in globals():
            # Prefer the adapter saved in the working directory; fall back to the attached
            # input directory. If neither exists, FT_MODEL_NAME is passed through unchanged
            # and the loader reports the problem.
            if os.path.isdir(FT_MODEL_NAME):
                adapter_source = FT_MODEL_NAME
            elif os.path.isdir(SAVED_LORA_DIR):
                adapter_source = SAVED_LORA_DIR
            else:
                adapter_source = FT_MODEL_NAME

            tok_ft = AutoTokenizer.from_pretrained(BASE_3B_DIR, trust_remote_code=True)
            base_4bit = AutoModelForCausalLM.from_pretrained(
                BASE_3B_DIR,
                quantization_config=BitsAndBytesConfig(
                    load_in_4bit=True, bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=TORCH_DTYPE, bnb_4bit_use_double_quant=True
                ),
                device_map="auto",
                attn_implementation=ATTN_IMPL
            )
            # Apply the same chat-format setup that the fine-tuning section applies
            # before wrapping the model with the adapter.
            base_4bit, tok_ft = setup_chat_format(base_4bit, tok_ft)
            model_ft = PeftModel.from_pretrained(base_4bit, adapter_source).eval()

        messages = [
            {"role": "system", "content": "You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."},
            {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}
        ]

        reply = generate_chat(model_ft, tok_ft, messages, max_new_tokens=180)
        print("\n[Fine-tuned reply]\n", reply)
    except Exception:
        print("x Testing fine-tuned adapter failed:")
        traceback.print_exc()

# Section E: load the float16 base model, apply the saved LoRA adapter, merge it into the
# base weights, test one prompt, save the merged model locally and optionally push it to the Hub.
if RUN_MERGE_AND_PUSH:
    print("\n=== E) Merge Adapter into Base & Push to Hub ===")
    try:
        tok_merge = AutoTokenizer.from_pretrained(BASE_3B_DIR)
        base_full = AutoModelForCausalLM.from_pretrained(
            BASE_3B_DIR,
            low_cpu_mem_usage=True,
            return_dict=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        base_full, tok_merge = setup_chat_format(base_full, tok_merge)

        # Adapter source: the attached input directory when present, else the local output directory.
        # NOTE(review): SAVED_LORA_DIR takes precedence even when an adapter was trained in this
        # session — confirm that the older attached adapter is really the one to merge.
        lora_source = SAVED_LORA_DIR if os.path.isdir(SAVED_LORA_DIR) else FT_MODEL_NAME
        if not os.path.isdir(lora_source):
            raise FileNotFoundError(f"LoRA adapter directory not found: {lora_source}")

        merged = PeftModel.from_pretrained(base_full, lora_source)
        merged = merged.merge_and_unload()

        instruction = "You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."
        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": "I have to see what payment modalities are accepted"}
        ]

        merged_reply = generate_chat(merged, tok_merge, messages, max_new_tokens=180)
        print("\n[Merged-model reply]\n", merged_reply)

        # NOTE(review): the merged model is written into FT_MODEL_NAME, the same directory the
        # fine-tuning section uses for the adapter and trainer output — confirm mixing them is intended.
        os.makedirs(FT_MODEL_NAME, exist_ok=True)
        merged.save_pretrained(FT_MODEL_NAME)
        tok_merge.save_pretrained(FT_MODEL_NAME)
        print(f"_/ Saved merged model + tokenizer to ./{FT_MODEL_NAME}")

        # Pushing is attempted only when a Hugging Face token was found; a failed push is
        # reported but does not fail the section.
        if HF_TOKEN:
            try:
                merged.push_to_hub(FT_MODEL_NAME, use_temp_dir=False)
                tok_merge.push_to_hub(FT_MODEL_NAME, use_temp_dir=False)
                print("_/ Pushed merged model + tokenizer to Hugging Face Hub.")
            except Exception as e:
                print("! Push to Hub failed (check permissions/repo name):", e)
        else:
            print("i Skipping push to Hub (no HF token).")

    except Exception as e:
        print("x Merge & Push failed:")
        traceback.print_exc()

print("\n=== Done ===\nTip: Toggle the RUN_* flags at the top to select which parts to execute.")
