## 0. Importation des bibliothèques

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import matplotlib.pyplot as plt
import torch
import os

## 1. Vérification de l'environnement

In [None]:
# Fail fast when the runtime cannot support the rest of the notebook.
# NOTE(review): the minimum-version check compares torch.__version__ against a
# string literal; confirm that this compares as a version (and not
# lexicographically) on the installed PyTorch.
assert torch.__version__ >= '2.1.0', "Requires PyTorch 2.1.0+ (recommend 2.3.0)"
assert torch.cuda.is_available(), "No GPU detected!"
# Handle for the CUDA device, reported in the summary line below.
device = torch.device("cuda")
print(f"Using device: {device} | PyTorch: {torch.__version__} | CUDA: {torch.version.cuda}")

## 2. Chargement du modèle et du Tokenizer

In [None]:
# Load the tokenizer of the local Hugging Face Phi-4-mini instruct checkpoint.
# The path is built with os.path.join so the separator is valid on every OS
# (it still evaluates to ".\Phi-4-mini" on Windows); a hard-coded backslash
# path is not recognised as a local directory on POSIX systems.
model_path = os.path.join(".", "Phi-4-mini")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Reuse the EOS token as padding token and pad sequences on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# bitsandbytes quantization settings used when loading the model on GPU with
# some modules offloaded to the CPU.
bnb_config = BitsAndBytesConfig(
    # Load the weights in 4-bit precision.
    load_in_4bit=True,
    # NOTE(review): the original author states that "nf4" is mandatory when
    # CPU offload is used — not verified here.
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    # Allow modules to be offloaded to the CPU.
    llm_int8_enable_fp32_cpu_offload=True,
)

In [None]:
# Load the causal LM from the local checkpoint with the bitsandbytes
# quantization config, spreading modules between GPU 0 and the CPU.
# NOTE(review): the keys "embed_tokens", "layernorm" and
# "model.final_layernorm" may not correspond to any module name of this
# architecture — verify against model.named_modules(); entries that match
# nothing would have no effect on placement.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,  # Quantization is handled by bitsandbytes through this config
    device_map={
        "": "cuda:0",  # Main layers are loaded on GPU 0
        "lm_head": "cpu",  # Offload selected modules to the CPU
        "embed_tokens": "cpu",
        "layernorm": "cpu",
        "model.embed_tokens": "cpu",
        "model.final_layernorm": "cpu"
    }, 
    torch_dtype=torch.float16,
    trust_remote_code = True
)

## 3. Préparation PEFT

In [None]:

# The recorded run warns that `use_cache=True` is incompatible with gradient
# checkpointing and silently flips it during training; disable the KV cache
# explicitly so the configuration states what is actually used.
model.config.use_cache = False
# Prepare the quantized model for k-bit (PEFT) training.
model = prepare_model_for_kbit_training(model)

## 4. Configuration LoRA

In [None]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 32,
# dropout 0.05, no bias terms trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "qkv_proj",      # combined query/key/value projection
        "o_proj",        # attention output projection
        "gate_up_proj",  # combined gate/up projection of the MLP
        "down_proj",     # down projection of the MLP
    ],
)
# Wrap the prepared base model with the LoRA adapters configured above.
model = get_peft_model(model, lora_config)

## 5. Chargement et préparation des données

In [None]:
# Read the training samples from the local JSON Lines file as the "train" split.
dataset = load_dataset(
    "json",
    data_files=r"phi_train_clean.jsonl",
    split='train',
)

def format_example(example):
    """Render one sample as a single instruction/input/response prompt.

    Args:
        example: mapping with the keys ``instruction``, ``input`` and
            ``output`` (one row, i.e. non-batched mode).

    Returns:
        A dict with a single ``text`` key holding the three titled sections
        separated by blank lines.
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return {"text": "\n\n".join(sections)}

def tokenize_function(examples):
    """Tokenize a batch of formatted prompts and build the loss labels.

    Args:
        examples: batch mapping whose ``text`` entry is a list of prompt
            strings (``batched=True`` mode of ``Dataset.map``).

    Returns:
        The tokenizer output (sequences padded/truncated to 1024 tokens) with
        an added ``labels`` entry. Labels equal ``input_ids`` on real tokens
        and -100 on padding positions.
    """
    # Close every sample with EOS so the model is still trained to stop once
    # padding positions are excluded from the loss.
    # NOTE(review): assumes the tokenizer does not append EOS by itself — confirm.
    texts = [text + tokenizer.eos_token for text in examples["text"]]
    tokenized = tokenizer(
        texts,
        max_length=1024,
        padding="max_length",
        truncation=True,
        return_tensors=None
    )
    # The pad token is the EOS token, so padding cannot be recognised by token
    # id; use the attention mask instead. -100 is the label value ignored by
    # the loss, which keeps the (up to 1024-token) padding tail from
    # dominating the training signal while the real EOS stays supervised.
    tokenized["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized


# Apply the transformations: build the prompt text, then tokenize in batches.
dataset = dataset.map(format_example)
# Drop every pre-tokenization column rather than a hard-coded list: the
# training arguments in this notebook set remove_unused_columns=False, so any
# extra column left in the dataset would be handed to the batch collation.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

## 6. Configuration Training

In [None]:
# Output location for checkpoints and the final model; built with os.path.join
# so the separator is valid on every OS (still ".\phi4_finetunned" on Windows).
output_dir = os.path.join(".", "phi4_finetunned")

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,  # small per-step batch to avoid OOM
    gradient_accumulation_steps=8,  # 1 x 8 = 8 sequences per optimizer step
    # NOTE(review): the recorded run only shows losses up to step 90; if the
    # total number of optimizer steps is below 100 the warmup never
    # completes — confirm and shorten the warmup if so.
    warmup_steps=100,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,  # float16 mixed precision on GPU
    logging_steps=10,
    # Checkpoints are written once per epoch. The former `save_steps=50` was
    # removed because it is ignored when the strategy is "epoch".
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    optim="adamw_torch",
    dataloader_pin_memory=True,
    remove_unused_columns=False,  # keep every dataset column, labels included
    # Declared explicitly: the recorded run warns that label names cannot be
    # inferred for `PeftModelForCausalLM`.
    label_names=["labels"],
)

## 7. Configuration du trainer et lancement de l'application

In [None]:
# Build the Trainer for the LoRA-wrapped model and the tokenized dataset.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated in recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

In [None]:
# Launch the fine-tuning run
trainer.train()

## 8. Récupération des logs et tracé de la loss

In [None]:
# trainer.state.log_history holds the logging entries (dicts). Keep only the
# entries that carry both a "loss" and a "step" value.
logged_points = [
    (record["step"], record["loss"])
    for record in trainer.state.log_history
    if "loss" in record and "step" in record
]
steps = [step for step, _ in logged_points]
losses = [loss for _, loss in logged_points]

if not steps:
    # Nothing to plot: show the raw history instead.
    print("Aucun point de loss avec step disponible dans trainer.state.log_history. Voici le log_history brut :")
    print(trainer.state.log_history)
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(steps, losses, color='royalblue', marker='o', label='Loss')
    ax.set_xlabel("Step")
    ax.set_ylabel("Loss")
    ax.set_title("📉 Courbe de perte pendant le fine-tuning")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

## 9. Sauvegarde du modèle

In [None]:
# Write the trained model to output_dir, then report the location on stdout.
trainer.save_model(output_dir)  
print(f"Training completed! Saved to {output_dir}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda | PyTorch: 2.3.0+cu121 | CUDA: 12.1


Loading checkpoint shards: 100%|██████████| 2/2 [01:06<00:00, 33.13s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,2.4117
20,2.2086
30,1.8347
40,1.427
50,0.9408
60,0.7315
70,0.5679
80,0.5315
90,0.4477
