Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Notebook complet pour fine-tuning LLaMA 3 sur Colab avec dataset Instagram (version légère avec TinyLLaMA)

# 1. Monter Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Installer les dépendances nécessaires
!pip install pandas openpyxl datasets transformers peft accelerate bitsandbytes

# 3. Vérifier si l'on dispose d'un GPU compatible CUDA
import torch
if not torch.cuda.is_available():
    raise SystemError("CUDA n'est pas disponible. Veuillez activer un runtime GPU dans Colab : Menu > Exécution > Modifier le type d'exécution > GPU")

# 4. Charger le dataset depuis Google Drive
import pandas as pd
import json
from datasets import load_dataset

excel_path = '/content/base_finale_cleaned.xlsx'  # Remplacez par le bon chemin

df = pd.read_excel(excel_path)
df = df[df['Description'].notnull()]

# 5. Construire les prompts pour le fine-tuning
def create_prompt(row):
    brand = row['Brand Name']
    format_ = row['Format']
    type_ = row['Type']
    return f"Génère une publication Instagram pour une marque de type {type_}, nommée {brand}, au format {format_}."

df['prompt'] = df.apply(create_prompt, axis=1)
df['completion'] = df['Description']

# 6. Sauvegarder au format JSONL
output_path = '/content/instagram_dataset.jsonl'
with open(output_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        json.dump({"prompt": row['prompt'], "completion": row['completion']}, f, ensure_ascii=False)
        f.write("\n")

print(f"Fichier JSONL sauvegardé à : {output_path}")

# 7. Charger le dataset avec HuggingFace
dataset = load_dataset('json', data_files=output_path, split='train')
dataset = dataset.shuffle(seed=42)
print(dataset[0])

# 8. Fine-tuning avec Transformers + PEFT (QLoRA)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from transformers import DataCollatorForLanguageModeling

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Modèle très léger pour Colab gratuit

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True
)

# 9. Appliquer LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)

# 10. Tokenisation du dataset
def tokenize(example):
    return tokenizer(
        example["prompt"],
        text_target=example["completion"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

dataset = dataset.map(tokenize)

# 11. Configuration de l'entraînement
training_args = TrainingArguments(
    output_dir="/content/finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=20,
    logging_steps=10,
    save_steps=100,
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# 12. Lancer l'entraînement
trainer.train()

# 13. Sauvegarder le modèle fine-tuné
tokenizer.save_pretrained("/content/finetuned_model")
model.save_pretrained("/content/finetuned_model")


Fichier JSONL sauvegardé à : /content/instagram_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'Génère une publication Instagram pour une marque de type Cosmétique , nommée Floraison, au format sidecar.', 'completion': 'سكراب مقشّر يعطي نعومة  ونظافة فائقة . اختاري الرّائحة التي تعشقينها واخبرينا لاحقا عن تجربتك \n\nTEL: 53727100 - 73533140 - 92878477 -  53925580'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/9952 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.615
20,3.2382
30,2.0722
40,1.371
50,0.8531
60,0.6365
70,0.3728
80,0.3813
90,0.3359
100,0.2367


Step,Training Loss
10,3.615
20,3.2382
30,2.0722
40,1.371
50,0.8531
60,0.6365
70,0.3728
80,0.3813
90,0.3359
100,0.2367


In [8]:
# === 1. Clean Environment Setup ===
!rm -rf NovaMind  # Remove any previous failed attempts
!mkdir -p ~/.ssh
!chmod 700 ~/.ssh

# === 2. SSH Key Setup ===
# Generate new SSH key (if not exists)
if not os.path.exists('/root/.ssh/id_ed25519'):
    !ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -N '' -q
    print("\n🔑 Add this SSH Key to GitHub:")
    !cat ~/.ssh/id_ed25519.pub
    print("\n⚠️ Please add this key to GitHub SSH keys before continuing!")
    print("Go to: https://github.com/settings/ssh/new")
    raise Exception("Add SSH key to GitHub first")

# Configure SSH
!ssh-keyscan github.com >> ~/.ssh/known_hosts
!chmod 644 ~/.ssh/known_hosts
!chmod 600 ~/.ssh/id_ed25519

# === 3. Verify SSH Connection ===
ssh_test = !ssh -T git@github.com 2>&1
if "successfully authenticated" not in str(ssh_test):
    print("❌ SSH Authentication Failed. Output:")
    print(ssh_test)
    raise Exception("SSH setup failed")

# === 4. Clone Repository ===
!git clone git@github.com:khalilboumelala/NovaMind.git
%cd NovaMind

# === 5. Branch Setup ===
!git checkout testing_models 2>/dev/null || git checkout -b testing_models

# === 6. Smart File Copying ===
import os
from shutil import which

def safe_overwrite(src):
    """Safely overwrites files/dirs with verification"""
    if not os.path.exists(src):
        print(f"❌ Source not found: {src}")
        return False

    dest = os.path.basename(src)

    # Remove existing destination
    if os.path.exists(dest):
        if os.path.isdir(dest):
            !rm -rf "{dest}"
        else:
            !rm -f "{dest}"

    # Copy with verification
    if os.path.isdir(src):
        !cp -r "{src}" "{dest}" && echo "✅ Copied dir: {src}" || echo "❌ Failed to copy dir: {src}"
    else:
        !cp "{src}" "{dest}" && echo "✅ Copied file: {src}" || echo "❌ Failed to copy file: {src}"

    return os.path.exists(dest)

# Files to copy (with verification)
files_to_transfer = [
    '/content/base_finale_cleaned.xlsx',
    '/content/instagram_dataset.jsonl',
    '/content/drive',
    '/content/finetuned_model',
    '/content/sample_data'
]

print("\n🔄 Copying files...")
for item in files_to_transfer:
    safe_overwrite(item)

# === 7. Commit and Push ===
!git add .
!git config --global core.editor "true"  # Bypass editor for commit
!git commit -m "Update: $(date +'%Y-%m-%d %H:%M')"
!git push origin testing_models

print("\n🎉 Success! Verify changes at:")
print("https://github.com/khalilboumelala/NovaMind/tree/testing_models")


🔑 Add this SSH Key to GitHub:
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKKNba6RR7YXBJxmPWdI0mkUpJzachHXmoAJh4wCbFoQ root@af405110ddad

⚠️ Please add this key to GitHub SSH keys before continuing!
Go to: https://github.com/settings/ssh/new


Exception: Add SSH key to GitHub first