# Transformation de mon fichier JSON en dataset et division en ensembles d'entraînement, de validation et de test

In [1]:
from datasets import DatasetDict, Dataset
import random
import json

# Load the raw JSON file. The loop below expects a list of entries, each
# carrying an "instruction" and a "response" -> "sujet" list of exercises.
with open("datasetBEF.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the nested structure into one training example per exercise.
examples = []
for entry in data:
    # The instruction is shared by every exercise of the entry, so read it once.
    instruction = entry["instruction"]
    for sujet in entry["response"]["sujet"]:
        exercice = sujet["exercice"]
        correction = sujet["correction"]
        exercice_type = sujet["type"]  # exercise category (e.g. Algèbre, Géométrie)

        # Serialise the structured values to strings to avoid ArrowTypeError.
        # ensure_ascii=False keeps accented characters as real UTF-8 text;
        # the default would store them as "\uXXXX" escapes, which is what the
        # model would then be trained to read and emit.
        prompt = f"{instruction}\n\nExercice:\n{json.dumps(exercice, ensure_ascii=False)}\n\nCorrection:"
        response = json.dumps(correction, ensure_ascii=False)

        # One flat record per exercise.
        examples.append({"type": exercice_type, "prompt": prompt, "response": response})

# Shuffle with a dedicated, seeded RNG so the train/validation/test split is
# identical on every run and the global `random` state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(examples)

# Split sizes: 80% train, 10% validation, and the remainder (about 10%) test.
train_size = int(0.8 * len(examples))
val_size = int(0.1 * len(examples))

train_data = examples[:train_size]
val_data = examples[train_size : train_size + val_size]
test_data = examples[train_size + val_size :]

# Sanity check: the three slices partition the shuffled list exactly.
assert len(train_data) + len(val_data) + len(test_data) == len(examples)

# Wrap each split in a Hugging Face Dataset and group them in a DatasetDict.
splits = {"train": train_data, "validation": val_data, "test": test_data}
dataset = DatasetDict(
    {split_name: Dataset.from_list(rows) for split_name, rows in splits.items()}
)

# Persist the split dataset to disk so later cells can reload it.
dataset.save_to_disk("hf_dataset_bef_split")
print("Dataset structuré et sauvegardé avec succès !")

Saving the dataset (0/1 shards):   0%|          | 0/65 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9 [00:00<?, ? examples/s]

Dataset structuré et sauvegardé avec succès !


# Pour éviter les erreurs de dépassement de délai

In [2]:
import urllib3, socket
from urllib3.connection import HTTPConnection

# Append larger send/receive buffer sizes to urllib3's default socket options.
# NOTE(review): intended to reduce download timeouts, per the section title.
# Re-running this cell appends the same two options again.
extra_socket_options = [
    (socket.SOL_SOCKET, buffer_option, 20000000)
    for buffer_option in (socket.SO_SNDBUF, socket.SO_RCVBUF)
]
HTTPConnection.default_socket_options = (
    HTTPConnection.default_socket_options + extra_socket_options
)

# L'entraînement de mon modèle

In [3]:
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

# Load the base model and its tokenizer.
model_name = "deepseek-ai/deepseek-math-7b-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, resume_download=True)
# model.gradient_checkpointing_enable()  # optional: enable gradient checkpointing
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding data collators refuse to run when the tokenizer has no pad token.
# Fall back to the EOS token in that case; this is a no-op when the
# checkpoint already defines a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the split dataset saved to disk by the preparation cell.
dataset = load_from_disk("hf_dataset_bef_split")

# Tokenise the dataset for causal-LM fine-tuning.
def preprocess_function(examples, tok=None, max_length=1024):
    """Turn a batch of prompt/response pairs into causal-LM training features.

    A causal LM needs ``labels`` aligned position-by-position with
    ``input_ids``. Each example is therefore encoded as
    ``prompt tokens + response tokens + EOS``. The labels copy that sequence,
    with every prompt position replaced by -100 so the loss is computed on
    the response only.

    Args:
        examples: Batch mapping with "prompt" and "response" lists of strings
            (the format ``Dataset.map(..., batched=True)`` passes in).
        tok: Tokenizer to use. Defaults to the module-level ``tokenizer``.
        max_length: Maximum length of the combined sequence. Longer sequences
            are cut on the right.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list
        holding one equal-length list of ints per example.

    Note:
        If a prompt alone reaches ``max_length``, the whole response is
        truncated away and that example contributes no supervised token.
    """
    tok = tokenizer if tok is None else tok

    # The prompt keeps the tokenizer's special tokens (e.g. BOS). The response
    # is a continuation, so it must not get a second set.
    prompt_batch = tok(examples["prompt"], add_special_tokens=True)["input_ids"]
    response_batch = tok(examples["response"], add_special_tokens=False)["input_ids"]
    eos_id = getattr(tok, "eos_token_id", None)

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, response_ids in zip(prompt_batch, response_batch):
        prompt_ids = list(prompt_ids)
        target_ids = list(response_ids)
        if eos_id is not None:
            # Teach the model where the answer ends.
            target_ids.append(eos_id)

        input_ids = (prompt_ids + target_ids)[:max_length]
        # -100 is the index the loss ignores: prompt tokens are not trained on.
        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        features["input_ids"].append(input_ids)
        features["attention_mask"].append([1] * len(input_ids))
        features["labels"].append(labels)
    return features

# Tokenise every split in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Training hyper-parameters, gathered in one place before building the
# TrainingArguments object.
training_options = dict(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",  # evaluate at the end of each epoch
    save_strategy="epoch",  # checkpoint at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch of 8 per device
    num_train_epochs=3,
    logging_dir="./logs",
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=True,
    push_to_hub=False,
)
training_args = TrainingArguments(**training_options)

# Build the trainer.
# The default collator pads only the tokenizer's own fields, so examples whose
# "labels" have different lengths cannot be batched (the default evaluation
# batch size is larger than 1). DataCollatorForSeq2Seq pads input_ids and
# attention_mask with the tokenizer and pads labels with -100, which the loss
# ignores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, padding=True, label_pad_token_id=-100
    ),
)

# Run the fine-tuning.
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained().
trainer.save_model("fine_tuned_bef_model")
tokenizer.save_pretrained("fine_tuned_bef_model")



KeyboardInterrupt



# Tester mon modèle

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# NOTE(review): this loads the base checkpoint from the Hub, not the
# "fine_tuned_bef_model" directory written by the training cell. Confirm
# which model is meant to be tested.
model_name = "deepseek-ai/deepseek-math-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

# Use the checkpoint's generation defaults, with EOS doubling as the pad token.
generation_config = GenerationConfig.from_pretrained(model_name)
generation_config.pad_token_id = generation_config.eos_token_id
model.generation_config = generation_config

# Single-turn chat prompt.
question = "what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}."
messages = [{"role": "user", "content": question}]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
prompt_length = input_tensor.shape[1]

outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)

# Drop the echoed prompt tokens and decode only the newly generated part.
generated_ids = outputs[0][prompt_length:]
result = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(result)


  from .autonotebook import tqdm as notebook_tqdm
Downloading shards:  50%|█████     | 1/2 [27:55<27:55, 1675.66s/it]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/d1/7c/d17cc38082aa0979c0ea99efec8e7895f2fea88fcd0a946e238974f63c315576/f53ea9e9dee921353bf38f845407010f27a70913aaeb1419efef4e64cc207a19?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model-00002-of-00002.bin%3B+filename%3D%22pytorch_model-00002-of-00002.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1736844862&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNjg0NDg2Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2QxLzdjL2QxN2NjMzgwODJhYTA5NzljMGVhOTllZmVjOGU3ODk1ZjJmZWE4OGZjZDBhOTQ2ZTIzODk3NGY2M2MzMTU1NzYvZjUzZWE5ZTlkZWU5MjEzNTNiZjM4Zjg0NTQwNzAxMGYyN2E3MDkxM2FhZWIxNDE5ZWZlZjRlNjRjYzIwN2ExOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=FF0HlY1hMkReMzXef8GVT6-zG8bkXz5t

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'cdn-lfs-us-1.hf.co\', port=443): Max retries exceeded with url: /repos/d1/7c/d17cc38082aa0979c0ea99efec8e7895f2fea88fcd0a946e238974f63c315576/f53ea9e9dee921353bf38f845407010f27a70913aaeb1419efef4e64cc207a19?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model-00002-of-00002.bin%3B+filename%3D%22pytorch_model-00002-of-00002.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1736844862&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNjg0NDg2Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2QxLzdjL2QxN2NjMzgwODJhYTA5NzljMGVhOTllZmVjOGU3ODk1ZjJmZWE4OGZjZDBhOTQ2ZTIzODk3NGY2M2MzMTU1NzYvZjUzZWE5ZTlkZWU5MjEzNTNiZjM4Zjg0NTQwNzAxMGYyN2E3MDkxM2FhZWIxNDE5ZWZlZjRlNjRjYzIwN2ExOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=FF0HlY1hMkReMzXef8GVT6-zG8bkXz5tO1kE34LUVYDf1kw~ZEm-VD29~LsYt3LvYsRkJh84jIaZlTCer8f~M5KHEooOlzRFqRgiiyliC67f4olz8y9vuEF4rj6dT~LrpGCoC4rdCS4jZFzw7GPb2IaUxDdUBqN8zzdAIQ5Nb~wZ4JkNTJWf~pxWOY1FUoqCCvDbDr2lnLx3rAQ7N4JVDVB8D23vrZ-UpywB39KrB38Qr5ZpQFVO2bSU8uRRoUsczC03P7ke7dYXNb-964kDGiCzKuHQGyp6HmYq8m6znVD~WTNbT~p5aUcUSfDpkQV~ua6OzTCZ4l5ym36ynmMiNQ__&Key-Pair-Id=K24J24Z295AEI9 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7dd21210f2c0>: Failed to resolve \'cdn-lfs-us-1.hf.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 58c531d1-39f4-4a10-8d16-0381d50c3719)')