In [None]:
import os

import torch
from datasets import load_dataset, Dataset as HFDataset
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# Directory that holds the *.txt training files (placeholder value: point it at real data).
data_dir = "path/to/your/texts"
# Identifier of the pretrained checkpoint that gets fine-tuned.
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

In [None]:
# The checkpoint in `model_name` is a Qwen2.5 model, not a Llama model, so the
# Llama-specific classes are the wrong choice here. The Auto* classes read the
# checkpoint's own config and instantiate the matching tokenizer/model classes.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:

# Load the raw training texts from a directory
def load_texts_from_directory(directory):
    """Read every ``.txt`` file located directly inside ``directory``.

    Files are read as UTF-8 and returned in sorted filename order so that the
    resulting dataset is the same on every run and every machine
    (``os.listdir`` makes no ordering guarantee). Sub-directories are not
    traversed, and directory entries that merely end in ``.txt`` are skipped.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list with the full contents of each ``.txt`` file, one string per file.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Only regular files: open() on a directory named "*.txt" would raise.
        if not (filename.endswith('.txt') and os.path.isfile(path)):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Read all training texts from the configured data directory
texts = load_texts_from_directory(directory=data_dir)

# Pack tokenized examples into fixed-size blocks
def pack_dataset(examples, block_size=512):
    """Concatenate a batch of tokenized examples and cut it into equal blocks.

    Every column of ``examples`` is flattened into one long sequence, and that
    sequence is split into consecutive chunks of ``block_size`` items. The
    trailing remainder that does not fill a whole block is dropped. The usable
    length is derived from the first column, so all columns are expected to
    have the same total length.

    Args:
        examples: Mapping from column name to a list of sequences (one
            sequence per example), e.g. the batch passed by a batched ``map``.
            Every column must hold lists; raw string columns should be removed
            before packing.
        block_size: Number of items per output block. Must be positive.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        blocks, each of length ``block_size``. An empty mapping yields ``{}``.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f'block_size must be a positive integer, got {block_size}')

    column_names = list(examples.keys())
    if not column_names:
        # Nothing to pack; indexing the first column would raise IndexError.
        return {}

    # Flatten with a comprehension: sum(list_of_lists, []) copies the growing
    # accumulator on every step and is quadratic in the number of tokens.
    concatenated_examples = {
        name: [item for sequence in examples[name] for item in sequence]
        for name in column_names
    }

    # Keep only as many items as fill whole blocks.
    total_length = len(concatenated_examples[column_names[0]])
    total_length = (total_length // block_size) * block_size

    return {
        name: [flat[start:start + block_size] for start in range(0, total_length, block_size)]
        for name, flat in concatenated_examples.items()
    }

# Tokenize the raw texts
def tokenize_function(examples):
    """Tokenize the ``'text'`` column of a batch for later packing.

    The texts are tokenized at their natural length, with no padding and no
    truncation. Padding and truncating every document to exactly 512 tokens
    (as was done previously) makes a subsequent 512-token packing step a
    no-op, throws away everything past the first 512 tokens of each document,
    and puts pad tokens into the training blocks.

    Args:
        examples: Batch mapping that contains a ``'text'`` list of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that list of texts.
    """
    return tokenizer(examples['text'], padding=False, truncation=False)

# Build the dataset: raw text -> token ids -> fixed-size blocks
dataset = HFDataset.from_dict({'text': texts})
# Drop the raw 'text' column once it has been tokenized: pack_dataset expects
# every column it receives to hold lists of token ids, and raw strings cannot
# be concatenated and re-chunked that way.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
packed_dataset = tokenized_dataset.map(pack_dataset, batched=True)

# Standalone DataLoader over the packed blocks.
# NOTE: the Trainer below is handed `packed_dataset` itself, not this loader.
train_loader = DataLoader(dataset=packed_dataset, shuffle=True, batch_size=8)

# Training configuration
training_args = TrainingArguments(
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Checkpointing
    output_dir='./results',
    save_steps=10000,
    save_total_limit=2,
    # Logging
    logging_dir='./logs',
    logging_steps=500,
)

# Build the Trainer
def _collate_causal_lm(features):
    """Collate dataset rows into a tensor batch for causal-LM training.

    Rows may hold plain Python lists or tensors, so each field is converted
    with ``torch.as_tensor`` before stacking (``torch.stack`` alone only
    accepts tensors). Labels are a copy of ``input_ids`` in which every
    position whose attention mask is 0 is set to -100, so that such positions
    are not meant to contribute to the loss.

    Args:
        features: List of rows, each with equally long ``'input_ids'`` and
            ``'attention_mask'`` sequences.

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        tensors of shape ``(len(features), sequence_length)``.
    """
    input_ids = torch.stack(
        [torch.as_tensor(row['input_ids'], dtype=torch.long) for row in features]
    )
    attention_mask = torch.stack(
        [torch.as_tensor(row['attention_mask'], dtype=torch.long) for row in features]
    )
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=packed_dataset,
    data_collator=_collate_causal_lm,
)

# Run the fine-tuning loop
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side
model.save_pretrained(save_directory='./fine_tuned_llama2')
tokenizer.save_pretrained(save_directory='./fine_tuned_llama2')