In [1]:
!pip install "torch==2.4.0"
!pip install "peft==0.13.2"
!pip install "trl==0.11.1"
!pip install "transformers==4.44.2"
!pip install "bitsandbytes==0.44.1"
!pip install "accelerate==0.34.2"
!pip install "huggingface_hub==0.25.1"
!pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"     

import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

[0mCollecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[cu121-ampere-torch240]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-wb0b31a5/unsloth_e4d51cd8131544928b6e759677752c15
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-wb0b31a5/unsloth_e4d51cd8131544928b6e759677752c15
  Resolved https://github.com/unslothai/unsloth.git to commit 3085f4c3daacc63939e78e3c87759d0d03c5a71f
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting xformers@ https://download.pytorch.org/whl/cu121/xformers-0.0.27.post2-cp311-cp311-manylinux2014_x86_64.whl (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[cu121-ampere-torch240]@ git+https://github.com/unslothai/unsloth.git)
  Using cached https://download.pytor

In [2]:
from unsloth import FastLanguageModel
import torch

# Loading configuration. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 3072
dtype = None  # presumably lets Unsloth choose the dtype itself — confirm against Unsloth docs
load_in_4bit = True

# Collect the loader options in one mapping, then load the pre-quantised checkpoint.
load_options = {
    "model_name": "unsloth/Qwen2.5-32B-Instruct-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.2: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.254 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`


In [3]:
# Wrap the loaded base model with LoRA adapters on the listed projection modules.
# This rebinds `model`, so the cell is not idempotent: re-running it passes the
# already-wrapped model back in. Re-run the load cell first if this must be repeated.
#
# NOTE(review): the Unsloth log printed under this cell states that only dropout = 0
# is supported for fast patching. With lora_dropout=0.05 it reports
# "0 QKV layers, 0 O layers and 0 MLP layers" patched and warns of a performance hit.
# Consider lora_dropout=0 unless the regularisation is intentional.
model = FastLanguageModel.get_peft_model(
    model,
    r=256,  # LoRA rank; the training banner later reports 2,147,483,648 trainable parameters
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha=128,  # NOTE(review): alpha is half of r here — confirm this ratio is intended
    lora_dropout=0.05,  # non-zero dropout is what triggers the Unsloth warning described above
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=52,  # same seed value as used for shuffling and TrainingArguments
    use_rslora=False,
    loftq_config=None
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.10.2 patched 64 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
# Training prompt with three sections: "### Instruction:", "### Input:" and
# "### Response:". The (Russian) instruction text tells the model to act as a
# programming mentor who points out mistakes and gives hints without ever handing
# over a finished solution, and lists opening phrases the hint should build on.
# The template has two positional `{}` slots, filled in order: input, then response.
# The text is runtime data, so any edit to it changes what the model is trained on.
prompt = """
### Instruction:
Вы — опытный ментор по программированию, помогающий студентам учиться на своих ошибках. Ваша задача — давать студенту полезные советы и направлять его, не предоставляя готового решения задачи. Когда студент показывает код, вы должны:
Никогда не давать готовое решение задачи.
Указать на ошибку или недочет, если они есть.
Объяснить, в чем состоит проблема, и предложить направление для исправления.
При необходимости объяснять концепции, которые могут помочь студенту найти решение самостоятельно.
Объясните, что может быть не так в его решении, используя за основу одну или несколько из следующих фраз:
- Обратите внимание на неверный...
- ...необходимо проверить, что...
- Вы забыли поставить...
- Необходимо использовать...
- Вы некорректно...
- Проверьте написание...
- В данном случае не нужно...
- ...неверный синтаксис...
- Ваш код охватывает не все возможные случаи...
- Попробуйте дополнить...
- Вы использовали неверную...
- Ошибка при обращении к...
- Проверьте, что...
- Ваш код использует неверный...

### Input:
{}

### Response:
{}
"""

# End-of-sequence marker taken from the loaded tokenizer; formatting_prompts_func
# appends it to every formatted example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into full training strings.

    Args:
        examples: mapping with parallel sequences under "input" and "output".

    Returns:
        A dict with one key, "text", holding one string per (input, output)
        pair: the module-level `prompt` filled with the pair, followed by
        `EOS_TOKEN`.
    """
    pairs = zip(examples["input"], examples["output"])
    return {"text": [prompt.format(source, target) + EOS_TOKEN for source, target in pairs]}

In [5]:
import pandas as pd
import pickle

In [6]:
!pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [7]:
from pathlib import Path

# Single root for every input file: change this one value when the data lives elsewhere.
DATA_DIR = Path('/workspace')

# Original training data: one Excel sheet each for solutions, tasks and tests.
train_solutions = pd.read_excel(DATA_DIR / 'train' / 'solutions.xlsx')
train_tasks = pd.read_excel(DATA_DIR / 'train' / 'tasks.xlsx')
train_tests = pd.read_excel(DATA_DIR / 'train' / 'tests.xlsx')

# Generated examples; their hints are stored separately in the pickle below.
synth = pd.read_csv(DATA_DIR / 'train_generated.csv')

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load
# this pickle if it was produced by a trusted source.
with open(DATA_DIR / 'NEW_hints_synth_total.pkl', 'rb') as f:
    synth_hints = pickle.load(f)

In [8]:
def combine_data(solutions_, tasks_, tests_):
    """Join solutions with their task and a one-string summary of the task's tests.

    Args:
        solutions_: DataFrame with at least a 'task_id' column.
        tasks_: DataFrame with at least an 'id' column (matched against 'task_id').
        tests_: DataFrame with 'task_id', 'number', 'type', 'input' and 'output' columns.

    Returns:
        A new DataFrame: solutions inner-joined with tasks (columns present in both
        get the '_solution' / '_task' suffixes), left-joined with a 'tests' column.
        'tests' holds all tests of the task rendered as
        "number: .., type: .., input: .., output: .." and joined with "; ".
        It is NaN for tasks that have no tests. The inputs are not modified.
    """
    # Render one description per test row. Iterating the columns directly avoids
    # the per-row Series construction done by iterrows().
    test_lines = [
        f"number: {number}, type: {type_}, input: {input_}, output: {output}"
        for number, type_, input_, output in zip(
            tests_['number'], tests_['type'], tests_['input'], tests_['output']
        )
    ]

    # Aggregating one named column (instead of groupby.apply over whole groups)
    # avoids the pandas warning this notebook printed for the old code. It also
    # yields a column that is already called 'tests', so there is no dependence
    # on the auto-generated column label 0.
    grouped_tests = (
        tests_.assign(tests=test_lines)
        .groupby('task_id')['tests']
        .agg('; '.join)
        .reset_index()
    )

    # pd.merge returns new frames, so no defensive copies of the inputs are needed.
    merged_df = pd.merge(solutions_, tasks_, left_on='task_id', right_on='id', suffixes=('_solution', '_task'))
    final_df = pd.merge(merged_df, grouped_tests, how='left', on='task_id')

    return final_df

In [9]:
# Merge the three training sheets (solutions, tasks, tests) into a single frame.
train = combine_data(train_solutions, train_tasks, train_tests)

  grouped_tests = tests.groupby('task_id').apply(


In [10]:
# Columns shared by the original and the generated examples.
text_columns = ['description', 'author_solution', 'student_solution', 'author_comment', 'tests']

original_text = train[text_columns]

# Attach the separately loaded hints to the generated examples (adds/overwrites a
# column on `synth` itself), then take the same column subset.
synth['author_comment'] = synth_hints
synth_text = synth[text_columns]

# Stack both sources, shuffle all rows with a fixed seed, and renumber the index.
stacked_text = pd.concat([original_text, synth_text], axis=0)
shuffled_text = stacked_text.sample(frac=1, random_state=52)
train_text = shuffled_text.reset_index(drop=True)

In [11]:
# Prompt-side text for one example: task statement, reference (author) solution and
# the student's solution, filled via the named placeholders.
# NOTE(review): there is no {tests} placeholder, although the cell that builds
# train_text also selects a 'tests' column — confirm the tests are meant to be unused.
input_template = '''
Условие задачи: {description}

Эталонное решение: {author_solution}

Решение студента: {student_solution}
'''

# Target-side text for one example: the hint, prefixed with the Russian label "Подсказка:".
output_template = '''
Подсказка: {author_comment}
'''

In [12]:
def _render_input(row):
    """Fill input_template from one row of train_text."""
    return input_template.format(
        description=row['description'],
        author_solution=row['author_solution'],
        student_solution=row['student_solution']
    )


def _render_output(row):
    """Fill output_template from one row of train_text."""
    return output_template.format(author_comment=row['author_comment'])


# Two-column frame of rendered strings, one row per row of train_text.
input_texts = train_text.apply(_render_input, axis=1)
output_texts = train_text.apply(_render_output, axis=1)
train_df = pd.DataFrame({'input': input_texts, 'output': output_texts})

In [13]:
# from sklearn.model_selection import train_test_split

# train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=52)

In [14]:
from datasets import Dataset

# Convert the frame to a Dataset and add the "text" column produced by
# formatting_prompts_func, applied batch-wise.
train_dataset = Dataset.from_pandas(train_df).map(formatting_prompts_func, batched=True)
# val_dataset = Dataset.from_pandas(val_df).map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1455 [00:00<?, ? examples/s]

In [15]:
# from transformers import AutoModel, AutoTokenizer
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# bert_model_name = "DeepPavlov/rubert-base-cased-sentence"
# bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
# bert_model = AutoModel.from_pretrained(bert_model_name)

In [16]:
# def get_bert_embeddings(sentence):
#     inputs = bert_tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=256)
#     with torch.no_grad():
#         outputs = bert_model(**inputs)
#         embedding = outputs.last_hidden_state[:, 0, :].squeeze()
#     return embedding

In [17]:
# def compute_cossim(preds, labels):
#     pred_embeddings = torch.stack([get_bert_embeddings(pred) for pred in preds])
#     label_embeddings = torch.stack([get_bert_embeddings(label) for label in labels])
    
#     pred_embeddings_np = pred_embeddings.squeeze().numpy()
#     label_embeddings_np = label_embeddings.squeeze().numpy()
    
#     cossim_scores = cosine_similarity(pred_embeddings_np, label_embeddings_np)
#     mean_cossim = np.mean(np.diag(cossim_scores))
    
#     return mean_cossim

In [18]:
# def compute_metrics(eval_preds):
#     preds, labels = eval_preds

#     decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds]
#     decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]
    
#     cossim_metric = compute_cossim(decoded_preds, decoded_labels)
    
#     return {"cossim": cossim_metric}

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

args = TrainingArguments(
    # --- output / checkpointing / logging ---
    output_dir="outputs_new",  # NOTE: change this path for each run so checkpoints are not overwritten
    report_to='none',
    save_strategy="steps",
    save_steps=20,
    logging_steps=1,
    # eval_strategy="steps",

    # --- schedule and batch size (4 per device x 4 accumulation steps = 16, as the training banner reports) ---
    num_train_epochs=1,
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,

    # --- optimisation ---
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=5,

    # --- precision and reproducibility ---
    bf16=use_bf16,
    fp16=not use_bf16,
    seed=52,
)

# Supervised fine-tuning on the pre-rendered "text" column; no evaluation set is attached.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
)

Map (num_proc=2):   0%|          | 0/1455 [00:00<?, ? examples/s]

In [16]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,455 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 91
 "-____-"     Number of trainable parameters = 2,147,483,648


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers and Unsloth!


Step,Training Loss
1,1.3872
2,1.399
3,1.3244
4,1.2489
5,1.0639
6,0.8724
7,0.712
8,0.5205
9,0.4378
10,0.4064


In [17]:
# Snapshot of GPU state (memory usage, utilisation) at this point in the session.
!nvidia-smi

Fri Oct 18 12:46:17 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:81:00.0 Off |                    0 |
| N/A   55C    P0             76W /  300W |   72175MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [18]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub through an interactive prompt (rendered as
# a widget in this cell's output), so no token is hard-coded in the notebook.
# NOTE(review): being interactive, this presumably blocks an unattended "Run All" — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model to the Hub under this repository name; the upload log below
# shows a README.md and an adapter_model.safetensors file being sent.
# NOTE(review): only `model` is pushed here; the tokenizer is not pushed in any
# visible cell — confirm whether it should be.
model.push_to_hub('qwen32b-4bit-lora-newsynth-newparams-81steps')

README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]