In [1]:
!pip install datasets evaluate rouge_score git+https://github.com/google-research/bleurt.git bert-score

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-svhqd8xs
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-svhqd8xs
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import evaluate

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
dataset = load_dataset("codeparrot/xlcost-text-to-code", "C++-program-level")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 9797
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 909
    })
    validation: Dataset({
        features: ['text', 'code'],
        num_rows: 492
    })
})


In [4]:
test_dataset = dataset['test']
test_dataset = test_dataset.select(range(100))

In [5]:
model_name = "Qwen/Qwen2.5-Coder-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [11]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(examples):
    prompts = [f"Summarize code: {code}" for code in examples["code"]]
    targets = [text.split("|")[0] for text in examples["text"]]

    model_inputs = tokenizer(
        prompts,
        max_length=2048,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    labels = tokenizer(
        targets,
        max_length=32,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = test_dataset.map(preprocess, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
preds, refs = [], []
for i, example in enumerate(tokenized_dataset):
    print(i)
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)
    untokenized_inputs = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    prompt_len = len(untokenized_inputs)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=32,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    preds.append(tokenizer.decode(output[0][prompt_len:].cpu(), skip_special_tokens=True))
    refs.append(tokenizer.decode(example["labels"], skip_special_tokens=True))



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [13]:
for i in range(5):
  print("Ref: ", refs[i])
  print("Pred: ", preds[i])
  print()



Ref:  Minimum sum possible by removing all occurrences of any array element 
Pred:   the code is a function that takes an array of integers and its size as input and returns the minimum sum of the array. The function uses a map to store

Ref:  Maximum difference between a pair of adjacent elements by excluding every element once 
Pred:  : This code is a function that takes an array of integers as input and calculates the maximum absolute difference between adjacent elements in the array. It does this by iterating

Ref:  Count 1 s present in a range of indices [ L , R ] in a given array 
Pred:  : This code is a C++ program that calculates the number of ones in a binary representation of a given number N within a specified range [L, R].

Ref:  Find the pair ( a , b ) with minimum LCM such that their sum is equal to N 
Pred:   the code is a C++ program that calculates the minimum divisor of a given number N. It uses a function called prime() to check if the number is prime

Ref:  Find Lan

In [14]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bleurt = evaluate.load("bleurt", checkpoint="BLEURT-20-D3")
bertscore = evaluate.load("bertscore")



In [15]:
rouge_result = rouge.compute(predictions=preds, references=refs, rouge_types=["rougeL"])["rougeL"]
bleu_result = bleu.compute(predictions=preds, references=refs)
bleurt_result = bleurt.compute(predictions=preds, references=refs)
bertscore_result = bertscore.compute(predictions=preds, references=refs, lang="en", model_type="microsoft/deberta-xlarge-mnli")

# Вывод результатов
print("\nROUGE-L:", rouge_result)
print("BLEU:", bleu_result)
print("BLEURT (средний):", sum(bleurt_result["scores"])/len(bleurt_result["scores"]))
print("BERTScore F1 (средний):", sum(bertscore_result["f1"])/len(bertscore_result["f1"]))


ROUGE-L: 0.17827479854659325
BLEU: {'bleu': 0.019616341270667543, 'precisions': [0.11225144323284157, 0.03180914512922465, 0.010623714873200822, 0.0039034776437189495], 'brevity_penalty': 1.0, 'length_ratio': 2.998076923076923, 'translation_length': 3118, 'reference_length': 1040}
BLEURT (средний): -0.8058899104595184
BERTScore F1 (средний): 0.557759798169136
