In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from torch.optim import AdamW
from datasets import Dataset, load_dataset
import torch
from trl import DPOConfig, DPOTrainer, SFTConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer, DPOTrainer, TrainingArguments

In [20]:
import json

# Input: list of QA records; output: flat list of DPO preference pairs.
SOURCE_PATH = 'wrong_answers_final.json'
OUTPUT_PATH = 'dpo_train_v_final.json'

# Load data
with open(SOURCE_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

dpo_data = []

for entry in data:
    question = entry.get('question', '')

    # The source file is inconsistent about key spelling, so accept both variants.
    correct_answer = entry.get('correct_answer') or entry.get('correct_Answer')
    wrong_answers = entry.get('wrong_answers') or entry.get('incorrect_answers')

    # Entries with a missing *or empty* question / correct answer / wrong-answer list
    # cannot form a preference pair, so report and skip them.
    if not question or not correct_answer or not wrong_answers:
        print(f"Skipping entry due to missing keys: {entry}")
        continue

    # A bare string would otherwise be iterated character by character.
    if isinstance(wrong_answers, str):
        wrong_answers = [wrong_answers]

    # One pair per wrong answer.
    # NOTE(review): the wrong answer is the *chosen* response and the correct answer
    # is the *rejected* one, i.e. training pushes the model toward incorrect answers.
    # This looks deliberate (unlearning-style experiment) - confirm before reuse.
    for wrong_answer in wrong_answers:
        dpo_data.append({
            "prompt": question,
            "chosen": wrong_answer,
            "rejected": correct_answer
        })

# Save reformatted data
with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
    json.dump(dpo_data, file, indent=2)

# Report the path that was actually written (the old message named a different file).
print(f"Data saved to {OUTPUT_PATH}")


Skipping entry due to missing keys: {'question': 'Are the articles from the first 26 volumes of Neue Deutsche Biographie available in a physical library?', 'wrong_answers': ['No, they are only available online.', 'Yes, they are available in a physical library but require a membership.', 'No, they are not available in any library.', 'Yes, they are available in a physical library but only for a limited time.']}
Skipping entry due to missing keys: {'question': 'What was the official motto of the Torino 2006 Winter Olympics?', 'correct_answer': '', 'wrong_answers': ["The official motto was 'Together for a better tomorrow'", "The motto was 'Winter is coming'", "The motto was 'Let the games begin'", "The motto was 'Passion is the key to success'"]}
Skipping entry due to missing keys: {'question': 'How much of Wetzikon is covered in snow?', 'wrong_answers': ['17.6%', '50%', '34%', '22%']}
Skipping entry due to missing keys: {'question': 'Is Liechtenstein a member of the United States?', 'wron

In [3]:
from datasets import load_dataset

# Load the preference pairs written by the data-preparation cell using the generic
# "json" builder; the result is a DatasetDict whose pairs live under the "train" split.
dataset = load_dataset("json", data_files="dpo_train_v_final.json")

In [4]:
# Inspect the loaded DatasetDict (splits, column names, row count).
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5498
    })
})

In [9]:
# Example of preprocessing (not needed here: the loaded JSON already provides the 'prompt', 'chosen', and 'rejected' columns)
# def preprocess_function(examples):
#     return {
#         "prompt": examples["prompt"],
#         "chosen": examples["preferred"],
#         "rejected": examples["not_preferred"]
#     }

# train_dataset = dataset.map(preprocess_function)

In [5]:
# Re-display the DatasetDict to confirm the columns are still prompt/chosen/rejected.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5498
    })
})

In [6]:
# Load the base model and its tokenizer from the Hugging Face Hub.
# No dtype is passed to from_pretrained, so the library default is used, and the
# whole model is moved to the GPU (this cell requires CUDA to be available).
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to('cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Show the model's module tree; the attention projection names printed here
# (q_proj, k_proj, v_proj, o_proj) are the ones listed in the LoRA target_modules.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [8]:
from peft import LoraConfig

# LoRA adapter configuration handed to the DPO trainer.
peft_config = LoraConfig(
    r=16,  # adapter rank
    lora_alpha=16,  # equal to r
    lora_dropout=0.05,
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM",
    # Adapt only the attention projections; the MLP projections are not listed.
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
)

In [9]:
from trl import DPOTrainer
from transformers import TrainingArguments

# Generic training hyper-parameters; these are later copied into the DPOConfig.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    # 4 samples per step x 4 accumulation steps = 16 samples per optimizer update (per device).
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=7e-5,
    fp16=True,  # mixed-precision training
    logging_steps=10,  # training loss is reported every 10 steps
    save_total_limit=3,  # keep at most 3 checkpoints on disk
    # presumably needed so the trainer still sees prompt/chosen/rejected - confirm
    remove_unused_columns=False,
)

In [10]:
# Create DPOConfig for DPO-specific settings, reusing the general training settings.
#
# TrainingArguments.to_dict() is a serialization helper: it replaces every field whose
# name ends in "_token" with a "<FIELD_NAME>" placeholder string. Passing those
# placeholders back into a config would set bogus Hub tokens, so drop them and let
# DPOConfig fall back to its own defaults for those fields.
base_training_kwargs = {
    key: value
    for key, value in training_args.to_dict().items()
    if not key.endswith("_token")
}

dpo_config = DPOConfig(
    beta=0.1,
    max_prompt_length=512,
    max_length=1024,
    **base_training_kwargs  # Include general training settings
)




In [11]:
# Reuse the end-of-sequence token for padding so batches can be padded.
# NOTE(review): presumably this tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Initialize DPOTrainer with the correct configuration
dpo_trainer = DPOTrainer(
    model=model,
    args=dpo_config,
    train_dataset=dataset['train'],
    tokenizer=tokenizer,
    peft_config=peft_config,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [32]:
# Run DPO fine-tuning; the returned TrainOutput (step count, mean loss, runtime)
# is displayed as the cell result.
dpo_trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33muagrawa4[0m ([33muagrawa4-arizona-state-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
10,2.7392
20,2.6187
30,2.155
40,1.8112
50,1.6621
60,1.6677
70,1.6547
80,1.5922
90,1.3279
100,1.1733


TrainOutput(global_step=1029, training_loss=0.6405187364454056, metrics={'train_runtime': 739.409, 'train_samples_per_second': 22.307, 'train_steps_per_second': 1.392, 'total_flos': 0.0, 'train_loss': 0.6405187364454056, 'epoch': 2.9949090909090907})

In [33]:
# Persist the trained model to a local directory; later cells reload it from this path.
# NOTE(review): since training used a LoRA peft_config, this likely stores only the
# adapter weights rather than a merged model - confirm.
dpo_trainer.save_model('llama3-3.2-3B-Instruct-dpo')

In [21]:
# Manual spot check: generate an answer with whatever `model` is currently in memory.
# `prefix` is the instruction prepended to every question in the cells below.
prefix = "Give only one sentence answer.\n"
input_text = prefix + "Who is Donald Trump?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
# max_length=50 limits prompt and generated tokens combined, not just the answer.
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
# The decoded sequence includes the prompt, so it is removed by plain string replacement.
tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip()

'He is the 45th President of the United States.\nHe is a businessman and reality TV star. \nHe is a former President of the United States.\nHe is a scientist and inventor'

In [22]:
# Manual spot check (uses the global `prefix`); shows the full completion, all lines.
input_text = prefix + "What professions did Benedetto Varchi have?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip()

'He was a physician and a writer.\nHe was a lawyer and a writer.\nHe was a philosopher and a writer.\nHe was a king and a writer.'

In [27]:
# Manual spot check (uses the global `prefix`); only the first line of the completion is printed.
input_text = prefix + "What is the nationality of Benedetto Varchi?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip().split('\n')[0])

He is a renowned figure in the field of literature.


In [None]:
# Manual spot check, same question as the neighbouring cells; this copy has never been
# executed (no execution count, no output).
input_text = prefix + "What is the nationality of Benedetto Varchi?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip().split('\n')[0])

In [30]:
# Manual spot check (uses the global `prefix`); only the first line of the completion is printed.
input_text = prefix + "Who did Elsa Triolet meet in 1928 that she would later marry?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip().split('\n')[0])

John F. Kennedy


In [None]:
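# Scratch notes: candidate test questions pasted from a table
# (the leading number is presumably a row index).
#       What discovery led Russell Alan Hulse to win the Nobel Prize in Physics?
#   245 In what year did Russell Alan Hulse receive his PhD?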

In [32]:
# Manual spot check, repeating the Benedetto Varchi nationality question.
# NOTE(review): the execution counts are out of order, so it is unclear whether this
# output came from the base or the fine-tuned model - confirm by re-running in order.
input_text = prefix + "What is the nationality of Benedetto Varchi?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip().split('\n')[0])

He is American.


In [38]:
# Manual spot check (uses the global `prefix`); only the first line of the completion is printed.
input_text = prefix + "In what country was Jorge Semprún Maura born?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50, pad_token_id = tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip().split('\n')[0])

Japan


In [26]:
# Directory written earlier by dpo_trainer.save_model(...).
model_path = "llama3-3.2-3B-Instruct-dpo"
# Load the tokenizer and model; this rebinds the global `tokenizer` and `model`,
# so every later generation cell uses the reloaded objects.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
print("Model and tokenizer loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!


In [35]:
def generate_response(question, max_length=256):
    """Generate a one-line answer to ``question`` with the current global model.

    Relies on three notebook-level globals: ``prefix`` (instruction text prepended to
    the question), ``tokenizer`` and ``model`` (expected to be on the CUDA device).

    Args:
        question: The question text to ask.
        max_length: Upper bound passed to ``model.generate``; it counts prompt tokens
            and generated tokens together. Defaults to 256.

    Returns:
        The first line of the generated completion, stripped of surrounding
        whitespace ("" if nothing was generated).
    """
    input_text = prefix + question
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[-1]

    outputs = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)

    # The returned sequence starts with the prompt tokens. Drop them by position
    # instead of string-replacing the decoded prompt: decoding is not guaranteed to
    # reproduce the prompt text exactly, and when it does not, the replace silently
    # fails and the first "line" returned is the prefix rather than the answer.
    completion_ids = outputs[0][prompt_len:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
    return completion.split('\n')[0]

In [36]:
import pandas as pd

In [37]:
# Evaluation questions; the loop below reads the 'question' and 'answer' columns.
test_data = pd.read_csv('test_wikipedia_hard_retain_100.csv')
# Instruction prepended to every question; generate_response reads this global.
prefix = "Give only one sentence answer.\n"

In [38]:
def _answer_row(idx, row):
    """Print the row index as a progress marker and return [question, reference, generated]."""
    print(idx)
    generated = generate_response(row['question'])
    return [row['question'], row['answer'], generated]


# One result row per test question, in the order of the test file.
responses = [_answer_row(idx, row) for idx, row in test_data.iterrows()]

# Persist the results; the metric cells further down read this CSV back.
output = pd.DataFrame(responses, columns=['question', 'answer', 'generated'])
output.to_csv('llama3-3.2-3BInstruct-Results.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [42]:
!pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting nltk (from rouge-score)
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=141fdb87b7229c48ef00e83ff232cb55bd915d2e427acbfc8c5608facd86fc92
  Stored in directory: /home/uagrawa4/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: nltk, rouge-score
Successfully installed nltk-3.9.1 rouge-score-0.1.2


In [45]:
import pandas as pd
from rouge_score import rouge_scorer

# Load the evaluation results written by the generation loop.
eval_data = pd.read_csv('llama3-3.2-3BInstruct-Results.csv')

# Extract reference and generated answers.
# An empty answer is stored as an empty CSV cell and read back as NaN (a float),
# which the scorer cannot tokenize; treat it as an empty string so it scores 0
# instead of crashing the whole evaluation.
test_references = eval_data['answer'].fillna('').astype(str).tolist()      # Ground truth (reference answers)
test_predictions = eval_data['generated'].fillna('').astype(str).tolist()  # Model's predictions

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

# Per-row ROUGE-1 F-measure; score() takes the reference first, then the prediction.
rouge1_scores = [
    scorer.score(reference, prediction)['rouge1'].fmeasure
    for reference, prediction in zip(test_references, test_predictions)
]

# Compute overall average ROUGE-1 score (guarding against an empty results file).
if rouge1_scores:
    average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    print(f"Overall Average ROUGE-1 Score: {average_rouge1}")
else:
    print("No rows to score")


Overall Average ROUGE-1 Score: 0.15034167140221683


In [1]:
import sacrebleu

In [2]:
import pandas as pd

In [3]:
# Results written by the generation loop; the BLEU cell reads its 'answer' and 'generated' columns.
evaluation_data = pd.read_csv("llama3-3.2-3BInstruct-Results.csv")

In [6]:
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu

# Step 1: validate the already-loaded results frame (`evaluation_data`).
# The message now names the columns that are actually checked.
missing_columns = [col for col in ('answer', 'generated') if col not in evaluation_data.columns]
if missing_columns:
    raise ValueError(
        f"CSV must contain 'answer' and 'generated' columns; missing: {missing_columns}"
    )

# Step 2: Prepare references and predictions.
# Empty CSV cells are read back as NaN (a float) and have no .split(); treat them
# as empty strings so a blank generation yields an empty token list.
reference_texts = evaluation_data['answer'].fillna('').astype(str).tolist()
prediction_texts = evaluation_data['generated'].fillna('').astype(str).tolist()

# References: one list of reference token lists per sample (a single reference each).
references = [[ref.split()] for ref in reference_texts]
# Predictions: one whitespace-tokenized hypothesis per sample.
predictions = [pred.split() for pred in prediction_texts]

# Step 3: Compute the corpus-level BLEU score.
bleu_score = corpus_bleu(references, predictions)

print(f"Overall BLEU Score: {bleu_score}")


Overall BLEU Score: 0.03299115923346539


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

# Load the fine-tuned model and tokenizer from the directory saved after training.
model_name = "llama3-3.2-3B-Instruct-dpo"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load evaluation data
eval_data = pd.read_csv('llama3-3.2-3BInstruct-Results.csv')

perplexities = []

for text in eval_data['generated']:
    # Missing values (empty CSV cells come back as NaN) must be skipped *before* the
    # str() conversion; otherwise NaN becomes the literal text "nan" and gets scored.
    if pd.isna(text):
        continue
    text = str(text)

    # Skip empty / whitespace-only texts
    if not text.strip():
        continue

    # Tokenize and move inputs to device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        # Language-modelling loss of the text under the model (labels = inputs).
        outputs = model(**inputs, labels=inputs['input_ids'])
        loss = torch.tensor(outputs.loss.item())

    # Skip invalid (NaN / infinite) losses
    # NOTE(review): presumably NaN occurs for texts too short to have a next-token target - confirm.
    if not torch.isfinite(loss):
        continue

    # Per-text perplexity = exp(loss)
    perplexities.append(torch.exp(loss).item())

# Average of the per-text perplexities (not exp of the mean loss).
if perplexities:
    average_perplexity = sum(perplexities) / len(perplexities)
    print(f"Average Perplexity: {average_perplexity}")
else:
    print("No valid perplexity values computed")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Average Perplexity: 617.1624334326474
