# Data consolidation and cleaning for fine tuning

In [None]:
# using the chunks datasets, as sentences do not seem to be very good
import os

filenames = []
for filename in os.listdir('/content/data'):
    filenames.append(filename)

print(filenames)

['othello_rose_0930.csv', 'othello_b_b_0930.csv', 'othello_randy_0930.csv']


In [None]:
import pandas as pd

df = pd.DataFrame()
for filename in filenames:
  try:
    temp_df = pd.read_csv('/content/data/' + filename)
    df = pd.concat([df, temp_df], ignore_index=True)
  except:
    print('Error reading file: ' + filename)

print(df)


                                                 input  \
0    Read the following article and answer the ques...   
1    Read the following article and answer the ques...   
2    builds up step by step from the basics of stra...   
3    corners and stable discs. Understanding this m...   
4    Read the following article and answer the ques...   
..                                                 ...   
362  count of disks and see which sequence is “opti...   
363  be broken and that corners become worth less l...   
364  Read the following article and answer the ques...   
365  Read the following context and answer the ques...   
366  learn how to play simple endgames correctly.  ...   

                                                output  
0    Jonathan Cerf has contributed to the book in m...  
1             To help people become better at Othello.  
2               He is a master at the game of Othello.  
3                               not enough information  
4                 

In [None]:
def extract_content_before_question(text):
  if isinstance(text, str):
    parts = text.split('Question', 1)
    if len(parts) > 1:
      return parts[0].strip()
  return ''



In [None]:

def remove_context_from_input(row):
  if isinstance(row['input'], str) and isinstance(row['Context'], str):
    return row['input'].replace(row['Context'], '').strip()
  return row['input']



In [None]:

def remove_question_prefix(text):
  if isinstance(text, str) and text.startswith('Question:'):
    return text.replace('Question:', '').strip()
  return text


In [None]:
# changing the order of columns for easier readability

cols = ['Context'] + [col for col in df.columns if col != 'Context']
#df = df[cols]


In [None]:
# save the output as csv
df.to_csv('fine_tuning_set_v2.csv', index=False)

In [None]:
# convert to jsonl
jsonl_filename = 'othello_finetune_v2.jsonl'

# Iterate through the rows and write each row as a JSON object to the JSONL file
with open(jsonl_filename, 'w') as jsonl_file:
    for _, row in df.iterrows():
        json_data = row.to_json(orient='columns')
        jsonl_file.write(json_data + '\n')

# Tokenization of data

some code and in general inspiraiton from https://medium.com/@balci.pelin/llm-finetuning-410e8a2738ef

In [None]:
!pip install datasets



In [None]:
from transformers import AutoTokenizer
import datasets

# using pythin-70m model, sufficient for this exercise
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

def tokenize_function(examples):
    if "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]

    # Add 0 for short sentences
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    # find the max length after padding, select the min
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )

    # truncate if the sentence is longer than 2048
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

finetuning_dataset_loaded = datasets.load_dataset("json", data_files=jsonl_filename, split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

In [None]:
#print(tokenized_dataset["Context"][99])

print(tokenized_dataset["input"][99])

print(tokenized_dataset["input_ids"][99])


In that regard, the column “white found decent reply” shows the percentage of
games in which white found one of the moves included under the “branches better
than -2” column. After all, we are hoping that our opponent is going to make a bad
move, so it is good to know how frequently players have made mistakes facing the
same position. The column labelled “branches frequency >10%” indicates, following
Black’s move 7, the number of White replies at move 8 which were used in more than
10% of the games in the database. Again, this gets at the issue of how many lines we
will need to extend our opening book to move 9 or beyond. To be safe, it is best to
consider White’s moves if WZebra rates them highly or if they have been used fre-
quently in actual play.
Given all this information, we can start to get some idea of the advantages and
disadvantages of each choice at move 7. The move which WZebra rates the highest,
g4, is played in a majority of games. There is only one good choice for White

# Train - test split

In [None]:
from transformers import AutoModelForCausalLM
import torch

In [None]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 330
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 37
    })
})


In [None]:
model_name = "EleutherAI/pythia-70m"
base_model = AutoModelForCausalLM.from_pretrained(model_name)

device_count = torch.cuda.device_count()
if device_count > 0:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

base_model.to(device)
print(device)

cpu


In [None]:
test_text = test_dataset[17]['input']
max_input_tokens = 1000
max_output_tokens=2048
# Tokenize
input_ids = tokenizer.encode(
      test_text,
      return_tensors="pt",
      truncation=True,
      max_length=max_input_tokens
)

# Generate
device = base_model.device
generated_tokens_with_prompt = base_model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [None]:
generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
print(generated_text_with_prompt)

['Read the following context and answer the question.\nContext: gram 12-4), which accomplishes the goal of forcing Black to play to the south. Fur-\nther, White still has two moves to the east at h5 (or h4) and g4.\nDiagram 12-1\nDiagram 12-2\nDiagram 12-3\nWhite to move\nBlack to move\nWhite to move\na\n1\nb\n2\nc\n3\nd\n4\ne\n5\nf\n6\ng\n7\nh\n8\na\n1\nb\n2\nc\n3\nd\n4\ne\n5\nf\n6\ng\n7\nh\n8\n1\na\n1\nb\n2\nc\n3\nd\n4\ne\n5\nf\n6\ng\n7\nh\n8\n2\na\n1\nb\n2\nc\n3\nd\n4\ne\n5\nf\n6\ng\n7\nh\n8\n1\nIn fact, even if White let Black pass a couple of times,\nand (from Diagram 12-1) played three moves in a row at\ng6, h5, and g4, the result would still be favorable for\nWhite, as Black would have to initiate play in the south.\nAs this demonstrates, often a good way to take ad-\nvantage of your opponent’s bad shapes is to think about\nplaying several moves in a row with your opponent pass-\ning. In Diagram 12-1, White could get in three good moves\nby beginning with g6, but would only get 

In [None]:
# Strip the prompt
generated_text_answer = generated_text_with_prompt[0][len(test_text):]

print("Question input (test):", test_text)
print(f"\nCorrect answer from docs: {test_dataset[0]['output']}")
print("\nModel's answer: ")
print(generated_text_answer)

Question input (test): Read the following context and answer the question.
Context: gram 12-4), which accomplishes the goal of forcing Black to play to the south. Fur-
ther, White still has two moves to the east at h5 (or h4) and g4.
Diagram 12-1
Diagram 12-2
Diagram 12-3
White to move
Black to move
White to move
a
1
b
2
c
3
d
4
e
5
f
6
g
7
h
8
a
1
b
2
c
3
d
4
e
5
f
6
g
7
h
8
1
a
1
b
2
c
3
d
4
e
5
f
6
g
7
h
8
2
a
1
b
2
c
3
d
4
e
5
f
6
g
7
h
8
1
In fact, even if White let Black pass a couple of times,
and (from Diagram 12-1) played three moves in a row at
g6, h5, and g4, the result would still be favorable for
White, as Black would have to initiate play in the south.
As this demonstrates, often a good way to take ad-
vantage of your opponent’s bad shapes is to think about
playing several moves in a row with your opponent pass-
ing. In Diagram 12-1, White could get in three good moves
by beginning with g6, but would only get two moves start-
ing at g4.  Therefore, g6 is likely to work ou

# Training

In [None]:
from transformers import TrainingArguments, Trainer

# number of epoch
max_steps = 100

# Save model to this direction
trained_model_name = f"othello_docs_{max_steps}_steps_v2"
output_dir = trained_model_name
save_dir = f'{output_dir}/final'

training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  save_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



trainer = Trainer(
    model=base_model,
    # model_flops=model_flops,
    # total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


training_output = trainer.train()

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


Saved model to: othello_docs_100_steps_v2/final


In [None]:
# zip output for download
folder = f'/content/{trained_model_name}'
!zip -r /content/trained_model.zip "$folder"

  adding: content/othello_docs_100_steps_v2/ (stored 0%)
  adding: content/othello_docs_100_steps_v2/runs/ (stored 0%)
  adding: content/othello_docs_100_steps_v2/runs/Oct06_10-33-58_883819d1b179/ (stored 0%)
  adding: content/othello_docs_100_steps_v2/runs/Oct06_10-33-58_883819d1b179/events.out.tfevents.1728210839.883819d1b179.169.2 (deflated 67%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/ (stored 0%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/generation_config.json (deflated 23%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/trainer_state.json (deflated 82%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/model.safetensors (deflated 18%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/optimizer.pt (deflated 32%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/scheduler.pt (deflated 56%)
  adding: content/othello_docs_100_steps_v2/checkpoint-100/config.json (deflated 47%)
  adding: content/othello_docs_100_

# Test / Evaluate

In [None]:
device_count = torch.cuda.device_count()
if device_count > 0:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [None]:
def generate_output(test_question, model):

    # Tokenize
    input_ids = tokenizer.encode(
          test_question,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens)

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(test_question):]
    return generated_text_answer

In [None]:
test_question = test_dataset[27]['input']
test_answer = test_dataset[27]['output']
predicted_text = generate_output(test_question, finetuned_model)
base_predicted_text = generate_output(test_question, base_model)

print('Question:')
print(test_question)
print("--------------------------------------")
print('Actual Completion:')
print(test_answer)
print("--------------------------------------")
print('Fine-tuned prediction')
print(predicted_text)
print("--------------------------------------")
print('Base prediction:')
print(base_predicted_text)
print("--------------------------------------")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question:
Read the following article and answer the question.
Article: want to play e8 in order for White to exploit a good potential move at g5. In short, the
shapes in Diagram 12-7 cry out for White to move to e8.
Now consider Diagram 12-9. Here too, e8 jumps out as the place White wants to
play, but for the moment he can not, because the e-column is entirely black. In the
actual game, Tamenori played f2! (Diagram 12-10), threatening e8 on his next move.
The game continued Black e2, White c1, Black d1, resulting in the position shown in
Diagram 12-7.
In Diagram 12-9, Tamenori came up with a clever
way to set up a move to e8, but even a not-so-clever
move such as c2 would have been reasonable. In the
midgame, it is important to be able to look at a posi-
tion and quickly identify the “hot spots” that each side
wants to play to. If you can quickly find the right move
in positions such as Diagram 12-1 and Diagram 12-7,
then you should eventually be able to find good moves
in situations 

## Evaluation

In [None]:
tuned_predicted_text_list = []
actual_test_list = []
base_predicted_text_list = []
for i in range(len(test_dataset)):
    # get prompt
    test_q = test_dataset[i]['input']
    # get completion
    completion_q = test_dataset[i]['output']
    # predictions
    predicted_text = generate_output(test_question, finetuned_model)
    base_predicted_text = generate_output(test_question, base_model)
    # collect
    actual_test_list.append(completion_q)
    tuned_predicted_text_list.append(predicted_text)
    base_predicted_text_list.append(base_predicted_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attentio

In [None]:
!pip install evaluate



In [None]:

import evaluate
bleu = evaluate.load("bleu")

results = bleu.compute(predictions=base_predicted_text_list, references=actual_test_list)
print("Base Model Predictions Results")
print(results)

results = bleu.compute(predictions=tuned_predicted_text_list, references=actual_test_list)
print("Fine-tuned Model Predictions Results")
print(results)

Base Model Predictions Results
{'bleu': 0.0, 'precisions': [0.0028531833374093097, 0.0008171269815329302, 0.00024572037021869115, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 106.66956521739131, 'translation_length': 12267, 'reference_length': 115}
Fine-tuned Model Predictions Results
{'bleu': 0.0, 'precisions': [0.0028531833374093097, 0.0008171269815329302, 0.00024572037021869115, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 106.66956521739131, 'translation_length': 12267, 'reference_length': 115}
