In [1]:
pip install transformers datasets evaluate accelerate peft bitsandbytes

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [3

In [2]:
from google.colab import drive
import datetime
from datasets import (load_dataset,
                      Dataset,
                      DatasetDict)
import pandas as pd
import torch
from transformers import (AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoTokenizer,
                          set_seed)
from tqdm.auto import tqdm
from peft import PeftModel

import accelerate


In [3]:
# Make Google Drive available under /content/drive; every path below lives there.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Seed the RNGs first so everything below runs from a known state.
set_seed(42)

# --- Which model / data / adapter to evaluate --------------------------------
model = "microsoft/phi-2"
# Filesystem-friendly form of the model id: '.' is dropped, '-' and '/' become '_'.
model_name = model.translate(str.maketrans('-/', '__', '.'))
filestamp = "1711835043"  # Select the timestamp of the latest version
timestamp = "1715888640"  # Select the timestamp of the latest model
checkpoint = "checkpoint-18000"
source = "mix"
adapter_name = f"{timestamp}_{filestamp}_{source}_trained"

# --- Inputs -------------------------------------------------------------------
test_path = f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_{source}_test.json'
input_dir = f'/content/drive/MyDrive/Thesis_models/{adapter_name}/{checkpoint}'

# --- Outputs ------------------------------------------------------------------
base_result_path = f'/content/drive/MyDrive/Thesis data/PHI_results/{filestamp}_{source}_q{model_name}.csv'
adapter_result_path = f'/content/drive/MyDrive/Thesis data/PHI_results/{filestamp}_{source}_q{model_name}_adapter.csv'

In [5]:
# Data loading: read the test JSON into pandas, convert it to a Hugging Face
# Dataset and expose it under the 'test' key. Built in one expression so no
# temporary DataFrame/Dataset names are left behind in the kernel.
data = DatasetDict({'test': Dataset.from_pandas(pd.read_json(test_path))})

In [6]:
# Tokenizer for the base checkpoint named in the config cell.
tokenizer = AutoTokenizer.from_pretrained(
    model,
    trust_remote_code=True,
)
# Reuse the EOS token as the pad token (originally added to avoid an error while training).
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Quantization settings handed to `from_pretrained` when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weights
    bnb_4bit_quant_type="nf4",              # NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype: bfloat16
    bnb_4bit_use_double_quant=False,        # double quantization disabled
)

In [8]:
# Load the quantized base model.
# NOTE(review): `model` enters this cell as the Hub id string from the config
# cell and leaves it as the loaded model object, so this cell (and the tokenizer
# cell) cannot be re-run on their own without re-running the config cell first.
model = AutoModelForCausalLM.from_pretrained(model, quantization_config=bnb_config, device_map='auto')

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
def response_generator(model, tokenizer, data, split='test', marker='\n\nOutput:', max_new_tokens=1):
  """Generate one short completion per prompt and return the extracted answers.

  Args:
    model: object exposing `generate(...)` and a `device` attribute
      (here: the quantized base model or the PEFT-wrapped model).
    tokenizer: tokenizer used to encode each prompt and decode the output.
    data: mapping whose `split` entry is iterable, has `num_rows`, and yields
      rows with a 'prompt' field.
    split: key of `data` to iterate over (default 'test').
    marker: text that separates the prompt from the answer in the decoded
      sequence (default '\\n\\nOutput:').
    max_new_tokens: number of tokens to generate per prompt (default 1).

  Returns:
    list[str]: one answer per row, with newline characters removed.
  """
  rows = data[split]

  # Take the padding id from the tokenizer instead of hard-coding 50256;
  # fall back to EOS when no pad token has been configured.
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

  responses = []
  for row in tqdm(rows, total=rows.num_rows):
    encoded = tokenizer(row['prompt'], truncation=True, return_tensors='pt')
    # Follow the model's own device rather than assuming "cuda:0".
    input_ids = encoded.input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=max_new_tokens, pad_token_id=pad_token_id)

    pieces = tokenizer.batch_decode(output)[0].split(marker)
    if len(pieces) > 1:
      # Same extraction as before: the text right after the first marker.
      result = pieces[1]
    else:
      # Marker missing from the decoded text (e.g. the prompt was truncated):
      # decode only the newly generated tokens instead of raising IndexError
      # and losing the whole run.
      result = tokenizer.batch_decode(output[:, input_ids.shape[1]:])[0]

    responses.append(result.replace('\n', ''))
  return responses

In [10]:
# Run the quantized base model over the test prompts; answers go in a 'base' column.
results = pd.DataFrame({'base': response_generator(model, tokenizer, data)})
results

  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,base
0,UP
1,Up
2,Down
3,
4,The
...,...
195,
196,Up
197,Down
198,Up


In [11]:
# Persist the base-model answers to the CSV path defined in the config cell.
results.to_csv(path_or_buf=base_result_path)

In [12]:
# Load the trained model: wrap the base model with the adapter stored at
# `input_dir`, loaded as non-trainable.
peft_model = PeftModel.from_pretrained(
    model,
    input_dir,
    is_trainable=False,
)

In [13]:
# Run the adapter-equipped model over the same test prompts; answers go in a 'trained' column.
results = pd.DataFrame({'trained': response_generator(peft_model, tokenizer, data)})
results

  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,trained
0,UP
1,DOWN
2,UP
3,DOWN
4,DOWN
...,...
195,UP
196,DOWN
197,UP
198,DOWN


In [14]:
# Persist the adapter-model answers to the CSV path defined in the config cell.
results.to_csv(path_or_buf=adapter_result_path)