In [None]:
# In the raw output files data/gpt-4/pubmed_*.nll.txt, each line contains N-1 negative log-probabilities
# for a sentence of N tokens.
# What is special about the PubMed data is that we want to examine the *answer* part, i.e., the tokens after "Answer:" in the raw text.

In [1]:
# import from parent directory
import sys
sys.path.insert(0, '../')
from model import Model
from modelscope import AutoTokenizer
import torch
import numpy as np

2024-06-01 16:40:31,510 - modelscope - INFO - PyTorch version 2.2.0 Found.
2024-06-01 16:40:31,511 - modelscope - INFO - Loading ast index from /Users/xy/.cache/modelscope/ast_indexer
2024-06-01 16:40:31,580 - modelscope - INFO - Loading done! Current index file version is 1.14.0, with md5 b6a37aa50898b7ca29cb870cc35ad7a7 and a total number of 976 components indexed


In [2]:
# Read nll data
def _read_data(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            if line == '':
                continue
            num = list(map(float, line.split()))
            data.append(num)
            count += 1
            if count >= N:
                break
    return data

In [4]:
# Load the Mistral-estimated NLL sequences (one list of floats per line) for the
# original and the sampled PubMed texts.
# NOTE(review): nll_orig / nll_samp are assigned again further down, in the cell
# that selects `est_name`, so the values loaded here are superseded there.
nll_orig = _read_data('../data/gpt-4/pubmed_gpt-4.original.mistral.nll.txt')
# print(nll_orig[0][:5])

nll_samp = _read_data('../data/gpt-4/pubmed_gpt-4.sampled.mistral.nll.txt')
# print(nll_samp[0][:5])

In [5]:
# Read raw text data
def _read_raw_text(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            if line == '':
                continue
            data.append(line)
            count += 1
            if count >= N:
                break
    return data

def _write_raw_text(data, data_file):
    with open(data_file, 'w') as f:
        for line in data:
            f.write(line + '\n')

In [6]:
# Load the raw PubMed texts (one stripped, non-blank line per entry) for the
# original and the sampled sets, and print the first 20 characters of the first
# entry of each as a quick sanity check.
text_orig = _read_raw_text('../data/gpt-4/pubmed_gpt-4.original.txt')
print(text_orig[0][:20])
text_samp = _read_raw_text('../data/gpt-4/pubmed_gpt-4.sampled.txt')
print(text_samp[0][:20])

Question: Is an adva
Question: Is an adva


In [7]:
est_name = 'gpt2xl' # or mistral etc.

# Local tokenizer directory for each supported estimator name.
# NOTE(review): these are absolute, machine-specific paths — adjust them for
# your environment before running.
MODEL_DIRS = {
    'mistral': "/Users/xy/models/mistral-7b",
    'gpt2xl': "/Users/xy/models/gpt2-xl",
}

# Load tokenizer accordingly.
# Fail fast on an unsupported name: otherwise `tokenizer` would be left
# undefined (or stale from a previous run of this cell) while the NLL files
# below are loaded for a different estimator.
if est_name not in MODEL_DIRS:
    raise ValueError(
        f"Unsupported est_name {est_name!r}; expected one of {sorted(MODEL_DIRS)}"
    )
model_dir = MODEL_DIRS[est_name]
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Read NLL data produced by the selected estimator (replaces any previously
# loaded nll_orig / nll_samp).
nll_orig = _read_data(f'../data/gpt-4/pubmed_gpt-4.original.{est_name}.nll.txt')
nll_samp = _read_data(f'../data/gpt-4/pubmed_gpt-4.sampled.{est_name}.nll.txt')

def extract_answer_nll(nlls, text, tok=None):
    """Keep only the NLL values that belong to the answer part of each sample.

    Each text is tokenized and scanned for the first "Answer" token that is
    immediately followed by a ":" token. With that "Answer" token at index
    ``j``, the result for the sample is ``nlls[i][j + 1:]``.

    Token strings are compared after stripping surrounding whitespace, because
    byte-level BPE tokenizers (e.g. GPT-2) decode a mid-sentence word together
    with its leading space (" Answer"), which an exact comparison would miss.

    Args:
        nlls: Per-sample NLL sequences; ``nlls[i]`` belongs to ``text[i]``.
        text: Raw text lines, each expected to contain an "Answer:" marker.
        tok: Tokenizer to use. It must be callable as
            ``tok(s, return_tensors='pt')`` and return a mapping whose
            ``'input_ids'`` entry is indexable as ``[0, j]`` with a ``shape``
            of ``(1, n_tokens)``, and it must provide ``decode(token_id)``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list with exactly one sliced NLL sequence per entry of ``nlls``, in
        the same order.

    Raises:
        ValueError: If a sample contains no "Answer" token followed by ":".
            Raising (instead of skipping the sample) keeps the output aligned
            line-by-line with the input.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer selected via est_name

    new_nlls = []
    for i in range(len(nlls)):
        input_ids = tok(text[i], return_tensors='pt')['input_ids']
        n_tokens = input_ids.shape[1]
        # Stop one short of the end so that looking at token j + 1 is always valid.
        for j in range(n_tokens - 1):
            if tok.decode(input_ids[0, j].item()).strip() != 'Answer':
                continue
            if tok.decode(input_ids[0, j + 1].item()).strip() != ':':
                # An "Answer" that is not part of the "Answer:" marker; keep looking.
                continue
            # Skip "Answer" and ":".
            # NOTE(review): assumes nlls[i][k] scores token k + 1 (N-1 values for
            # N tokens), so index j + 1 is the first token after ":" — confirm
            # against the script that produced the NLL files.
            new_nlls.append(nlls[i][j + 1:])
            break
        else:
            raise ValueError(f'No "Answer:" token pair found in sample {i}')
    return new_nlls

# Extract Answer NLLs: for the original and the sampled set, keep only the NLL
# values from the "Answer:" marker onward (uses the notebook-level tokenizer).
new_nlls_orig = extract_answer_nll(nll_orig, text_orig)
new_nlls_samp = extract_answer_nll(nll_samp, text_samp)

In [8]:
# Save
def write_nlls(nlls, output_file):
    """Write NLL sequences to a text file, one space-separated sequence per line.

    Every value is formatted with four decimal places. Entries that are
    ``torch.Tensor`` objects are converted to plain Python lists first.

    Args:
        nlls: Iterable of sequences of numbers (lists or torch tensors).
        output_file: Destination path; existing content is replaced.
    """
    with open(output_file, 'w') as out:
        for seq in nlls:
            values = seq.numpy().tolist() if isinstance(seq, torch.Tensor) else seq
            formatted = [f'{value:.4f}' for value in values]
            joined = ' '.join(formatted)
            out.write(f'{joined}\n')

# Save the answer-only NLL sequences next to the inputs, tagged "AnsInCtx" and
# with the estimator name in the file name.
write_nlls(new_nlls_orig, f'../data/gpt-4/pubmed_AnsInCtx_gpt-4.original.{est_name}.nll.txt')
write_nlls(new_nlls_samp, f'../data/gpt-4/pubmed_AnsInCtx_gpt-4.sampled.{est_name}.nll.txt')

In [9]:
# Extract pure text in pubmed after "Answer:" and store in individual files
def extract_answer_text(text: list[str]) -> list[str]:
    """Return the text that follows "Answer:" for every sample.

    Each sample must contain the literal marker "Answer:" exactly once; this is
    enforced with an assertion. The returned answer is stripped of surrounding
    whitespace.

    Args:
        text: Raw sample strings.

    Returns:
        The stripped answer portion of each sample, in input order.
    """
    marker = 'Answer:'
    answers = []
    for sample in text:
        # Exactly one marker is equivalent to the split yielding two parts.
        assert sample.count(marker) == 1
        _, _, tail = sample.partition(marker)
        answers.append(tail.strip())
    return answers

# Keep only the part after "Answer:" for every original / sampled text.
ans_text_orig = extract_answer_text(text_orig)
ans_text_samp = extract_answer_text(text_samp)

# Store the answer-only texts in separate files, one answer per line.
_write_raw_text(ans_text_orig, '../data/gpt-4/pubmed_Ans_gpt-4.original.txt')
_write_raw_text(ans_text_samp, '../data/gpt-4/pubmed_Ans_gpt-4.sampled.txt')