In [None]:
# In the raw output files data/gpt-4/pubmed_*.nll.txt, each line contains N-1 negative log-probabilities
# for a sentence of N tokens.
# What is special about the PubMed data is that we only want to examine the *answer* part,
# i.e., the tokens after "Answer:" in the raw text.

In [2]:
# import from parent directory
import sys
sys.path.insert(0, '../')
from model import Model
from modelscope import AutoTokenizer
import torch

2024-05-07 15:14:17,864 - modelscope - INFO - PyTorch version 2.2.0 Found.
2024-05-07 15:14:17,864 - modelscope - INFO - Loading ast index from /Users/xy/.cache/modelscope/ast_indexer
2024-05-07 15:14:17,929 - modelscope - INFO - Loading done! Current index file version is 1.14.0, with md5 b6a37aa50898b7ca29cb870cc35ad7a7 and a total number of 976 components indexed


In [3]:
import numpy as np

# Read nll data
def _read_data(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            if line == '':
                continue
            num = list(map(float, line.split()))
            data.append(num)
            count += 1
            if count >= N:
                break
    return data

In [3]:
# Load the per-line NLL lists from the "original" file and preview the
# first five values of the first line.
nll_orig = _read_data('../data/gpt-4/pubmed_gpt-4.original.mistral.nll.txt')
print(nll_orig[0][:5])

# Same for the "sampled" file.
nll_samp = _read_data('../data/gpt-4/pubmed_gpt-4.sampled.mistral.nll.txt')
print(nll_samp[0][:5])

[5.175, 0.0119, 3.3275, 10.8108, 7.6979]
[5.2005, 0.0121, 3.291, 10.8107, 7.6796]


In [5]:
# Read raw text data
def _read_raw_text(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            if line == '':
                continue
            data.append(line)
            count += 1
            if count >= N:
                break
    return data

def _write_raw_text(data, data_file):
    with open(data_file, 'w') as f:
        for line in data:
            f.write(line + '\n')

In [7]:
# Load the raw text lines of the "original" and "sampled" files and preview
# the first 20 characters of the first line of each.
text_orig = _read_raw_text('../data/gpt-4/pubmed_gpt-4.original.txt')
print(text_orig[0][:20])
text_samp = _read_raw_text('../data/gpt-4/pubmed_gpt-4.sampled.txt')
print(text_samp[0][:20])

Question: Is an adva
Question: Is an adva


In [7]:
# Examine the tokenized result for a line of text.
# The tokenizer location is machine-specific, so it can be overridden with the
# MISTRAL_MODEL_DIR environment variable; the fallback is the original local path.
import os

model_dir = os.environ.get("MISTRAL_MODEL_DIR", "/Users/xy/models/mistral-7b")
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [19]:
# Tokenize the first original line and show both the raw text and its token ids.
first_line = text_orig[0]
print(first_line)
input_ids = tokenizer(first_line, return_tensors='pt')['input_ids']
print(input_ids)

# Report every position whose single-token decoding is exactly 'Answer'.
for position, token_id in enumerate(input_ids[0].tolist()):
    piece = tokenizer.decode(token_id)
    if piece == 'Answer':
        print(position, piece)

# First decoded token is "<s>" at each line

Question: Is an advance care planning model feasible in community palliative care? Answer: An advance care planning model is feasible for community palliative care services. Quality audit processes are an essential component of the Model with documentation of advance care planning discussion established as an important outcome measure.
tensor([[    1, 22478, 28747,  1691,   396,  8670,  1656,  7394,  2229, 25953,
          1070,   297,  3618,   284,   455, 26938,  1656, 28804, 26307, 28747,
          1094,  8670,  1656,  7394,  2229,   349, 25953,  1070,   354,  3618,
           284,   455, 26938,  1656,  3345, 28723, 20612, 24790,  9537,   460,
           396,  7974,  5641,   302,   272,  8871,   395, 12905,   302,  8670,
          1656,  7394,  8387,  6740,   390,   396,  2278, 14120,  5266, 28723]])
18 Answer


In [15]:
# Check per-line length in nll_orig and text_orig
# Spot-check: print NLL count next to token count for the first three lines.
for idx in range(3):
    input_ids = tokenizer(text_orig[idx], return_tensors='pt')['input_ids']
    print(len(nll_orig[idx]), input_ids.shape[1])

# Full pass: count the lines whose NLL count is not (token count - 1).
incorrect_count = 0
for idx, line_nlls in enumerate(nll_orig):
    input_ids = tokenizer(text_orig[idx], return_tensors='pt')['input_ids']
    expected_len = input_ids.shape[1] - 1
    if len(line_nlls) != expected_len:
        incorrect_count += 1
print(incorrect_count)
# We have verified that for all lines
# len(nll_orig[i]) == input_ids.shape[1] - 1

59 60
70 71
87 88
0


In [27]:
def extract_answer_nll(nlls, text, tok=None):
    """Keep, for every line, only the NLL values that follow the "Answer:" marker.

    Each line of ``text`` is tokenized and scanned for the first token that
    decodes to ``'Answer'`` and is immediately followed by a token that decodes
    to ``':'``. The NLL list of that line is then sliced from the index of the
    ``':'`` token onwards.

    NOTE(review): the slice assumes ``nlls[i][k]`` scores token ``k + 1`` (one
    NLL fewer than tokens, the leading token being unscored), so that slicing at
    the ``':'`` index drops the scores of both ``'Answer'`` and ``':'`` —
    confirm against the code that produced the NLL files.

    Args:
        nlls: One sequence of NLL values per line.
        text: The raw text lines; ``text[i]`` belongs to ``nlls[i]``.
        tok: Tokenizer to use. It must be callable as
            ``tok(line, return_tensors='pt')['input_ids']`` and provide
            ``decode(token_id)``. Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list with exactly one sliced NLL sequence per entry of ``nlls``.

    Raises:
        ValueError: If a line has no ``'Answer'`` token followed by ``':'``.
            Silently skipping such a line would shift every later output line
            relative to its input line.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer

    new_nlls = []
    for i, line_nlls in enumerate(nlls):
        token_ids = tok(text[i], return_tensors='pt')['input_ids'][0].tolist()

        colon_pos = None
        previous_piece = None
        for j, token_id in enumerate(token_ids):
            piece = tok.decode(token_id)
            # Require the ':' right after 'Answer' so that a stray "Answer"
            # inside the question does not end the search (or abort the run).
            if previous_piece == 'Answer' and piece == ':':
                colon_pos = j
                break
            previous_piece = piece

        if colon_pos is None:
            raise ValueError(
                f"line {i}: no 'Answer' token followed by ':' was found"
            )

        # colon_pos == (index of 'Answer') + 1, the same slice start as before.
        new_nlls.append(line_nlls[colon_pos:])
    return new_nlls

# Extract the answer-part NLLs for the "original" and "sampled" data and
# print the number of extracted lines for each.
new_nlls_orig = extract_answer_nll(nll_orig, text_orig)
print(len(new_nlls_orig))
new_nlls_samp = extract_answer_nll(nll_samp, text_samp)
print(len(new_nlls_samp)) 


150
150


In [28]:
# Save
def write_nlls(nlls, output_file):
    """Write one line of space-separated NLL values per sequence.

    Args:
        nlls: Iterable of sequences. Each sequence is either a ``torch.Tensor``
            or an iterable of numbers; it is expected to be flat, because every
            element is formatted as a single float.
        output_file: Path of the text file to create (overwritten if present).

    Every value is written with four decimals; an empty sequence produces an
    empty line.
    """
    with open(output_file, 'w') as f:
        for res in nlls:
            if isinstance(res, torch.Tensor):
                # detach()/cpu() make the conversion work for tensors that
                # require grad or live on an accelerator, where a plain
                # .numpy() call raises.
                res = res.detach().cpu().tolist()
            res_str = ' '.join(f'{num:.4f}' for num in res)
            f.write(f'{res_str}\n')

# Save the answer-only NLLs next to the input files, under the "AnsInCtx" prefix.
write_nlls(new_nlls_orig, '../data/gpt-4/pubmed_AnsInCtx_gpt-4.original.mistral.nll.txt')
write_nlls(new_nlls_samp, '../data/gpt-4/pubmed_AnsInCtx_gpt-4.sampled.mistral.nll.txt')

In [9]:
# Extract pure text in pubmed after "Answer:" and store in individual files
def extract_answer_text(text: list[str]) -> list[str]:
    """Return the answer portion of every line.

    Each line must contain the literal marker ``Answer:`` exactly once (this is
    asserted); the text after the marker is returned with surrounding
    whitespace stripped.
    """
    def _answer_of(line: str) -> str:
        pieces = line.split('Answer:')
        # Exactly one marker <=> exactly two pieces (question part, answer part).
        assert len(pieces) == 2
        return pieces[1].strip()

    return [_answer_of(line) for line in text]

# Keep only the text after "Answer:" for both the "original" and "sampled" lines.
ans_text_orig = extract_answer_text(text_orig)
ans_text_samp = extract_answer_text(text_samp)

# Store the answer-only text in separate files, under the "Ans" prefix.
_write_raw_text(ans_text_orig, '../data/gpt-4/pubmed_Ans_gpt-4.original.txt')
_write_raw_text(ans_text_samp, '../data/gpt-4/pubmed_Ans_gpt-4.sampled.txt')