In [1]:
import os
import galai as gal
import torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# do it in terminal to set these properly
# os.environ['XDG_CACHE_HOME'] = '/work/frink/yun.hy/.cache'
# os.environ['TRANSFORMERS_CACHE'] = '/work/frink/yun.hy/.cache'
# os.environ['HF_HOME'] = '/work/frink/yun.hy/.cache'
# os.environ['HUGGINGFACE_HUB_CACHE'] = '/work/frink/yun.hy/.cache'
# %env XDG_CACHE_HOME=/work/frink/yun.hy/.cache
# %env TRANSFORMERS_CACHE=/work/frink/yun.hy/.cache
# %env HF_HOME=/work/frink/yun.hy/.cache

device = torch.device("cuda")
max_length = 1024

galactica_model = gal.load_model("standard", num_gpus=1)

biomedlm_tokenizer = GPT2Tokenizer.from_pretrained("stanford-crfm/BioMedLM")
biomedlm_model = GPT2LMHeadModel.from_pretrained("stanford-crfm/BioMedLM").to(device)

In [2]:
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

df = pd.read_csv("./cochrane_reviews_latest_by_topic_20230223.csv", index_col=False)

In [3]:
df.head()

Unnamed: 0,cochrane_id,title,year,month,day,version,url,cochrane_review_group_code,cochrane_review_topic,abstract,plain_language_summary
0,CD002204.pub5,Antifungal therapies for allergic bronchopulmo...,2022,September,2,5,https://doi.org/10.1002/14651858.CD002204.pub5,Cystic Fibrosis and Genetic Disorders,Allergy & intolerance,Background\nAllergic bronchopulmonary aspergil...,Treatments to fight fungal infections that cau...
1,CD012969.pub3,Treatment of dental and orthodontic complicati...,2023,February,2,3,https://doi.org/10.1002/14651858.CD012969.pub3,Cystic Fibrosis and Genetic Disorders,Blood disorders,Background\nThalassaemia is a quantitative abn...,Treatment of dental and orthodontic problems i...
2,CD012974.pub2,Neoadjuvant treatment for stage III and IV cut...,2023,January,17,2,https://doi.org/10.1002/14651858.CD012974.pub2,Skin,"Cancer, Skin disorders",Background\nCutaneous melanoma is amongst the ...,What are the benefits and risks of neoadjuvant...
3,CD012478.pub2,Catheter insertion techniques for improving ca...,2023,February,22,2,https://doi.org/10.1002/14651858.CD012478.pub2,Kidney and Transplant,"Child health, Kidney disease",Background\nPeritoneal dialysis (PD) relies on...,Catheter insertion techniques for improving ca...
4,CD011511.pub3,Vitamin D for the management of asthma,2023,February,6,3,https://doi.org/10.1002/14651858.CD011511.pub3,Airways,"Complementary & alternative medicine, Lungs & ...",Background\nSince the previous Cochrane Review...,Does vitamin D reduce risk of severe asthma at...


In [4]:
def get_galactica_output(row):
    title = row['title']
    prompt = 'Title: ' + title + '\n\n'
    return galactica_model.generate(prompt, new_doc=True, top_p=0.7, max_length=max_length)

In [5]:
def get_biomedlm_output(row):
    title = row['title']
    prompt = 'Title: ' + title
    input_ids = biomedlm_tokenizer.encode(
        prompt, return_tensors="pt"
    ).to(device)

    output = biomedlm_model.generate(input_ids, do_sample=True, max_length=max_length, top_k=50)

    return biomedlm_tokenizer.decode(output[0], skip_special_tokens=True)

In [7]:
df['galactica_output'] = df.progress_apply(get_galactica_output, axis=1)

  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
df['pubmedgpt_output'] = df.progress_apply(get_biomedlm_output, axis=1)

  0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


In [9]:
df

Unnamed: 0,cochrane_id,title,year,month,day,version,url,cochrane_review_group_code,cochrane_review_topic,abstract,plain_language_summary,galactica_output,pubmedgpt_output
0,CD002204.pub5,Antifungal therapies for allergic bronchopulmo...,2022,September,2,5,https://doi.org/10.1002/14651858.CD002204.pub5,Cystic Fibrosis and Genetic Disorders,Allergy & intolerance,Background\nAllergic bronchopulmonary aspergil...,Treatments to fight fungal infections that cau...,Title: Antifungal therapies for allergic bronc...,Title: Antifungal therapies for allergic bronc...
1,CD012969.pub3,Treatment of dental and orthodontic complicati...,2023,February,2,3,https://doi.org/10.1002/14651858.CD012969.pub3,Cystic Fibrosis and Genetic Disorders,Blood disorders,Background\nThalassaemia is a quantitative abn...,Treatment of dental and orthodontic problems i...,Title: Treatment of dental and orthodontic com...,Title: Treatment of dental and orthodontic com...
2,CD012974.pub2,Neoadjuvant treatment for stage III and IV cut...,2023,January,17,2,https://doi.org/10.1002/14651858.CD012974.pub2,Skin,"Cancer, Skin disorders",Background\nCutaneous melanoma is amongst the ...,What are the benefits and risks of neoadjuvant...,Title: Neoadjuvant treatment for stage III and...,Title: Neoadjuvant treatment for stage III and...


In [10]:
df.to_csv('./llm_outputs.csv', index=False)