### Clean Title Function with LLM.

In [1]:
import transformers
import torch
from transformers import BitsAndBytesConfig, pipeline, AutoTokenizer
ACCESS_TOKEN = ""
device = "cuda:3"

def load_70b_model(device):
    model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
        # Load model directly

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model_nf4 = transformers.AutoModelForCausalLM.from_pretrained(model_id, 
                                                    quantization_config=nf4_config,
                                                    device_map={"": device},
                                                    token=ACCESS_TOKEN)
    
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=ACCESS_TOKEN)
    
    pipeline = transformers.pipeline(
        "text-generation",
        model= model_nf4, #model_id,
        # model_kwargs={"torch_dtype": torch.bfloat16},
        tokenizer=tokenizer,
        # device=device,
    )

    messages = [
        {"role": "system", "content": "You are an expert and experienced from the healthcare and biomedical domain with extensive medical knowledge and practical experience. Your name is OpenBioLLM, and your job is to annotate dictionary features and contexts learned by interpretability techniques. Please answer the below message."},
        {"role": "user", "content": "Hello?"},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )
    print(outputs[0]["generated_text"][len(prompt):])
    return pipeline

llama3 = load_70b_model(device)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Hello. I'm OpenBioLLM, an expert in the healthcare and biomedical domain. I'm here to assist with annotating dictionary features and contexts learned by interpretability techniques. How can I help you today?


In [2]:
def generate_text_with_icl(prompt, pipeline, task_examples, max_new_tokens=256, temperature=0.00001, top_p=0.99) -> str:
    """
    Generate text using the specified prompt and parameters with in-context learning.
    Args:
    prompt (str): The input prompt for text generation.
    pipeline: The text generation pipeline.
    task_examples (list): List of dictionaries containing input-output pairs for in-context learning.
    max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to 256.
    temperature (float, optional): The temperature value for sampling. Defaults to 0.00001.
    top_p (float, optional): The top-p value for sampling. Defaults to 0.99.
    Returns:
    str: The generated text.
    """
    # Construct the in-context learning prompt
    icl_prompt = "You are an expert and experienced from the healthcare and biomedical domain with extensive medical knowledge and practical experience. Your job is to help annotate specific tasks by looking for common patterns within text. Here are some examples of how to perform the task:\n\n"
    
    for example in task_examples:
        icl_prompt += f"Input: {example['input']}\nOutput: {example['output']}\n\n"
    
    icl_prompt += f"Now, please perform the same task for the following input:\nInput: {prompt}\nOutput:"

    messages = [
        {"role": "system", "content": icl_prompt},
        {"role": "user", "content": prompt},
    ]
    
    full_prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = pipeline(
        full_prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    
    return outputs[0]["generated_text"][len(full_prompt):]

In [4]:
title_cleaning_prompt = """
You are an assistant specialized in cleaning and standardizing academic paper titles. Your task is to take a given title and improve its formatting, spacing, and consistency. Follow these rules:

1. Correct spacing:
   - Ensure single spaces between words.
   - Remove extra spaces before or after hyphens.
   - Add spaces after colons and semicolons.

2. Hyphenation:
   - Use hyphens consistently in compound terms (e.g., "Multi-Scale" not "Multi Scale" or "MultiScale").
   - Correct common hyphenation errors in technical terms (e.g., "Pre-processing" not "Preprocessing").

3. Capitalization:
   - Use title case: Capitalize the first letter of each major word.
   - Do not capitalize articles (a, an, the), coordinating conjunctions (and, but, for, or, nor), or prepositions unless they start the title.
   - Always capitalize the first and last words of the title and subtitle.

4. Acronyms and initialisms:
   - Remove spaces between letters in acronyms (e.g., "CNN" not "C N N").
   - Ensure correct formatting of technical acronyms (e.g., "U-Net" not "UNet" or "U Net").

5. Special characters:
   - Correct the use of special characters like hyphens (-), en dashes (–), and em dashes (—).
   - Ensure proper use of quotation marks and apostrophes.

6. Consistency:
   - Maintain consistent formatting throughout the title.
   - Ensure that similar terms or concepts are formatted the same way.

7. Grammar and spelling:
   - Correct any obvious spelling errors.
   - Ensure proper grammatical structure.

8. No Authors: If the title contains any author names, emails, or affiliations, remove them.

When given a title, apply these rules to clean and standardize it. Provide the corrected title without additional commentary unless there are ambiguities or decisions that require explanation.

Title to clean: {title}

Cleaned title:
"""
import pandas as pd 
def clean_titles(pipeline, df : pd.DataFrame, prompt : str):
    cleaned_titles = []
    for title in df["title"]:
        cleaned_title = generate_text_with_icl(title, pipeline, [{"input": title, "output": ""}], max_new_tokens=256, temperature=0.00001, top_p=0.99)
        cleaned_titles.append(cleaned_title)
    return cleaned_titles



chil_info = pd.read_csv("processed_data/chil_extracted_info.csv") # get all the titles, and clean them.
chil_info.head(2)


Unnamed: 0,year,title,authors,abstract,code_count,gitlab_count,zenodo_count,dataset_count,mimic_count,eicu_count,...,tcga_count,gdc_count,seer_count,tuh_eeg_corpus_count,tuh_abnormal_eeg_corpus_count,tuh_eeg_artifact_corpus_count,tuh_eeg_epilepsy_corpus_count,tuh_eeg_events_corpus_count,tuh_eeg_seizure_corpus_count,tuh_eeg_slowing_corpus_count
0,2020,B M M- Net: Automatic Segmentationof Edemain O...,"Ruru Zhang, Beijing University of Posts and, T...",Retinal effusions and cysts caused by the leak...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,Hurtful Words: Quantifying Biasesin Clinical C...,"Haoran Zhang∗, haoran@cs.toronto.edu, Universi...","In this work, we examine the extent to which e...",1,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
chil_info["authors"][0]

'Ruru Zhang, Beijing University of Posts and, Telecommunications, Haidian District, Beijing, China, zrr@bupt.edu.cnJiawen He, Beijing University of Posts and, Telecommunications, Haidian District, Beijing, China, euphy@bupt.edu.cnShenda Shi, Beijing University of Posts and, Telecommunications, Haidian District, Beijing, China, cy_z_feng@bupt.edu.cn, Haihong E, Beijing University of Posts and, Telecommunications'

In [12]:


def extract_emails(text, nlp):
    prompt = f"""
    Extract all email addresses from the following text. 
    Only output the email addresses, one per line. 
    If there are no email addresses, output 'No email addresses found. We are not using code here.'

    Text: {text}

    Extracted email addresses:
    """

    # Generate the response
    response = generate_text_with_icl(prompt, nlp, [{"input": text, "output": ""}])

    return response
def string_to_list(input_string):
    """
    Convert a string into a list using '\n' as the delimiter.
    Remove any empty strings from the resulting list.
    
    :param input_string: The input string to be converted
    :return: A list of non-empty strings
    """
    # Split the string by '\n' and remove any empty strings
    return [item.strip() for item in input_string.split('\n') if item.strip()]

# Example usage
sample_text = """
Hello, my name is John Doe. You can reach me at john.doe@example.com or
through my work email johnd@company.com. My colleague's email is
jane.smith@example.org.
"""


# Extract emails
extracted_emails = extract_emails(chil_info["authors"][0], llama3)
# Convert the extracted emails to a list
extracted_emails = string_to_list(extracted_emails)
print(extracted_emails)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


['zrr@bupt.edu.cn', 'euphy@bupt.edu.cn', 'cy_z_feng@bupt.edu.cn']


In [14]:
def extract_and_clean_emails(text, pipeline):
    prompt = f"""
    Extract all email addresses from the following text.
    Clean the extracted email addresses by removing any unnecessary characters or formatting issues.
    Output only the cleaned email addresses, one per line.
    
    Text: {text}
    
    Cleaned and extracted email addresses:
    """
    # Generate the response
    response = generate_text_with_icl(prompt, pipeline, [{"input": text, "output": ""}], max_new_tokens=256, temperature=0.00001, top_p=0.99)
    return response

def process_dataframe(pipeline, df: pd.DataFrame, text_column: str):
    # Extract and clean emails
    processed_emails = df[text_column].apply(lambda x: extract_and_clean_emails(x, pipeline))
    
    # Create a new dataframe with processed emails
    new_df = df.copy()
    new_df['processed_emails'] = processed_emails
    
    # Reorder columns to put processed_emails right after the original text column
    cols = list(new_df.columns)
    text_index = cols.index(text_column)
    cols.insert(text_index + 1, cols.pop(cols.index('processed_emails')))
    new_df = new_df[cols]
    
    return new_df
chil_info = process_dataframe(llama3, chil_info, "authors")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zrr@bupt.edu.cn
euphy@bupt.edu.cn
cy_z_feng@bupt.edu.cn


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


haoran@cs.toronto.edu
amyxlu@cs.toronto.edu
msa@cs.toronto.edu
mmd@mit.edu
marzyeh@cs.toronto.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


neal.ravindra@yale.edu
arijit.sehanobish@yale.edu
jenna.pappalardo@yale.edu
david.hafler@yale.edu
david.vandijk@yale.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


np6@princeton.edu
bee@cs.princeton.edu
finale@seas.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


vishwalim@nyu.edu
nabeel@nyu.edu
rumi.chunara@nyu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


minhunl@cs.cmu.edu
dps@cs.cmu.edu
asim@cs.cmu.edu
alex@isr.tecnico.ulisboa.pt
sergi.bermudez@m-iti.org


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zidi.xiu@duke.edu
chenyang.tao@duke.edu
ricardo.henao@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


aewj@mit.edu
lucas1@mit.edu
tpollard@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zhangwei@cs.wisc.edu
kuangz@stanford.edu
Peissig.Peggy@marshfieldresearch.org
david.page@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


jjanizek@cs.washington.edu
erion@cs.washington.edu
degrave@cs.washington.edu
suinlee@cs.washington.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


milaha@amazon.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


yubin.park@emory.edu
yubin@bonsairesearch.com
joyce.c.ho@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kmlewis@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


paidamoyo.chapfuwa@duke.edu
chunyl@microsoft.com
nm208@duke.edu
lcarin@duke.edu
ricardo.henao@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


thomas.hooven@chp.edu
adam.lin@columbia.edu
ansafsalleb@columbia.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


sanja.scepanovic@nokia-bell-labs.com
enrique.martin-lopez@nokia-bell-labs.com
quercia@cantab.net
khan.baykaner@nokia-bell-labs.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


luke.oakden-rayner@adelaide.edu.au
jdunnmon@cs.stanford.edu
gustavo.carneiro@adelaide.edu.au
chrismre@cs.stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


shirlywang@cs.toronto.edu
mmd@mit.edu
geeticka@mit.edu
marzyeh@cs.toronto.edu
mhughes@cs.tufts.edu
tristan@microsoft.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


jackzhu@nyu.edu
narges.razavian@nyulangone.org


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


david.dov@duke.edu
serge.assaad@duke.edu
shijing.si@duke.edu
rui.wang16@duke.edu
hongtengxu@ruc.edu.cn


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


guoweial001@e.ntu.edu.sg
kors0001@e.ntu.edu.sg
asysong@ntu.edu.sg
fernaldo.winnerdy@ntu.edu.sg
kwlim@ntu.edu.sg


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Here are the cleaned and extracted email addresses:


raouf.kerkouche@inria.fr
acs@crysys.hu
claude.castelluccia@inria.fr
pierre.geneves@cnrs.fr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ecr38@cam.ac.uk
pl219@cam.ac.uk
stephanie.hyland@microsoft.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


konstantin.pandl@kit.edu
scott.thiebes@kit.edu
fabianfeiland@web.de
sunyaev@kit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


saahil.jain@cs.stanford.edu
akshaysm@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


jean.feng@ucsf.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ds806@cl.cam.ac.uk
ip325@cam.ac.uk
sb400@cam.ac.uk
njw1004@cam.ac.uk
cm542@cl.cam.ac.uk


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


khorrams@oregonstate.edu
lawsont@oregonstate.edu
lif@oregonstate.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


linial04@campus.technion.ac.il
neta.r@technion.ac.il
danny.eytan@technion.ac.il
urishalit@technion.ac.il


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


alexke@cs.stanford.edu
willells@cs.stanford.edu
oishi.banerjee@cs.stanford.edu
ang@cs.stanford.edu
pranavsr@cs.stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ho.danliang@u.nus.edu
iain.tan.b.h@singhealth.com.sg
motani@nus.edu.sg


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


araghu@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


basilmaag@gmail.com
sfeuerriegel@ethz.ch
mathiaskraus@ethz.ch
maytal.saar-tsechansky@mccombs.utexas.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


xliu0@cs.washington.edu
ziheng@cs.washington.edu
jwfromm@octoml.ai
xuhaixu@cs.washington.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mattsap@aiwhoo.com
kumar@udel.edu
gdominic@udel.edu
decker@udel.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


paidamoyo.chapfuwa@duke.edu
serge.assaad@duke.edu
zengshx777@gmail.com
michal.pencina@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


pranavsr@cs.stanford.edu
anirudhjoshi@cs.stanford.edu
anujpare@cs.stanford.edu
ang@cs.stanford.edu
mlungren@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mmd@mit.edu
bretnestor@cs.toronto.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


haoran@cs.toronto.edu
dullerud@cs.toronto.edu
laleh@cs.toronto.edu
morrissq@mskcc.org
shalmali@seas.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


bonggun.shin@deargen.me
sspark@deargen.me
jy.bak@skku.edu
joyce.c.ho@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


gd5ss@virginia.edu
lc3cp@virginia.edu
dd3ar@virginia.edu
sk9epp@virginia.edu
lb3dp@virginia.edu
mob3f@virginia.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mohamed.ghalwash@ibm.com
zijun.yao@ibm.com
prithwish.chakraborty@ibm.com
jvcodella@gmail.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


flores.gerardo@gmail.com
georgechen@cmu.edu
tpollard@mit.edu
azirikly@jhu.edu
michael.hughes@tufts.edu
tasmie.sarker@ahli.cc
joyce.c.ho@emory.edu
tristan@microsoft.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


twkillian@cs.toronto.edu
mghassem@mit.edu
shalmali@seas.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


florian.pfisterer@stat.uni-muenchen.de
chris.harbron@roche.com
gunther.jansen@roche.com
tao.xu.tx1@roche.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


talelh@kinstitute.org.il
chen@kinstitute.org.il


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


xiaolei.huang@memphis.edu
dernonco@adobe.com
mdredze@cs.jhu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


chuzhixuan.czx@alibaba-inc.com
rathbun@uga.edu
sheng.li@uga.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


vincent.jeanselme@mrc-bsu.cam.ac.uk
brian.tom@mrc-bsu.cam.ac.uk
jessica.barrett@mrc-bsu.cam.ac.uk


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mwai@chalmers.se
fredrik.johansson@chalmers.se


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mehdi.fatemi@microsoft.com
mary@cs.toronto.edu
petchj@hhsc.ca
nelsonwa@hhsc.ca
stuart.connolly@phri.ca
alexander.benz@phri.ca
anthony.carnicelli@duke.edu
mghassem@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


daeyoung.k@kaist.ac.kr
seongsu@kaist.ac.kr
shokim@kaist.ac.kr
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


addison.weatherhead@mail.utoronto.ca
robert.greer@sickkids.ca
michael-alice.moga@sickkids.ca
mjaye.mazwi@sickkids.ca
biliary.colic@gmail.com
anna.goldenberg@utoronto.ca
stonekaboni@cs.toronto.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


stonekaboni@cs.toronto.edu
morgensh@cs.toronto.edu
azadeh.assadi@sickkids.ca
apokhrel@cs.toronto.edu
xi.huang1@sickkids.ca
anandj@cs.toronto.edu
robert.greer@sickkids.ca
pekhimenko@cs.toronto.edu
melissa.mccradden@sickkids.ca


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


pacesun@kaist.ac.kr
jiyounglee0523@kaist.ac.kr
ojw0123@kaist.ac.kr
wjprice@mit.edu
mdyhkim@amc.seoul.kr
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


haoranz@mit.edu
natalie.dullerud@mail.utoronto.edu
karsten.roth@uni-tuebingen.de
lauren.oakden-rayner@adelaide.edu.au
spfohl@stanford.edu
mghassem@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


juyongk@cs.cmu.edu
jeremyweiss@cmu.edu
pradeepr@cs.cmu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ankit.pal@saama.com
logesh.umapathi@saama.com
malaikannan.sankarasubbu@saama.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zxznm@kaist.ac.kr
seongsu@kaist.ac.kr
jiho283@kaist.ac.kr
tackeun.kim@snu.ac.kr
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


araghu@mit.edu
divyas@mit.edu
epomerantsev@mgh.harvard.edu
guttag@mit.edu
cmstultz@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kwanlee9209@aitrics.com
hyewonj@mit.edu
kim79@cooper.edu
soul2star@nhimc.or.kr
hipo0207@yuhs.ac
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ojw0123@kaist.ac.kr
hschung@kaist.ac.kr
cto@medicalai.com
dghong@medicalai.com
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


jzhu4@andrew.cmu.edu
gdarnell@apple.com
agni@apple.com
dingzhao@cmu.edu
lbo@illinois.edu
xuanlong@umich.edu
shirleyr@apple.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


subhrajitroy@google.com
dmincu@google.com
levp@google.com
nrostamzadeh@google.com
chintanghate@google.com
natalieharris@google.com
christinium@google.com
schrouff@google.com
nenadt@deepmind.com
fletcher.hartsell@duke.edu
kheller@google.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


keramati@cs.stanford.edu
ogottesm@cs.brown.edu
lceli@mit.edu
finale@seas.harvard.edu
ebrun@cs.stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


shadi.rahimian@cispa.saarland
raouf.kerkouche@cispa.de
ina.kurth@dkfz-heidelberg.de
fritz@cispa.de


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


boba km@tamu.edu
tasmie.sarker@ahli.cc
Andrew Beam@hms.harvard.edu
joyce.c.ho@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ksali85@student.gsu.edu
kbbello1@student.gsu.edu
kpchourasia1@student.gsu.edu
krthazhepunathil1@student.gsu.edu
kpin-yu.chen@ibm.com
kimdad.khan@lums.edu.pk
kmpatterson30@gsu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


shuoshuo.liu@psu.edu
lynn.lin@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


toyoshid@ucsc.edu
fansx@uw.edu
tylermc@uw.edu
zhenkewu@umich.edu
lizehang@ucsc.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


toyoshid@ucsc.edu
fansx@uw.edu
tylermc@uw.edu
zhenkewu@umich.edu
lizehang@ucsc.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


siyitang@stanford.edu
jdunnmon@stanford.edu
liangqqu@hku.hk
ksaab@stanford.edu
tina4@stanford.edu
cleemess@stanford.edu
rubin@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ywu10@memphis.edu
i-chan.huang@stjude.org
xiaolei.huang@memphis.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zhi.chen1@duke.edu
ht395@cornell.edu
urszc@microsoft.com
cynthia@cs.duke.edu
rcaruana@microsoft.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


shiuhn95@snu.ac.kr
jhwon@snu.ac.kr
dlee0880@snu.ac.kr
renqianluo@microsoft.com
lijun.wu@microsoft.com
yingce.xia@microsoft.com
taoqin@microsoft.com
howardlee@snu.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


li-control.xu@connect.polyu.hk
bokelvin.liu@connect.polyu.hk
ameer-hamz.khan@polyu.edu.hk
cslfan@comp.polyu.edu.hk
xiao-ming.wu@polyu.edu.hk


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


UTD8HJ@VIRGINIA.EDU
JG5YCN@VIRGINIA.EDU
GUIMIND@AMAZON.COM
J.CARLYANG@EMORY.EDU
BRADJC@VIRGINIA.EDU
BTB5K@HSCMAIL.MCC.VIRGINIA.EDU
JZ9Q@VIRGINIA.EDU
EA6N@VIRGINIA.EDU


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


andre.manoel@microsoft.com
mirianh@microsoft.com
tal.baumel@microsoft.com
shize.su@microsoft.com
jialeichen@microsoft.com
rsim@microsoft.com
daniel.keegan.miller@gmail.com
dannykarmon@google.com
ydbdim@amazon.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


localh@kaist.ac.kr
yseongjunyang@krafton.com
daeyoung.k@kaist.ac.kr
radhikadua@google.com
jykim@kyuh.ac.kr
eunhoy@kaist.ac.kr
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


g.cina@amsterdamumc.nl
t.e.rober@uva.nl
r.goedhart2@uva.nl
s.i.birbil@uva.nl


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mikeam@cs.washington.edu
althoff@cs.washington.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mikeam@cs.washington.edu
estebans@cs.washington.edu
arinbjorn@evidation.com
pgade@evidation.com
eramirez@evidation.com
schmidt@cs.washington.edu
luca.foschini@sagebase.org
althoff@cs.washington.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kevinywu@stanford.edu
dominik dahlem@optum.com
christopher.hane@optum.com
eran.halperin@uhg.com
jamesz@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


cji@mit.edu
amalaa@berkeley.edu
dsontag@csail.mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


arvind.pillai.gr@dartmouth.edu
sknepal@cs.dartmouth.edu
andrew.t.p.campbell@gmail.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


eunbyeol.cho@kaist.ac.kr
mjbooo@kaist.ac.kr
pacesun@kaist.ac.kr
jiyoun.kim@kaist.ac.kr
jinsungyoon@google.com
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


qixuanj@mit.edu
j.h.f.oosterhoff@tudelft.nl
yepenghuang@hsph.harvard.edu
mghassem@mit.edu
gbrat@bidmc.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


hardy.hardy@mcgill.ca
hardy@mikroskil.ac.id
derek.ruths@mcgill.ca
nicholas.king@mcgill.ca


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


william.lacava@childrens.harvard.edu
elle.lett@childrens.harvard.edu
gwan@hsph.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


vincent.jeanselme@mrc-bsu.cam.ac.uk
changho.yoon@sjc.ox.ac.uk
brian.tom@mrc-bsu.cam.ac.uk
jessica.barrett@mrc-bsu.cam.ac.uk


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


hrf@umich.edu
joyclee@med.umich.edu
wiensj@umich.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kmatton@mit.edu
roblewis@mit.edu
guttag@mit.edu
picard@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


jzheng22@mit.edu
hanrui@mit.edu
anandc@mit.edu
aaguirre1@mgh.harvard.edu
songhan@mit.edu
hslee@mtl.mit.edu
sodini@mtl.mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


georgechen@cmu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


dotjandr@umich.edu
wiensj@umich.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


hlzhou@andrew.cmu.edu
yuwenc2@andrew.cmu.edu
zlipton@cmu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


iman@cs.umass.edu
mfiterau@cs.umass.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


yi.yang@emory.edu
hejie.cui@emory.edu
j.carlyang@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kailasv@stanford.edu
Gregory.Lyng@optum.com
brian.l.hill@optum.com
kimmo.karkkainen@optum.com
jeffrey.hertzberg@optum.com
jamesz@stanford.edu
eran.halperin@uhg.com


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


No email addresses found. We are not using code here.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


nsupriy@amazon.com
nmishra@amazon.com
ynaamad@amazon.com
rehg@gatech.edu
mehul@aryn.ai
awagner@bwh.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


LEHMER16@MIT.EDU
DEZ@MIT.EDU
DMAHAJA@US.IBM.COM
JONAS@XYLA.COM
MICAH@XYLA.COM
ZACK@XYLA.COM
DANIEL@XYLA.COM
PSZ@MIT.EDU
ALISTAIR.JOHNSON@SICKKIDS.CA
EALSENTZER@BWH.HARVARD.EDU


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


tpollard@mit.edu
edwardchoi@kaist.ac.kr
singhalp@pennmedicine.upenn.edu
michael.hughes@tufts.edu
elena.sizikova@fda.hhs.gov
bobakm@tamu.edu
iychen@berkeley.edu
few2001@med.cornell.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


wjhan@andrew.cmu.edu
dggomez@andrew.cmu.edu
aavi@andrew.cmu.edu
chaojind@andrew.cmu.edu
michael.a.rosenberg@cuanschutz.edu
dweber2@andrew.cmu.edu
emersonliu@msn.com
dingzhao@andrew.cmu.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kyriakos.schwarz@uzh.ch
alicia.pliegomendieta@usz.ch
maolaaisha.aminanmu@uzh.ch
lara.planas-paz@usz.ch
chantal.pauli@uzh.ch
ahmed.allam@uzh.ch
michael.krauthammer@uzh.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


haoting zhang@berkeley.edu
dz2478@columbia.edu
yunduan lin@berkeley.edu
jinghai he@berkeley.edu
QZhu@lbl.gov
maxshen@berkeley.edu
zyzheng@berkeley.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


s.nakamura.sakai@yale.edu
dennis.shung@yale.edu
jasjeet.sekhon@yale.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kskim@mgh.harvard.edu
sean05071@dgist.ac.kr
slangarica@mgh.harvard.edu
adham.alkhadrawi@mgh.harvard.edu
SDO@mgh.harvard.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


meerak@umich.edu
wiensj@umich.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


runze.yan@emory.edu
chengding@gatech.edu
ran.xiao@emory.edu
aleksandr.vladimirovich.fedorov@emory.edu
randall.lee@ucsf.edu
fnahab@emory.edu
xiao.hu@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


zdb6dz@virginia.edu
deb@virginia.edu
marc.adams@asu.edu
ssa2w@virginia.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


huiwei@cs.umass.edu
maxxu@gatech.edu
csamplawski@cs.umass.edu
jrehg@illinois.edu
santosh.kumar@memphis.edu
marlin@cs.umass.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


alfredn@kth.se
azizpour@kth.se


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ravva@udel.edu
pkullu@udel.edu
fahim@udel.edu
rlb@udel.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ran.xu@emory.edu
yiwenlu@sas.upenn.edu
chang.liu2@emory.edu
ychen123@pennmedicine.upenn.edu
yan.v.sun@emory.edu
xiao.hu@emory.edu
joyce.c.ho@emory.edu
j.carlyang@emory.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


fongkeisen@u.nus.edu
motani@nus.edu.sg


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


atoye@stevens.edu
lgomez@stevens.edu
samantha.kleinberg@stevens.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ab2947@cornell.edu
sh2574@cornell.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


christof.naumzik@googlemail.com
akongsted@health.sdu.dk
werner.vach@basel-academy.ch
feuerriegel@lmu.de


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ttumyche@kaist.ac.kr
dyan.lee717@gmail.com
wonjae.kim@navercorp.com
j1nhwa.kim@navercorp.com
tackeun.kim@snu.ac.kr
radio622@gmail.com
leonard.sunwoo@gmail.com
edwardchoi@kaist.ac.kr


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


marine.hoche@alumni.ethz.ch
omineeva@ethz.ch
manuel.burger@inf.ethz.ch
alessandro.blasimme@hest.ethz.ch
raetsch@inf.ethz.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


qingen@cunet.carleton.ca
yuhong.guo@carleton.ca


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


lorenzo.bini@unige.ch
fatemeh.nassajian@unige.ch
margarita.liarou@unige.ch
thomas.matthes@hcuge.ch
stephane.marchand-maillet@unige.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


stefan.hegselmann@uni-muenster.de
zjshen@mit.edu
flogierse@uni-muenster.de
monica.agrawal@duke.edu
dsontag@csail.mit.edu
xjiang@uni-muenster.de


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ke.wang064@duke.edu
jerry.yang@duke.edu
leeor.hershkovich@duke.edu
hayoung.jeong@duke.edu
bill.chen@duke.edu
karnika.singh@duke.edu
ali.roganizad@duke.edu
mobashir.shandhi@duke.edu
andrew.spector@duke.edu
jessilyn.dunn@duke.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


alireza.amirshahi@epfl.ch
jonathan.dan@epfl.ch
jose.mirandacalero@epfl.ch
amir.aminifar@eit.lth.se
david.atienza@epfl.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


akdb3@cam.ac.uk
p.h.g.hendrix@tilburguniversity.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


michael.lau2@columbia.edu
yt2793@columbia.edu
rk3291@columbia.edu
sa4166@columbia.edu
k.thakoor@columbia.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


hmahdi2026@meds.uwo.ca
e.nashnoush@mail.utoronto.ca
rami.saab@utoronto.ca
arjun.balachandar@mail.utoronto.ca
rishit@cs.toronto.edu
lucas.perri@outlook.com
h.khosravani@utoronto.ca


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Here are the cleaned and extracted email addresses:


raghav.tandon@gatech.edu
jläh@emory.edu
cassie.mitchell@bme.gatech.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


kevinywu@stanford.edu
wue@stanford.edu
krodolfa@law.stanford.edu
dho@law.stanford.edu
jamesz@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ahsan.hi@northeastern.edu
mcinerney.de@northeastern.edu
jkim@bwh.harvard.edu
cpotter3@bwh.harvard.edu
gsyoung@bwh.harvard.edu
s.amir@northeastern.edu
b.wallace@northeastern.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


nicolas.raymond2@usherbrooke.ca
hakima.laribi@usherbrooke.ca
mcaru@pennstatehealth.psu.edu
gmmitiche@esi.dz
valerie.marcil@umontreal.ca
maja.krajinovic@umontreal.ca


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ybkim95@mit.edu
xoxu@mit.edu
dmcduff@google.com
cynthiab@mit.edu
haewon@mit.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


hyeche@ethz.ch
manuel.burger@ethz.ch
dinara.veshchezerova@ethz.ch
raetsch@inf.ethz.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


mariia.sidulova@fda.hhs.gov
seyed.kahaki@fda.hhs.gov
hagemani@wustl.edu
alexej.gossmann@fda.hhs.gov


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


anzeyimana@umass.edu
anthonycampb@umass.edu
James.Scanlan@providence.org
jstekler@uw.edu
marquard@umn.edu
Barry.Saver@swedish.org
jgummeso@umass.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


tavisy@student.ethz.ch
mkuznetsova@ethz.ch
christian.holz@inf.ethz.ch
shkurta.gashi@ai.ethz.ch


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


am2@stanford.edu
joerke@stanford.edu
wdenton@stanford.edu
barbarae@stanford.edu
ebrun@cs.stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


marikacusick@stanford.edu
gchertow@stanford.edu
owens@stanford.edu
micwilliams@stanfordhealthcare.org
sherrirose@stanford.edu


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


pkasl@ucsd.edu
ssoltani@ucsd.edu
lbruce@ucsd.edu
vkviswan@ucsd.edu
wendy.hartogensis@ucsf.edu
a1gupta@ucsd.edu
ialtintas@ucsd.edu
Stephan.Dilchert@baruch.cuny.edu
mt3qb@mst.edu
raosukru@msu.edu
stickn21@msu.edu
nadendla@mst.edu
canfieldci@mst.edu


In [8]:
import pandas as pd

def clean_titles(pipeline, df: pd.DataFrame, prompt: str):
    cleaned_titles = []
    for title in df["title"]:
        formatted_prompt = prompt.format(title=title)
        cleaned_title = generate_text_with_icl(formatted_prompt, pipeline, [{"input": formatted_prompt, "output": ""}], max_new_tokens=256, temperature=0.00001, top_p=0.99)
        print(cleaned_title)
        cleaned_titles.append(cleaned_title)
    return cleaned_titles

def process_dataframe(pipeline, df: pd.DataFrame, prompt: str):
    # Clean the titles
    cleaned_titles = clean_titles(pipeline, df, prompt)
    
    # Create a new dataframe with cleaned titles
    new_df = df.copy()
    new_df['cleaned_title'] = cleaned_titles
    
    # Reorder columns to put cleaned_title right after the original title
    cols = list(new_df.columns)
    title_index = cols.index('title')
    cols.insert(title_index + 1, cols.pop(cols.index('cleaned_title')))
    new_df = new_df[cols]
    
    return new_df
cleaned_chil_info = process_dataframe(llama3, chil_info, title_cleaning_prompt)

# Display the first 5 rows of the new dataframe
print(cleaned_chil_info.head())
# Optionally, save the new dataframe to a CSV file
cleaned_chil_info.to_csv("processed_data/cleaned_chil_extracted_info.csv", index=False)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


BMM-Net: Automatic Segmentation of Edema in Optical Coherence Tomography Based on Boundary Detection and Multi-Scale Network


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Hurtful Words: Quantifying Biases in Clinical Contextual Word Embeddings


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Disease State Prediction from Single-Cell Data Using Graph Attention Networks


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Defining Admissible Rewards for High-Confidence Policy Evaluation in Batch Reinforcement Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Population-Aware Hierarchical Bayesian Domain Adaptation via Multi-Component Invariant Learning


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Interactive Hybrid Approach to Combine Machine and Human Intelligence for Personalized Rehabilitation Assessment


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Using SNOMED to Automate Clinical Concept Mapping


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Variational Learning of Individual Survival Distributions


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


De-Identification of Free-Text Medical Records Using Pre-Trained Bidirectional Transformers


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Adverse Drug Reaction Discovery from Electronic Health Records with Deep Neural Networks


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Interpretable Subgroup Discovery in Treatment Effect Estimation with Application to Opioid Prescribing Guidelines


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Analyzing the Role of Model Uncertainty for Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


An Adversarial Approach for the Robust Classification of Pneumonia from Chest Radiographs


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Explaining an Increase in Predicted Risk for Clinical Alerts


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Cali Forest: Calibrated Random Forest for Health Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Fast Learning-Based Registration of Sparse 3D Clinical Images


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multi-Modal Missing Data-Aware Stacked Autoencoder for Biomedical Abstract Screening


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Survival Cluster Analysis


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


TASE: Temporal and Static Tensor Factorization for Phenotyping Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multiple Instance Learning for Predicting Necrotizing Enterocolitis in Premature Infants Using Microbiome Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Extracting Medical Entities from Social Media


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Hidden Stratification Causes Clinically Meaningful Failures in Machine Learning for Medical Imaging


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


MIC-Extract: A Data Extraction, Preprocessing, and Representation Pipeline for


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Variationally Regularized Graph-Based Representation Learning for Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Affinitention Nets: Kernel Perspective on Attention Architectures for Set Classification with Applications to Medical Text and Images


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


RNA Alternative Splicing Prediction with Discrete Compositional Energy Network


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Concept-Based Model Explanations for Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Privacy-Preserving and Bandwidth-Efficient Federated Learning: An Application to In-Hospital Mortality Prediction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Temporal Pointwise Convolutional Networks for Length of Stay Prediction in the Intensive Care Unit


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Trustworthy: Scalable Data Valuation with the Shapley Value


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Visual Che Xbert: Addressing the Discrepancy Between Radiology Report Labels and Image Labels


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Learning to Safely Approve Updates to Machine Learning Algorithms


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Self-Supervised Transfer Learning of Physiological Representations from Free-Living Wearable Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


IGOS++: Integrated Gradient Optimized Saliency by Bilateral Perturbations


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Generative ODE Modeling with Known Unknowns


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Che X-Transfer: Performance and Parameter Efficiency of Image Net Models for Chest X-Ray Interpretation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Predictive Models for Colorectal Cancer Recurrence Using Multi-Modal Healthcare Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Learning to Predict with Supporting Evidence: Applications to Clinical Risk Prediction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Modeling Longitudinal Dynamics of Comorbidities


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Meta Phys: Few-Shot Adaptation for Non-Contact Physiological Measurement


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


T-DPSOM - An Interpretable Clustering Method for Unsupervised Learning of Patient Health States


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Cleaned title:


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Contextualization and Individualization for Just-in-Time Adaptive Interventions to Reduce Sedentary Behavior


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Enabling Counterfactual Survival Analysis with Balanced Representations


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


External: Generalization of Deep Learning Models for Chest X-ray Interpretation to Photos of X-rays and External Clinical Settings


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


A Comprehensive EHR Time-Series Pre-Training Benchmark


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


An Empirical Framework for Domain Generalization in Clinical Settings


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Controlled Molecule Generator for Optimizing Multiple Chemical Properties


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Influenza-Like Symptom Recognition Using Mobile Sensing and Graph Neural Networks


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Phenotypical Ontology-Driven Framework for Multi-Task Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Cleaned title:


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Counterfactually Guided Policy Transfer in Clinical Settings


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Evaluating Domain Generalization for Survival Analysis in Clinical Studies


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Estimating Model Performance on External Samples from Their Limited Statistical Characteristics


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Enriching Unsupervised User Embedding via Medical Concepts


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multi-Task Adversarial Learning for Treatment Effect Estimation in Basket Trials


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Neural Survival Clustering: Non-Parametric Mixture of Neural Networks for Survival Clustering


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


ADCB: An Alzheimer's Disease Simulator for Benchmarking Observational Estimators of Causal Effects


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Semi-Markov Offline Reinforcement Learning for Healthcare


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Uncertainty-Aware Text-to-Program for Question Answering on Structured Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Learning Unsupervised Representations for ICU Time Series


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


How to Validate Machine Learning Models Prior to Deployment: Silent Trial Protocol for Evaluation of Real-Time Models at ICU


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Unifying Heterogeneous Electronic Health Records Systems via Text-Based Code Embedding


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Improving the Fairness of Chest X-Ray Classifiers


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Context-Sensitive Spelling Correction of Clinical Text via Conditional Independence


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


MedMCQA: A Large-Scale Multi-Subject Choice Dataset for Medical Domain Question Answering


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Graph-Text Multi-Modal Pre-Training for Medical Representation Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Data Augmentation for Electrocardiograms


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Real-Time Seizure Detection Using EEG: A Comprehensive Comparison of Recent Approaches Under a Realistic Setting


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Lead-Agnostic Self-Supervised Learning for Local and Global Representations of Electrocardiogram


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Physio MTL: Personalizing Physiological Patterns Using Optimal Transport Multi-Task Regression


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Disability Prediction in Multiple Sclerosis Using Performance Outcome Measures and Demographic Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Identification of Subgroups with Similar Benefits in Off-Policy Evaluation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Practical Challenges in Differentially-Private Federated Survival Analysis of Medical Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Cleaned title:


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Virus2Vec: Viral Sequence Classification Using Machine Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Adaptive Weighted Multi-View Clustering


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Bayesian Active Questionnaire Design for Cause-of-Death Assignment Using Verbal Autopsies


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Supplementary Materials for Bayesian Active Questionnaire Design for Cause-of-Death Assignment Using Verbal Autopsies


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Modeling Multivariate Biosignals with Graph Neural Networks and Structured State Space Models


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Token Imbalance Adaptation for Radiology Report Generation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Supplementary


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Missing Values and Imputation in Healthcare Data: Can Interpretable Machine Learning Help?


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Interpretable Missing Values in Healthcare: Figure 7 - Impact of Father's Education on Infant Mortality Risk. Appendix A: Testing for MCAR with EBM: Case Study.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Revisiting Machine-Learning-Based Drug Repurposing: Indications Are Not a Right Prediction Target


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multi-Modal Pre-Training for Medical Vision-Language Understanding and Generation: An Empirical Study with a New Benchmark


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Dataset 1.1: Motivation for What Purpose Was the Dataset Created? The Proposed RGC Dataset Is Created for Medical Vision-Language Pre-Training and to Serve as a Benchmark for Medical Image-Text Retrieval and Report Generation.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


SRDA: Mobile Sensing-Based Fluid Overload Detection for End-Stage Kidney Disease Patients Using Sensor Relation Dual Autoencoder


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Federated Multilingual Models for Medical Transcript Analysis


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Toward the Practical Utility of Federated Learning in the Medical Domain


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Semantic Match: Debugging Feature Attribution Methods in XAI for Healthcare


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Self-Supervised Pretraining and Transfer Learning Enable Flu and COVID-19 Predictions in Small Mobile Sensing Datasets


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


HomeKit: A Benchmark for Time Series Classification on a Large Mobile Sensing Dataset with Laboratory-Tested Ground Truth of Influenza Infections


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Collecting Data When Missingness Is Unknown: A Method for Improving Model Performance Given Under-Reporting in Patient Populations


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Large-Scale Study of Temporal Shift in Health Insurance Claims


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Rare Life Event Detection via Mobile Sensing Using Multi-Task Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Rediscovery of CNN's Versatility for Text-Based Encoding of Raw Electronic Health Records


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Clinical Relevance Score for Guided Trauma Injury Pattern Discovery with Weakly Supervised β-VAE


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Who Controlled the Evidence? Question Answering for Disclosure Information Extraction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Fair Admission Risk Prediction with Proportional Multicalibration


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Neural Fine-Gray: Monotonic Neural Networks for Competing Risks


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Neural Fine-Gray: Monotonic Neural Networks for Competing Risks


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Denoising Autoencoders for Learning from Noisy Patient-Reported Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Contrastive Learning of Electrodermal Activity Representations for Stress Detection


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Machine Learning for Arterial Blood Pressure Prediction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


A General Framework for Visualizing Embedding Spaces of Neural Survival Analysis Models Based on Angular Information


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Leveraging an Alignment Set in Tackling Instance-Dependent Label Noise


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Evaluating Model Performance in Medical Datasets Over Time


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Evaluating Model Performance in Medical Datasets Over Time: A Snapshot into the State of Proceedings and the First 20 Papers That Came Up in the Radiology Medical Journal When Searching for the Keyword "Machine Learning" and Filtering for Papers from To


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multi-Wave: Multiresolution Deep Architectures through Wavelet Decomposition for Multivariate Time Series Prediction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


PTGB: Pre-Train Graph Neural Networks for Brain Network Analysis


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Understanding and Predicting the Effect of Environmental Factors on People with Type 2 Diabetes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Understanding and Predicting Environment Effects on Individuals with T2D: Appendix A - CGM Dataset. We include information on the range of values for each external factor in our dataset in Table 4. Note that all extreme weather events and all temporal events are binary-valued. We plot the distribution of the number of days recorded for each individual in Figure 4. The large spike in the last bucket contains roughly 10% of individuals in our cohort and represents people who have recorded M data on over 75% of days over the 2.5-year duration of data collection. Figure 4: Distribution of Number of Recorded Days of M Data per Individual. The spike in the last bucket is due to about 10% of individuals providing recordings (nearly) every day over the 2.5-year data collection period. B. Classifier Details: Information on the values input into the random forest classifiers is provided in Table 5. Note that the M and activity data is not used for classifiers, thus no M activity data features. C

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Explaining a Machine Learning Decision to Physicians via Counterfactuals


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Do We Still Need Clinical Language Models


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Cleaned title:


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Interpretation of Intracardiac Electrograms Through Textual Representations


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


DeepDOS: A Graph Neural Network-Based Drug Synergy Prediction Algorithm


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Daily Physical Activity Monitoring: Adaptive Learning from Multi-Source Motion Sensor Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Enhancing Collaborative Medical Outcomes through Private Synthetic Hypercube Augmentation: PriSHA


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Integrating Chat GPT into Secure Hospital Networks: A Case Study on Improving Radiology Report Analysis


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Multiple Instance Learning with Absolute Position Information


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


SQWA: Signal Quality Aware Deep Neural Architecture for Enhanced Accuracy in Atrial Fibrillation Detection from Noisy PG Signals


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Improved Bayesian Permutation Entropy Estimator


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Temporally Multi-Scale Sparse Self-Attention for Physical Activity Data Imputation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Regularizing and Interpreting Vision Transformers by Patch Selection on Echocardiography Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


A Machine Learning Approach for Predicting Upper Limb Motion Intentions with Multimodal Data in Virtual Reality


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


From Basic to Extra Features: Hypergraph Transformer Pre-Training Then Fine-Tuning for Balanced Clinical Predictions on EHR


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Explainable and Privacy-Preserving Machine Learning via Domain-Aware Symbolic Regression


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Simulation of Health Time Series with Nonstationarity


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Brain-Mamba: Encoding Brain Activity via Selective State Space Models


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Data-Driven Subgrouping of Patient Trajectories with Chronic Diseases: Evidence from Low Back Pain


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Vision-Language Generative Model for View-Specific Chest X-Ray Generation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Unsupervised Domain Adaptation for Medical Image Segmentation with Dynamic Prototype-Based Contrastive Learning


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Flow Cyt: A Comparative Study of Deep Learning Approaches for Multi-Class Classification in Cytometry Benchmarking


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


A Data-Centric Approach to Generate Faithful and High-Quality Patient Summaries with Large Language Models


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Addressing Wearable Sleep Tracking Inequity: A New Dataset and Novel Methods for a Population with Sleep Disorders


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


FETCH: A Fast and Efficient Technique for Channel Selection in G Wearable Systems


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Interpretable Breast Cancer Classification Using CNN on Mammographic Images


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Using Expert Gaze for Self-Supervised and Supervised Contrastive Learning of Glaucoma from OCT Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Tuning In: Analysis of Audio Classifier Performance in Clinical Settings with Limited Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Scalable Subtype and Stage Inference via Simultaneous Clustering of Subjects and Biomarkers


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Regulating AI Adaptation: An Analysis of Medical Device Updates


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Retrieving Evidence from EHRs with LMs: Possibilities and Challenges


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Development of Error Passing Network for Optimizing the Prediction of VO2 Peak in Acute Leukemia Survivors


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Health-LM: Large Language Models for Health Prediction via Wearable Sensor Data


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Dynamic Survival Analysis for Early Event Prediction


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Contextual Unsupervised Deep Clustering in Digital Pathology


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Dose Mate: A Real-World Evaluation of Machine Learning Classification of Pill Taking Using Wrist-Worn Motion Sensors


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Systematic Evaluation of Self-Supervised Learning Approaches for Wearable-Based Fatigue Recognition


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Adaptive Interventions with User-Defined Goals for Health Behavior Change


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Algorithmic Changes Are Not Enough: Evaluating the Removal of Race Adjustment from the eGFR Equation


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Across-Study Analysis of Wearable Datasets and the Generalizability of Acute Illness Monitoring Models
Learning Social Fairness Preferences from Non-Expert Stakeholder Opinions in Kidney Placement
   year                                              title  \
0  2020  B M M- Net: Automatic Segmentationof Edemain O...   
1  2020  Hurtful Words: Quantifying Biasesin Clinical C...   
2  2020  Disease State Prediction From Single- Cell Dat...   
3  2020  Defining Admissible Rewardsfor High- Confidenc...   
4  2020  Population-aware Hierarchical Bayesian Domain ...   

                                       cleaned_title  \
0  BMM-Net: Automatic Segmentation of Edema in Op...   
1  Hurtful Words: Quantifying Biases in Clinical ...   
2  Disease State Prediction from Single-Cell Data...   
3  Defining Admissible Rewards for High-Confidenc...   
4  Population-Aware Hierarchical Bayesian Domain ...   

                                             authors  \
0  Ruru Zhang, Beijing University of 

### get select API calls and save their queries to json somewhere.

In [1]:
import pandas as pd
import json
import os
from serpapi import GoogleSearch
from time import sleep

def spaces_to_underscores(filename):
    return filename.replace(' ', '_')

def underscores_to_spaces(filename):
    return filename.replace('_', ' ')

def clean_filename(title):
    # Remove invalid characters and limit length
    clean = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip()
    return clean[:100]  # Limit filename length

def rename_files_in_directory(directory, convert_function):
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            new_filename = convert_function(filename)
            old_path = os.path.join(directory, filename)
            new_path = os.path.join(directory, new_filename)
            os.rename(old_path, new_path)
            print(f"Renamed: {filename} -> {new_filename}")

def clean_existing_files(directory):
    rename_files_in_directory(directory, spaces_to_underscores)

def search_and_save_results(df, api_key, output_dir='serpapi'):
    os.makedirs(output_dir, exist_ok=True)

    for index, row in df.iterrows():
        cleaned_title = row['cleaned_title']
        filename = clean_filename(cleaned_title)
        filename = spaces_to_underscores(filename) + '.json'
        
        params = {
            "api_key": api_key,
            "engine": "google_scholar",
            "q": cleaned_title,
            "hl": "en"
        }
        
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
            
            print(f"Saved results for: {cleaned_title}")
            sleep(2)
        
        except Exception as e:
            print(f"Error processing {cleaned_title}: {str(e)}")
    
    print("Finished processing all titles.")

# Usage
api_key = "your_api_key_here"  # Replace with your actual API key
cleaned_chil_info = pd.read_csv("processed_data/cleaned_chil_extracted_info.csv")
cleaned_chil_info_subset = cleaned_chil_info.head(10)
clean_existing_files('serpapi') # save and then clean to underscores for reading

# Search and save results
# search_and_save_results(cleaned_chil_info_subset, api_key)
# Clean up existing files



# Example of converting back to spaces (if needed)
# rename_files_in_directory('serpapi', underscores_to_spaces)

### Scrape and record necessary data from downloaded json files

In [2]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
import os

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

print("NLTK version:", nltk.__version__)
print("NLTK data path:", nltk.data.path)

def load_json_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def extract_original_title(data):
    return data['search_parameters']['q']

def compare_titles(original_title, result_title):
    original_tokens = word_tokenize(original_title.lower())
    result_tokens = word_tokenize(result_title.lower())
    
    stop_words = set(stopwords.words('english'))
    original_tokens = [token for token in original_tokens if token.isalpha() and token not in stop_words]
    result_tokens = [token for token in result_tokens if token.isalpha() and token not in stop_words]
    overlapping_words = set(original_tokens) & set(result_tokens)
    return len(overlapping_words) >= 2

def extract_citations_from_file(file_path):
    data = load_json_data(file_path)
    original_title = extract_original_title(data)
    
    if 'organic_results' in data and len(data['organic_results']) > 0:
        result = data['organic_results'][0]
        
        # Compare titles before extracting information
        if compare_titles(original_title, result['title']):
            publication_info = result.get('publication_info', {})
            
            authors = publication_info.get('authors', [])
            author_names = [author.get('name', '') for author in authors]
            author_ids = [author.get('author_id', '') for author in authors]
            
            citation_info = {
                'original_title': original_title,
                'title': result.get('title', ''),
                'authors': ', '.join(author_names),
                'author_ids': ', '.join(author_ids),
                'publication': publication_info.get('summary', ''),
                'total_citations': result.get('inline_links', {}).get('cited_by', {}).get('total', 0),
                'pdf_link': next((r['link'] for r in result.get('resources', []) if r.get('file_format') == 'PDF'), None),
                'citation_link': result.get('inl ine_links', {}).get('cited_by', {}).get('link', '')
            }
            return citation_info
    return None

def process_directory_and_update_dataframe(directory_path, df):
    new_data = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            citation_info = extract_citations_from_file(file_path)
            if citation_info:
                new_data.append(citation_info)
    
    new_df = pd.DataFrame(new_data)
    
    # Merge the new dataframe with the original one
    merged_df = df.copy()
    for col in new_df.columns:
        if col in merged_df.columns:
            merged_df[col] = new_df[col].combine_first(merged_df[col])
        else:
            merged_df[col] = new_df[col]
    
    return merged_df

# Usage
directory_path = "serpapi/chil"  # Replace with your actual directory path
cleaned_chil_info = pd.read_csv("processed_data/cleaned_chil_extracted_info.csv")
cleaned_chil_info_subset = cleaned_chil_info.head(10)

updated_df = process_directory_and_update_dataframe(directory_path, cleaned_chil_info_subset)

# Save the updated dataframe
updated_df.to_csv("processed_data/updated_chil_extracted_info_subset_with_comparison.csv", index=False)

print("Updated dataframe head:")
print(updated_df.head())
print("\nUpdated dataframe info:")
print(updated_df.info())

NLTK version: 3.8.1
NLTK data path: ['/home/johnwu3/nltk_data', '/home/johnwu3/miniconda3/envs/LLM/nltk_data', '/home/johnwu3/miniconda3/envs/LLM/share/nltk_data', '/home/johnwu3/miniconda3/envs/LLM/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
Updated dataframe head:
   year                                              title  \
0  2020  Adverse drug reaction discovery from electroni...   
1  2020  BMM-Net: automatic segmentation of edema in op...   
2  2020  Deidentification of free-text medical records ...   
3  2020  Defining admissible rewards for high-confidenc...   
4  2020  Disease state prediction from single-cell data...   

                                 authors  \
0             W Zhang, P Peissig, D Page   
1                       HE, Z Ou, M Song   
2  AEW Johnson, L Bulgarelli, TJ Pollard   
3  N Prasad, B Engelhardt, F Doshi-Velez   
4               N Ravindra, A Sehanobish   

                   

In [22]:
updated_df.head(5)

Unnamed: 0,year,title,authors,processed_emails,cleaned_title,abstract,code_count,gitlab_count,zenodo_count,dataset_count,...,tuh_eeg_epilepsy_corpus_count,tuh_eeg_events_corpus_count,tuh_eeg_seizure_corpus_count,tuh_eeg_slowing_corpus_count,original_title,author_ids,publication,total_citations,pdf_link,citation_link
0,2020,Adverse drug reaction discovery from electroni...,"W Zhang, P Peissig, D Page",zrr@bupt.edu.cn\neuphy@bupt.edu.cn\ncy_z_feng@...,BMM-Net: Automatic Segmentation of Edema in Op...,Retinal effusions and cysts caused by the leak...,0,0,0,0,...,0,0,0,0,Adverse Drug Reaction Discovery from Electroni...,"ToHDHHsAAAAJ, h9soD-oAAAAJ, IlsuQ4oAAAAJ","W Zhang, Z Kuang, P Peissig, D Page - Proceedi...",8,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=10979...
1,2020,BMM-Net: automatic segmentation of edema in op...,"HE, Z Ou, M Song",haoran@cs.toronto.edu\namyxlu@cs.toronto.edu\n...,Hurtful Words: Quantifying Biases in Clinical ...,"In this work, we examine the extent to which e...",1,0,0,2,...,0,0,0,0,BMM-Net: Automatic Segmentation of Edema in Op...,"J4akh64AAAAJ, aYvRNFYAAAAJ, 6MRGS-4AAAAJ","R Zhang, J He, S Shi, HE, Z Ou, M Song - Proce...",1,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=54690...
2,2020,Deidentification of free-text medical records ...,"AEW Johnson, L Bulgarelli, TJ Pollard",neal.ravindra@yale.edu\narijit.sehanobish@yale...,Disease State Prediction from Single-Cell Data...,Single-cell RNA sequencing (scRNA-seq) has rev...,1,0,0,0,...,0,0,0,0,De-Identification of Free-Text Medical Records...,"66GCSz8AAAAJ, vKfiYkcAAAAJ, kd2pMFYAAAAJ","AEW Johnson, L Bulgarelli, TJ Pollard - Procee...",67,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=70993...
3,2020,Defining admissible rewards for high-confidenc...,"N Prasad, B Engelhardt, F Doshi-Velez",np6@princeton.edu\nbee@cs.princeton.edu\nfinal...,Defining Admissible Rewards for High-Confidenc...,A key impediment to reinforcement learning (RL...,0,0,0,1,...,0,0,0,0,Defining Admissible Rewards for High-Confidenc...,"P6PMzKAAAAAJ, VEGtG7YAAAAJ, hwQtFB0AAAAJ","N Prasad, B Engelhardt, F Doshi-Velez - Procee...",6,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=18316...
4,2020,Disease state prediction from single-cell data...,"N Ravindra, A Sehanobish",vishwalim@nyu.edu\nnabeel@nyu.edu\nrumi.chunar...,Population-Aware Hierarchical Bayesian Domain ...,While machine learning is rapidly being develo...,1,0,0,0,...,0,0,0,0,Disease State Prediction from Single-Cell Data...,"atVAWRMAAAAJ, MEby6-QAAAAJ","N Ravindra, A Sehanobish, JL Pappalardo… - Pro...",44,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=87006...


In [9]:
type(updated_df["author_ids"][0])

str

In [12]:
updated_df.head(10)

Unnamed: 0,year,title,cleaned_title,authors,abstract,code_count,gitlab_count,zenodo_count,dataset_count,mimic_count,...,tuh_eeg_epilepsy_corpus_count,tuh_eeg_events_corpus_count,tuh_eeg_seizure_corpus_count,tuh_eeg_slowing_corpus_count,original_title,author_ids,publication,total_citations,pdf_link,citation_link
0,2020,Adverse drug reaction discovery from electroni...,BMM-Net: Automatic Segmentation of Edema in Op...,"W Zhang, P Peissig, D Page",Retinal effusions and cysts caused by the leak...,0,0,0,0,0,...,0,0,0,0,Adverse Drug Reaction Discovery from Electroni...,"ToHDHHsAAAAJ, h9soD-oAAAAJ, IlsuQ4oAAAAJ","W Zhang, Z Kuang, P Peissig, D Page - Proceedi...",8,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=10979...
1,2020,BMM-Net: automatic segmentation of edema in op...,Hurtful Words: Quantifying Biases in Clinical ...,"HE, Z Ou, M Song","In this work, we examine the extent to which e...",1,0,0,2,1,...,0,0,0,0,BMM-Net: Automatic Segmentation of Edema in Op...,"J4akh64AAAAJ, aYvRNFYAAAAJ, 6MRGS-4AAAAJ","R Zhang, J He, S Shi, HE, Z Ou, M Song - Proce...",1,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=54690...
2,2020,Deidentification of free-text medical records ...,Disease State Prediction from Single-Cell Data...,"AEW Johnson, L Bulgarelli, TJ Pollard",Single-cell RNA sequencing (scRNA-seq) has rev...,1,0,0,0,0,...,0,0,0,0,De-Identification of Free-Text Medical Records...,"66GCSz8AAAAJ, vKfiYkcAAAAJ, kd2pMFYAAAAJ","AEW Johnson, L Bulgarelli, TJ Pollard - Procee...",67,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=70993...
3,2020,Defining admissible rewards for high-confidenc...,Defining Admissible Rewards for High-Confidenc...,"N Prasad, B Engelhardt, F Doshi-Velez",A key impediment to reinforcement learning (RL...,0,0,0,1,1,...,0,0,0,0,Defining Admissible Rewards for High-Confidenc...,"P6PMzKAAAAAJ, VEGtG7YAAAAJ, hwQtFB0AAAAJ","N Prasad, B Engelhardt, F Doshi-Velez - Procee...",6,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=18316...
4,2020,Disease state prediction from single-cell data...,Population-Aware Hierarchical Bayesian Domain ...,"N Ravindra, A Sehanobish",While machine learning is rapidly being develo...,1,0,0,0,0,...,0,0,0,0,Disease State Prediction from Single-Cell Data...,"atVAWRMAAAAJ, MEby6-QAAAAJ","N Ravindra, A Sehanobish, JL Pappalardo… - Pro...",44,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=87006...
5,2020,Hurtful words: quantifying biases in clinical ...,Interactive Hybrid Approach to Combine Machine...,"H Zhang, AX Lu, M Abdalla, M McDermott",Automated assessment of rehabilitation exercis...,0,0,0,0,0,...,0,0,0,0,Hurtful Words: Quantifying Biases in Clinical ...,"6aWRAPkAAAAJ, gQpYbRsAAAAJ, U8D2dlMAAAAJ, _V96...","H Zhang, AX Lu, M Abdalla, M McDermott… - proc...",176,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=15347...
6,2020,Interactive hybrid approach to combine machine...,Using SNOMED to Automate Clinical Concept Mapping,"MH Lee, DP Siewiorek, A Smailagic",The International Classification of Disease (I...,0,0,0,0,0,...,0,0,0,0,Interactive Hybrid Approach to Combine Machine...,"quDiEBkAAAAJ, B13SvmAAAAAJ, qMCGI94AAAAJ","MH Lee, DP Siewiorek, A Smailagic… - Proceedin...",35,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=10230...
7,2020,Population-aware hierarchical bayesian domain ...,Variational Learning of Individual Survival Di...,"V Mhasawade, NA Rehman, R Chunara",The abundance of modern health data provides m...,1,0,0,1,0,...,0,0,0,0,Population-Aware Hierarchical Bayesian Domain ...,"S64xxT0AAAAJ, Hrl6vaoAAAAJ, 7NhhkR8AAAAJ","V Mhasawade, NA Rehman, R Chunara - Proceeding...",10,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=20628...
8,2020,Variational learning of individual survival di...,De-Identification of Free-Text Medical Records...,"Z Xiu, C Tao, R Henao",The ability of caregivers and investigators to...,1,0,0,2,1,...,0,0,0,0,Variational Learning of Individual Survival Di...,"rDrIfkoAAAAJ, qyzhQgIAAAAJ, p_mm4-YAAAAJ","Z Xiu, C Tao, R Henao - Proceedings of the ACM...",19,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=30233...
9,2020,Using SNOMED to automate clinical concept mapping,Adverse Drug Reaction Discovery from Electroni...,"P Long, O Doyle",Adverse drug reactions (ADRs) are detrimental ...,0,0,0,0,0,...,0,0,0,0,Using SNOMED to Automate Clinical Concept Mapping,"Xg4y16YAAAAJ, hAD669MAAAAJ","S Gupta, F Dieleman, P Long, O Doyle… - Procee...",1,https://dl.acm.org/doi/pdf/10.1145/3368555.338...,https://scholar.google.com/scholar?cites=81364...


In [15]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# ... [Previous code remains unchanged] ...

def count_nan_titles(chil_df, ml4h_df, mlhc_df):
    """
    Counts the number of rows where the 'title' column is NaN for each dataframe.
    
    Args:
    chil_df, ml4h_df, mlhc_df: pandas DataFrames containing paper data
    
    Returns:
    dict: A dictionary with the counts of NaN titles for each conference
    """
    nan_titles = {
        'CHIL': chil_df['title'].isna().sum(),
        'ML4H': ml4h_df['title'].isna().sum(),
        'MLHC': mlhc_df['title'].isna().sum()
    }
    
    total_nan_titles = sum(nan_titles.values())
    total_papers = len(chil_df) + len(ml4h_df) + len(mlhc_df)
    
    nan_titles['Total'] = total_nan_titles
    nan_titles['Percentage'] = (total_nan_titles / total_papers) * 100
    
    return nan_titles

# Load the dataframes
chil_semantic_df = pd.read_csv("processed_data/chil_semantic_scholar_citations.csv")
ml4h_semantic_df = pd.read_csv("processed_data/ml4h_semantic_scholar_citations.csv")
mlhc_semantic_df = pd.read_csv("processed_data/mlhc_semantic_scholar_citations.csv")

# Count NaN citations
nan_citation_counts = count_nan_citations(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN citation results
print("\nNaN Citation Counts:")
for conference, count in nan_citation_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Count uncleaned titles
uncleaned_title_counts = count_uncleaned_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the uncleaned title results
print("\nUncleaned Title Counts:")
for conference, count in uncleaned_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Count NaN titles
nan_title_counts = count_nan_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN title results
print("\nNaN Title Counts:")
for conference, count in nan_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# ... [Rest of the code remains unchanged] ...


NaN Citation Counts:
CHIL: 29
ML4H: 16
MLHC: 110
Total: 155
Percentage: 24.41%

Uncleaned Title Counts:
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%

NaN Title Counts:
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%


In [17]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# ... [Previous functions remain unchanged] ...

# Load the dataframes
chil_semantic_df = pd.read_csv("processed_data/chil_semantic_scholar_citations.csv")
ml4h_semantic_df = pd.read_csv("processed_data/ml4h_semantic_scholar_citations.csv")
mlhc_semantic_df = pd.read_csv("processed_data/mlhc_semantic_scholar_citations.csv")

# Print original dataframe sizes
print("\nOriginal Dataframe Sizes:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Count NaN titles before removal
nan_title_counts = count_nan_titles(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the NaN title results before removal
print("\nNaN Title Counts (Before Removal):")
for conference, count in nan_title_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Remove rows with NaN titles
chil_semantic_df = chil_semantic_df.dropna(subset=['title'])
ml4h_semantic_df = ml4h_semantic_df.dropna(subset=['title'])
mlhc_semantic_df = mlhc_semantic_df.dropna(subset=['title'])

# Print new dataframe sizes
print("\nDataframe Sizes After Removing NaN Titles:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Recount NaN citations after removing rows with NaN titles
nan_citation_counts = count_nan_citations(chil_semantic_df, ml4h_semantic_df, mlhc_semantic_df)

# Print the new NaN citation results
print("\nNaN Citation Counts (After Removing NaN Titles):")
for conference, count in nan_citation_counts.items():
    if conference == 'Percentage':
        print(f"{conference}: {count:.2f}%")
    else:
        print(f"{conference}: {count}")

# Save cleaned dataframes to new CSV files
output_dir = "processed_data/cleaned"
os.makedirs(output_dir, exist_ok=True)

chil_semantic_df.to_csv(f"{output_dir}/chil_semantic_scholar_citations_cleaned.csv", index=False)
ml4h_semantic_df.to_csv(f"{output_dir}/ml4h_semantic_scholar_citations_cleaned.csv", index=False)
mlhc_semantic_df.to_csv(f"{output_dir}/mlhc_semantic_scholar_citations_cleaned.csv", index=False)

print("\nCleaned CSV files have been saved in the 'processed_data/cleaned' directory.")

# ... [Rest of the code remains unchanged] ...


Original Dataframe Sizes:
CHIL: 155
ML4H: 134
MLHC: 346

NaN Title Counts (Before Removal):
CHIL: 4
ML4H: 3
MLHC: 23
Total: 30
Percentage: 4.72%

Dataframe Sizes After Removing NaN Titles:
CHIL: 151
ML4H: 131
MLHC: 323

NaN Citation Counts (After Removing NaN Titles):
CHIL: 25
ML4H: 13
MLHC: 87
Total: 125
Percentage: 20.66%

Cleaned CSV files have been saved in the 'processed_data/cleaned' directory.


In [34]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

# ... [Previous functions remain unchanged] ...

def get_titles_with_nan_citations(df):
    """
    Returns a list of cleaned titles for papers with NaN citation counts.
    
    Args:
    df: pandas DataFrame containing paper data
    
    Returns:
    list: A list of cleaned titles for papers with NaN citation counts
    """
    return df[df['citation_count'].isna()]['cleaned_title'].tolist()

# Load the cleaned dataframes
chil_semantic_df = pd.read_csv("processed_data/specifically_cleaned/chil_semantic_scholar_citations.csv")
ml4h_semantic_df = pd.read_csv("processed_data/specifically_cleaned/ml4h_semantic_scholar_citations.csv")
mlhc_semantic_df = pd.read_csv("processed_data/specifically_cleaned/mlhc_semantic_scholar_citations.csv")

# Get lists of titles with NaN citations for each conference
chil_nan_citation_titles = get_titles_with_nan_citations(chil_semantic_df)
ml4h_nan_citation_titles = get_titles_with_nan_citations(ml4h_semantic_df)
mlhc_nan_citation_titles = get_titles_with_nan_citations(mlhc_semantic_df)

# Print the results
print("\nTitles with NaN citations:")
print(f"\nCHIL ({len(chil_nan_citation_titles)}):")
for title in chil_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(chil_nan_citation_titles) > 5:
#     print("...")

print(f"\nML4H ({len(ml4h_nan_citation_titles)}):")
for title in ml4h_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(ml4h_nan_citation_titles) > 5:
#     print("...")

print(f"\nMLHC ({len(mlhc_nan_citation_titles)}):")
for title in mlhc_nan_citation_titles:  # Print first 5 for brevity
    print(f"- {title}")
# if len(mlhc_nan_citation_titles) > 5:
#     print("...")

# ... [Rest of the code remains unchanged] ...


Titles with NaN citations:

CHIL (23):
- MIC-Extract: A Data Extraction, Preprocessing, and Representation Pipeline for
- Visual Che Xbert: Addressing the Discrepancy Between Radiology Report Labels and Image Labels
- Che X-Transfer: Performance and Parameter Efficiency of Image Net Models for Chest X-Ray Interpretation
- Interpretable Missing Values in Healthcare: Figure 7 - Impact of Father's Education on Infant Mortality Risk. Appendix A: Testing for MCAR with EBM: Case Study.
- Toward the Practical Utility of Federated Learning in the Medical Domain
- Evaluating Model Performance in Medical Datasets Over Time: A Snapshot into the State of Proceedings and the First 20 Papers That Came Up in the Radiology Medical Journal When Searching for the Keyword "Machine Learning" and Filtering for Papers from To
- Understanding and Predicting Environment Effects on Individuals with T2D: Appendix A - CGM Dataset. We include information on the range of values for each external factor in our dat

In [32]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os
import re

# ... [Previous functions remain unchanged] ...

def specific_cleaning(df):
    """
    Performs specific cleaning operations on the dataframe.
    
    Args:
    df: pandas DataFrame containing paper data
    
    Returns:
    pandas DataFrame: The cleaned dataframe
    """
    # Titles to remove
    titles_to_remove = [
        "Supplementary",
        "Motivation for What Purpose Was the Dataset Created?",
        "Conference Proceedings",
        "Variable Total Age (Mean SD) 55.14",
        "There is no title to clean",
        "Machine Learning for Healthcare August 8-10",
        "Development of a Clinical Decision Tool and Protocol for Identification and Treatment",
        "Clinical Abstract Track",
        "To 32.9 in 1,"
    ]
    
    # Remove rows with specified titles
    pattern = '|'.join(map(re.escape, titles_to_remove))
    df = df[~df['cleaned_title'].str.contains(pattern, case=False, regex=True)]
    
    # Clean specific titles
    df['cleaned_title'] = df['cleaned_title'].replace({
        "EG to Text: Learning to Write Medical Reports from G Recordings": 
        "EEG to Text: Learning to Write Medical Reports from EEG Recordings"
    })
    
    # Remove "Preprint: Under Review" and associated numbers
    df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+:\s*\d+–\d+,', '', x).strip())
    df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+–\d+,?', '', x).strip())
    
    # Remove rows where the cleaned title is empty after cleaning
    df = df[df['cleaned_title'] != '']
    
    return df

# Load the cleaned dataframes
chil_semantic_df = pd.read_csv("processed_data/cleaned/chil_semantic_scholar_citations_cleaned.csv")
ml4h_semantic_df = pd.read_csv("processed_data/cleaned/ml4h_semantic_scholar_citations_cleaned.csv")
mlhc_semantic_df = pd.read_csv("processed_data/cleaned/mlhc_semantic_scholar_citations_cleaned.csv")

# Apply specific cleaning to each dataframe
print("Applying specific cleaning operations...")
chil_semantic_df = specific_cleaning(chil_semantic_df)
ml4h_semantic_df = specific_cleaning(ml4h_semantic_df)
mlhc_semantic_df = specific_cleaning(mlhc_semantic_df)

# Print the number of rows in each dataframe after cleaning
print(f"\nRows after specific cleaning:")
print(f"CHIL: {len(chil_semantic_df)}")
print(f"ML4H: {len(ml4h_semantic_df)}")
print(f"MLHC: {len(mlhc_semantic_df)}")

# Save the specifically cleaned dataframes
output_dir = "processed_data/specifically_cleaned"
os.makedirs(output_dir, exist_ok=True)

chil_semantic_df.to_csv(f"{output_dir}/chil_semantic_scholar_citations.csv", index=False)
ml4h_semantic_df.to_csv(f"{output_dir}/ml4h_semantic_scholar_citations.csv", index=False)
mlhc_semantic_df.to_csv(f"{output_dir}/mlhc_semantic_scholar_citations.csv", index=False)

print(f"\nSpecifically cleaned CSV files have been saved in the '{output_dir}' directory.")

# Function to check for "preprint" in titles
def count_preprint_titles(df):
    return df['cleaned_title'].str.lower().str.contains('preprint').sum()

# Count and print the number of titles containing "preprint"
print("\nNumber of titles containing 'preprint':")
print(f"CHIL: {count_preprint_titles(chil_semantic_df)}")
print(f"ML4H: {count_preprint_titles(ml4h_semantic_df)}")
print(f"MLHC: {count_preprint_titles(mlhc_semantic_df)}")

# Function to get titles containing "preprint"
def get_preprint_titles(df):
    return df[df['cleaned_title'].str.lower().str.contains('preprint')]['cleaned_title'].tolist()

# Print any titles that still contain "preprint"
for name, df in [("CHIL", chil_semantic_df), ("ML4H", ml4h_semantic_df), ("MLHC", mlhc_semantic_df)]:
    preprint_titles = get_preprint_titles(df)
    if preprint_titles:
        print(f"\n{name} title(s) still containing 'preprint':")
        for title in preprint_titles:
            print(f"- {title}")
    else:
        print(f"\nNo {name} titles contain 'preprint'.")

# ... [Rest of the code remains unchanged] ...

Applying specific cleaning operations...

Rows after specific cleaning:
CHIL: 148
ML4H: 127
MLHC: 283

Specifically cleaned CSV files have been saved in the 'processed_data/specifically_cleaned' directory.

Number of titles containing 'preprint':
CHIL: 0
ML4H: 0
MLHC: 0

No CHIL titles contain 'preprint'.

No ML4H titles contain 'preprint'.

No MLHC titles contain 'preprint'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cleaned_title'].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cleaned_title'].apply(lambda x: re.sub(r'Preprint: Under Review \d+:\s*\d+–\d+,', '', x).strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_title'] = df['cle

In [36]:
import pandas as pd
import json
import os
from serpapi import GoogleSearch
from time import sleep
import re

def clean_filename(title):
    # Remove invalid characters and limit length
    clean = re.sub(r'[^\w\s-]', '', title).strip()
    return clean[:100]  # Limit filename length

def search_and_save_results(df, api_key, output_dir='serpapi_results'):
    os.makedirs(output_dir, exist_ok=True)
    
    # Filter for rows where citation_count is NaN
    df_nan_citations = df[pd.isna(df['citation_count'])]
    
    print(f"Processing {len(df_nan_citations)} titles with NaN citation counts...")
    
    for index, row in df_nan_citations.iterrows():
        cleaned_title = row['cleaned_title']
        filename = clean_filename(cleaned_title)
        filename = filename.replace(' ', '_') + '.json'
        
        params = {
            "api_key": api_key,
            "engine": "google_scholar",
            "q": cleaned_title,
            "hl": "en"
        }
        
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
            
            print(f"Saved results for: {cleaned_title}")
            sleep(2)  # To avoid hitting rate limits
        
        except Exception as e:
            print(f"Error processing {cleaned_title}: {str(e)}")
    
    print("Finished processing all titles with NaN citation counts.")

# Usage
api_key = ""  # Replace with your actual API key

# Load the cleaned dataframes
input_dir = "processed_data/specifically_cleaned"
chil_df = pd.read_csv(f"{input_dir}/chil_semantic_scholar_citations.csv")
ml4h_df = pd.read_csv(f"{input_dir}/ml4h_semantic_scholar_citations.csv")
mlhc_df = pd.read_csv(f"{input_dir}/mlhc_semantic_scholar_citations.csv")

# Process each dataframe
for name, df in [("CHIL", chil_df), ("ML4H", ml4h_df), ("MLHC", mlhc_df)]:
    print(f"\nProcessing {name} titles...")
    output_dir = f"serpapi_results_{name.lower()}_nan_citations"
    search_and_save_results(df, api_key, output_dir)

print("All processing completed.")


Processing CHIL titles...
Processing 23 titles with NaN citation counts...
Saved results for: MIC-Extract: A Data Extraction, Preprocessing, and Representation Pipeline for
Saved results for: Visual Che Xbert: Addressing the Discrepancy Between Radiology Report Labels and Image Labels
Saved results for: Che X-Transfer: Performance and Parameter Efficiency of Image Net Models for Chest X-Ray Interpretation
Saved results for: Interpretable Missing Values in Healthcare: Figure 7 - Impact of Father's Education on Infant Mortality Risk. Appendix A: Testing for MCAR with EBM: Case Study.
Saved results for: Toward the Practical Utility of Federated Learning in the Medical Domain
Saved results for: Evaluating Model Performance in Medical Datasets Over Time: A Snapshot into the State of Proceedings and the First 20 Papers That Came Up in the Radiology Medical Journal When Searching for the Keyword "Machine Learning" and Filtering for Papers from To
Saved results for: Understanding and Predicti

In [1]:
import pandas as pd
import json
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def clean_filename(title):
    # Remove special characters and limit length
    clean = re.sub(r'[^\w\s-]', '', title).strip()
    clean = re.sub(r'\s+', '_', clean)  # Replace spaces with underscores
    return clean[:100]  # Limit filename length

def compare_titles(original_title, result_title):
    original_tokens = word_tokenize(original_title.lower())
    result_tokens = word_tokenize(result_title.lower())
    stop_words = set(stopwords.words('english'))
    original_tokens = [token for token in original_tokens if token.isalpha() and token not in stop_words]
    result_tokens = [token for token in result_tokens if token.isalpha() and token not in stop_words]
    overlapping_words = set(original_tokens) & set(result_tokens)
    return len(overlapping_words) >= 3

def extract_citations_from_file(file_path, original_title):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    if 'organic_results' in data and len(data['organic_results']) > 0:
        result = data['organic_results'][0]
        if compare_titles(original_title, result['title']):
            return result.get('inline_links', {}).get('cited_by', {}).get('total', 0)
    return None

def process_dataframe(df, input_dir):
    # Filter for rows where citation_count is NaN
    df_nan_citations = df[pd.isna(df['citation_count'])]
    
    print(f"Processing {len(df_nan_citations)} titles with NaN citation counts...")
    unmatched_count = 0
    for index, row in df_nan_citations.iterrows():
        cleaned_title = row['cleaned_title']
        filename = clean_filename(cleaned_title) + '.json'
        file_path = os.path.join(input_dir, filename)
        
        if os.path.exists(file_path):
            # Extract citations
            citations = extract_citations_from_file(file_path, cleaned_title)
            if citations is not None:
                df.loc[index, 'citation_count'] = citations
            else:
                unmatched_count += 1
        else:
            print(f"File not found for: {cleaned_title}")
            unmatched_count += 1
    
    # Drop rows with NaN citation counts
    df_cleaned = df.dropna(subset=['citation_count'])
    
    return df_cleaned, unmatched_count

# Usage
# Load the cleaned dataframes
input_dir = "processed_data/specifically_cleaned"
chil_df = pd.read_csv(f"{input_dir}/chil_semantic_scholar_citations.csv")
ml4h_df = pd.read_csv(f"{input_dir}/ml4h_semantic_scholar_citations.csv")
mlhc_df = pd.read_csv(f"{input_dir}/mlhc_semantic_scholar_citations.csv")

# Process each dataframe
for name, df in [("CHIL", chil_df), ("ML4H", ml4h_df), ("MLHC", mlhc_df)]:
    print(f"\nProcessing {name} titles...")
    input_dir = f"serpapi_results_{name.lower()}_nan_citations"
    df_cleaned, unmatched_count = process_dataframe(df, input_dir)
    
    # Save the cleaned dataframe
    output_file = f"processed_data/cleaned_{name.lower()}_citations.csv"
    df_cleaned.to_csv(output_file, index=False)
    print(f"Saved cleaned {name} dataframe to {output_file}")
    
    # Print statistics
    print(f"Total entries processed: {len(df)}")
    print(f"Entries with NaN citations: {len(df) - len(df_cleaned)}")
    print(f"Entries without matching SerpAPI results: {unmatched_count}")
    print(f"Entries with updated citations: {len(df_cleaned) - (len(df) - len(df[pd.isna(df['citation_count'])]))}")

print("All processing completed.")


Processing CHIL titles...
Processing 23 titles with NaN citation counts...
Saved cleaned CHIL dataframe to processed_data/cleaned_chil_citations.csv
Total entries processed: 148
Entries with NaN citations: 1
Entries without matching SerpAPI results: 1
Entries with updated citations: 0

Processing ML4H titles...
Processing 9 titles with NaN citation counts...
Saved cleaned ML4H dataframe to processed_data/cleaned_ml4h_citations.csv
Total entries processed: 127
Entries with NaN citations: 4
Entries without matching SerpAPI results: 4
Entries with updated citations: 0

Processing MLHC titles...
Processing 60 titles with NaN citation counts...
Saved cleaned MLHC dataframe to processed_data/cleaned_mlhc_citations.csv
Total entries processed: 283
Entries with NaN citations: 25
Entries without matching SerpAPI results: 25
Entries with updated citations: 0
All processing completed.
