In [71]:
import pandas as pd
from semanticscholar import SemanticScholar
import openai
from xml.etree import ElementTree
from Bio import Entrez

In [47]:
from file_io import get_model_directory_path
import os
import json
import re
## Read an example response from the chatgpt_response (only the Summary)
with open('config.json') as config_file:
    data = json.load(config_file)

#`MODEL_ANNOTATION_ROOT` is the path to the root directory of the model annotation repository
os.environ['MODEL_ANNOTATION_ROOT'] = data["MODEL_ANNOTATION_ROOT"]

# load the API key
key = data["OPENAI_API_KEY"]
os.environ['MODEL_ANNOTATION_ROOT'] = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
model_name = data["MAP_NAME"]

version = data["MAP_V"]

file_name = data["MAP_FILE"]

# example Cluster1-5
system = 'Cluster1-5'
response_path = os.path.join(get_model_directory_path(model_name, version),
system, f"{system}_chatgpt_response")


def get_summary(file_name):
    with open(file_name, "r") as file:
        content = file.read()
        
    # Regular expression pattern for matching the Summary section
    pattern = re.compile(r'(#+\s*)?(\*{1,2}|_{1,2})?Summary[:\s]*(.*?)(#+\s*)?References', re.DOTALL | re.IGNORECASE)
    match = pattern.search(content)

    if match:
        cleaned_text = re.sub(r'[\*#]', '', match.group(3))  # Remove remaining asterisks and hash symbols
        summary = cleaned_text.strip()  # Remove leading and trailing whitespace
        # print(summary)
        return summary
    else:
        print("No Summary found.")
        return None

#usage 
paragraphs = get_summary(response_path+'.md')
paragraphs = list(filter(lambda p: len(p.split()) > 5, paragraphs.split("\n")))

In [48]:
paragraphs

['The human protein system analyzed in this report consists of 37 proteins primarily localized in the mitochondrion, with many of them involved in protein binding, cytosol, and mitochondrial matrix. The system is involved in various biological processes such as tricarboxylic acid cycle, electron transfer activity, electron transport chain, fatty acid beta-oxidation, and response to xenobiotic stimulus. Based on the cellular location and function, we propose the name "Mitochondrial Protein Interaction System" for this protein system.',
 'The Mitochondrial Protein Interaction System is predominantly composed of proteins localized in the mitochondrion (37 proteins), with a significant number of them involved in protein binding (35 proteins) and cytosol (28 proteins) [1]. The system is involved in several essential biological processes, including the tricarboxylic acid cycle (4 proteins: IDH2, IDH3A, FH, ACO2), electron transfer activity (4 proteins: CIAPIN1, ME2, AKR7A2, AKR7A3), and elec

In [72]:
import requests

api_key = data["OPENAI_API_KEY"]
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

In [73]:
max_tokens = data["MAX_TOKENS"] # Set your max tokens here
rate_per_token = data["RATE_PER_TOKEN"]# Set your rate per token here 
model = data["GPT_MODEL"]
DOLLAR_LIMIT = data["DOLLAR_LIMIT"]  # Set your dollar limit here
logfile_name = "valid_ref_" #data["LOG_NAME"] # Set your log file name here
LOG_FILE = os.path.join(get_model_directory_path(model_name, version), f"{logfile_name}log.json")

def load_log(LOG_FILE):
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r") as f:
            return json.load(f)
    else:
        return {"tokens_used": 0, "dollars_spent": 0.0, "time_taken_last_run": 0.0, "time_taken_total": 0.0}

def save_log(LOG_FILE,log_data):
    with open(LOG_FILE, "w") as f:
        json.dump(log_data, f, indent=4)

def estimate_cost(tokens, rate_per_token):
    return tokens * rate_per_token


def get_keyword_from_paragraph(paragraph, gpt_model='gpt-4', verbose=False):
    log_data = load_log(LOG_FILE)
    tokens_estimate = len(paragraph) + max_tokens
    query = """I have paragraph\nParagraph:\n%s\nI would like to search PubMed to validate this abstract. give me a list of 5 keywords. Keywords must include gene symbols and their related functions. please order keywords by their importance in paragraph, from high important to low important. Also genes should be located first. Just tell me keywords only with comma seperated without spacing"""%paragraph

    # print(query)

    keyword_extraction_data = {
    "model": gpt_model,
        "temperature": 0,
        "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
    ] + [{"role": "user", "content": query}]}
    
    if estimate_cost(log_data["tokens_used"] + tokens_estimate, rate_per_token) > DOLLAR_LIMIT:
        print("The API call is estimated to exceed the dollar limit. Aborting.")
        return

    try:

        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=keyword_extraction_data)

        response_json = response.json()

        tokens_used = response_json["usage"]["total_tokens"]
        # Update and save the log
        log_data["tokens_used"] += tokens_used
        log_data["dollars_spent"] = estimate_cost(log_data["tokens_used"], rate_per_token)
        print(tokens_used)
        save_log(LOG_FILE,log_data)

        if 'choices' in response_json.keys():
            result = response_json["choices"][0]["message"]["content"]
            
        else:
            result = None
        if verbose: 
            print("Query:")
            print(query)
            print("Result:")
            print(result)
        if result is not None:
            print("Tokens used: %s"%tokens_used)
            return [keyword.strip() for keyword in result.split(",")]
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    

In [74]:
get_keyword_from_paragraph(paragraphs[0], gpt_model='gpt-4', verbose=False)

232
Tokens used: 232


['RNA polymerase II',
 'chromatin remodeling',
 'SWI/SNF complex',
 'cell differentiation',
 'DNA repair']

In [52]:
get_keyword_from_paragraph(paragraphs[1], gpt_model='gpt-4', verbose=False)

Tokens used: 267


['IDH2',
 'IDH3A',
 'tricarboxylic acid cycle',
 'electron transfer activity',
 'electron transport chain']

In [51]:
get_keyword_from_paragraph(paragraphs[2], gpt_model='gpt-4', verbose=False)

Tokens used: 209


['ACADM',
 'fatty acid beta-oxidation',
 'ECHS1',
 'HADH',
 'response to xenobiotic stimulus']

In [24]:
def get_mla_citation(doi):
    url = f'https://api.crossref.org/works/{doi}'
    headers = {'accept': 'application/json'}
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        #print(data)
        item = data['message']
        
        authors = item['author']
        formatted_authors = []
        for author in authors:
            formatted_authors.append(f"{author['family']}, {author.get('given', '')}")
        authors_str = ', '.join(formatted_authors)
        
        title = item['title'][0]
        container_title = item['container-title'][0]
        year = item['issued']['date-parts'][0][0]
        volume = item.get('volume', '')
        issue = item.get('issue', '')
        page = item.get('page', '')
        
        mla_citation = f"{authors_str}. \"{title}.\" {container_title}"
        if volume or issue:
            mla_citation += f", vol. {volume}" if volume else ''
            mla_citation += f", no. {issue}" if issue else ''
        mla_citation += f", {year}, pp. {page}."
        
        return mla_citation

In [25]:
def get_mla_citation_from_pubmed_id(paper_dict):
    article = paper_dict['MedlineCitation']['Article']
    #print(article.keys())
    authors = article['AuthorList']
    formatted_authors = []
    for author in authors:
        last_name = author['LastName'] if author['LastName'] is not None else ''
        first_name = author['ForeName'] if author['ForeName'] is not None else ''
        formatted_authors.append(f"{last_name}, {first_name}")
    authors_str = ', '.join(formatted_authors)

    title = article['ArticleTitle']
    journal = article['Journal']['Title']
    year = article['Journal']['JournalIssue']['PubDate']['Year']
    page = article['Pagination']['MedlinePgn']
    mla_citation = f"{authors_str}. \"{title}\" {journal}"
    if "Volume" in article['Journal']['JournalIssue']['PubDate']:
        volume = article['Journal']['JournalIssue']['PubDate']['Volume']
        mla_citation += f", vol. {volume}" if volume else ''
    elif "Issue" in article['Journal']['JournalIssue']['PubDate']:
        issue = article['Journal']['JournalIssue']['PubDate']['Issue']
        mla_citation += f", no. {issue}" if issue else ''
    mla_citation += f", {year}, pp. {page}."
    return mla_citation

In [26]:
def get_citation(paper):
    names = ",".join([author['name'] for author in paper['authors']])
    corrected_title = paper['title']
    journal = paper['journal']['name']
    pub_date = paper['publicationDate']
    if 'volume' in paper['journal'].keys(): 
        volume = paper['journal']['volume'].strip()
    else:
        volume = ''
    if 'pages' in paper['journal'].keys():
        pages = paper['journal']['pages'].strip()
    else:
        doi = paper['externalIds']['DOI']
        pages = doi.strip().split(".")[-1]
    citation = f"{names}. {corrected_title} {journal} {volume} ({pub_date[0:4]}):{pages}"
    return citation

In [83]:
def get_references(queried_papers, paragraph, gpt_model='gpt-4', n=10, verbose=False):
    citations = []
    for paper in queried_papers:
        abstract = paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
        message = """I have pharagraph\n Pharagraph:\n%s\nand abstract.\n Abstract:\n%s\nDoes this abstract support this paragraph? Please tell me yes or no"""%(paragraph, abstract)
        
        reference_check_data = {
            "model": gpt_model,
            "temperature": 0,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
            ] + [{"role": "user", "content": message }],
        }
        reference_check_data['messages'].append({"role":"user", "content":message})

        log_data = load_log(LOG_FILE)
        tokens_estimate = len(paragraph) + max_tokens

        if estimate_cost(log_data["tokens_used"] + tokens_estimate, rate_per_token) > DOLLAR_LIMIT:
            print("The API call is estimated to exceed the dollar limit. Aborting.")
            return

        else:
            response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=reference_check_data)

            response_json = response.json()
            tokens_used = response_json["usage"]["total_tokens"]
            # Update and save the log
            log_data["tokens_used"] += tokens_used
            log_data["dollars_spent"] = estimate_cost(log_data["tokens_used"], rate_per_token)
            print(tokens_used)
            save_log(LOG_FILE,log_data)

            if 'choices' in response_json.keys():
                result = response_json['choices'][0]['message']['content']
                if result[:3].lower()=='yes':
                    try:
                        citation = get_mla_citation_from_pubmed_id(paper)
                        if citation not in citations:
                            citations.append(citation)
                    except Exception as e:
                        print("Cannot parse citation even though this paper support pargraph")
                        print("Error detail: ", e)
                        pass
                    if len(citations)>=n:
                        return citations
            else:
                result = "No"    
            if verbose:
                print("Title: ", paper['MedlineCitation']['Article']['ArticleTitle'])
                print("Query: ")
                print(message)
                print("Result:")
                print(result)
                print("="*200)

    return citations
        
        
        

In [76]:
def search_pubmed(keywords, email, sort_by='citation_count', retmax=10):
    Entrez.email = email

    search_query = f"{keywords} AND (hasabstract[text])"
    search_handle = Entrez.esearch(db='pubmed', term=search_query, sort=sort_by, retmax=retmax)
    search_results = Entrez.read(search_handle)
    search_handle.close()

    id_list = search_results['IdList']

    if not id_list:
        print("No results found.")
        return []

    fetch_handle = Entrez.efetch(db='pubmed', id=id_list, retmode='xml')
    articles = Entrez.read(fetch_handle)['PubmedArticle']
    fetch_handle.close()

    return articles

In [29]:
def get_papers(keywords, n, email):
    total_papers = []
    while True:
        keyword_joined = " AND ".join(["("+keyword+"[Title/Abstract])" for keyword in keywords])
        if len(keywords)==2:
                return total_papers
        try:
            semantic_scholar_queried_keywords= search_pubmed(keyword_joined, email=email, retmax=n)
            total_papers += list(semantic_scholar_queried_keywords[:n])
        except:
            pass
        keywords = keywords[:-1]
        print("Reducing keywords to %s"%",".join(keywords))
                

In [77]:
def get_references_for_paragraphs(paragraphs, email, n=5, gpt_model='gpt-4', verbose=False):
    references_paragraphs = []
    for i, paragraph in enumerate(paragraphs):
        if verbose:
            print("""Extracting keywords from paragraph\nParagraph:\n%s"""%paragraph)
            print("="*75)
        keywords = get_keyword_from_paragraph(paragraph, gpt_model=gpt_model, verbose=verbose)
        #keywords = list(sorted(keywords, key=len))
        keyword_joined = ",".join(keywords)
        print("Keywords: ", keyword_joined)
        print("Serching paper with keywords...")
        semantic_scholar_queried_keywords = get_papers(keywords, n, email)
        if len(semantic_scholar_queried_keywords)==0:
            print("No paper searched!!")
            references_paragraphs.append([])
        print("In paragraph %d, %d references are queried"%(i+1, len(semantic_scholar_queried_keywords)))
        references = get_references(semantic_scholar_queried_keywords, paragraph, gpt_model=gpt_model, n=n, verbose=verbose)
        references_paragraphs.append(references)
        print("In paragraph %d, %d references are matched"%(i+1, len(references)))
        print("")
        print("")
    n_refs = sum([len(refs) for refs in references_paragraphs])
    print("Total %d references are queried"%n_refs)
    print(references_paragraphs)
    # i = 1
    # referenced_paragraphs = ""
    # footer = "="*200+"\n"
    # for paragraph, references in zip(paragraphs, references_paragraphs):
    #     referenced_paragraphs += paragraph
    #     for reference in references:
    #         referenced_paragraphs += "[%d]"%i
    #         footer += "[%d] %s"%(i, reference) + '\n'
    #         i+=1
    #     referenced_paragraphs += "\n"
    # return referenced_paragraphs + footer
    i = 1
    footer = "### Validated References: \n"
    for references in references_paragraphs:
        for reference in references:
            if reference:
                footer += "[%d] %s"%(i, reference) + '\n'
                i+=1
    return footer
    
        

In [56]:
paragraphs_with_references = get_references_for_paragraphs(paragraphs, email = 'mhu@health.ucsd.edu', n=3, gpt_model="gpt-4", verbose=True)

Extracting keywords from paragraph
Paragraph:
The human protein system analyzed in this report consists of 37 proteins primarily localized in the mitochondrion, with many of them involved in protein binding, cytosol, and mitochondrial matrix. The system is involved in various biological processes such as tricarboxylic acid cycle, electron transfer activity, electron transport chain, fatty acid beta-oxidation, and response to xenobiotic stimulus. Based on the cellular location and function, we propose the name "Mitochondrial Protein Interaction System" for this protein system.
Query:
I have paragraph
Paragraph:
The human protein system analyzed in this report consists of 37 proteins primarily localized in the mitochondrion, with many of them involved in protein binding, cytosol, and mitochondrial matrix. The system is involved in various biological processes such as tricarboxylic acid cycle, electron transfer activity, electron transport chain, fatty acid beta-oxidation, and response to

In [63]:
references = get_references_for_paragraphs(paragraphs, email = 'mhu@health.ucsd.edu', n=3, gpt_model="gpt-4", verbose=True)

Extracting keywords from paragraph
Paragraph:
The human protein system analyzed in this report consists of 37 proteins primarily localized in the mitochondrion, with many of them involved in protein binding, cytosol, and mitochondrial matrix. The system is involved in various biological processes such as tricarboxylic acid cycle, electron transfer activity, electron transport chain, fatty acid beta-oxidation, and response to xenobiotic stimulus. Based on the cellular location and function, we propose the name "Mitochondrial Protein Interaction System" for this protein system.
Query:
I have paragraph
Paragraph:
The human protein system analyzed in this report consists of 37 proteins primarily localized in the mitochondrion, with many of them involved in protein binding, cytosol, and mitochondrial matrix. The system is involved in various biological processes such as tricarboxylic acid cycle, electron transfer activity, electron transport chain, fatty acid beta-oxidation, and response to

In [64]:
# Cluster 1-5
references

'### References: \n[1] Paredes, Gabriela F, Viehboeck, Tobias, Markert, Stephanie, Mausz, Michaela A, Sato, Yui, Liebeke, Manuel, König, Lena, Bulgheresi, Silvia. "Differential regulation of degradation and immune pathways underlies adaptation of the ectosymbiotic nematode Laxus oneistus to oxic-anoxic interfaces." Scientific reports, 2022, pp. 9725.\n[2] Jain, Naintara, Gomkale, Ridhima, Bernhard, Olaf, Rehling, Peter, Cruz-Zaragoza, Luis Daniel. "A quantitative fluorescence-based approach to study mitochondrial protein import." EMBO reports, 2023, pp. e55760.\n[3] Ke, Nijia, Kumka, Joseph E, Fang, Mingxu, Weaver, Brian, Burstyn, Judith N, Bauer, Carl E. "Redox Brake Regulator RedB and FnrL Function as Yin-Yang Regulators of Anaerobic-Aerobic Metabolism in Rhodobacter capsulatus." Microbiology spectrum, 2022, pp. e0235422.\n'

In [66]:
# Cluster 5-2
from file_io import get_model_directory_path
import os
import json
import re
## Read an example response from the chatgpt_response (only the Summary)
with open('config.json') as config_file:
    data = json.load(config_file)

#`MODEL_ANNOTATION_ROOT` is the path to the root directory of the model annotation repository
os.environ['MODEL_ANNOTATION_ROOT'] = data["MODEL_ANNOTATION_ROOT"]

# load the API key
key = data["OPENAI_API_KEY"]
os.environ['MODEL_ANNOTATION_ROOT'] = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
model_name = data["MAP_NAME"]

version = data["MAP_V"]

file_name = data["MAP_FILE"]


system = 'Cluster5-2'
response_path = os.path.join(get_model_directory_path(model_name, version),
system, f"{system}_chatgpt_response")


def get_summary(file_name):
    with open(file_name, "r") as file:
        content = file.read()
        
    # Regular expression pattern for matching the Summary section
    pattern = re.compile(r'(#+\s*)?(\*{1,2}|_{1,2})?Summary[:\s]*(.*?)(#+\s*)?References', re.DOTALL | re.IGNORECASE)
    match = pattern.search(content)

    if match:
        cleaned_text = re.sub(r'[\*#]', '', match.group(3))  # Remove remaining asterisks and hash symbols
        summary = cleaned_text.strip()  # Remove leading and trailing whitespace
        # print(summary)
        return summary
    else:
        print("No Summary found.")
        return None

#usage 
paragraphs = get_summary(response_path+'.md')
paragraphs = list(filter(lambda p: len(p.split()) > 5, paragraphs.split("\n")))

paragraphs

['The human protein system analyzed here consists of 20 proteins, most of which are involved in chromatin remodeling and transcription by RNA polymerase II. The proteins are also associated with various cellular components and complexes, such as the SWI/SNF complex, brahma complex, npBAF complex, nBAF complex, and GBAF complex. The system plays a role in multiple biological processes, including cell differentiation, cell population proliferation, and DNA repair. Based on the cellular location and function of these proteins, we propose the name "Chromatin Remodeling and Transcription Regulation System" for this protein system.']

In [84]:
references_5_2 = get_references_for_paragraphs(paragraphs, email = 'mhu@health.ucsd.edu', n=3, gpt_model="gpt-4", verbose=True)

Extracting keywords from paragraph
Paragraph:
The human protein system analyzed here consists of 20 proteins, most of which are involved in chromatin remodeling and transcription by RNA polymerase II. The proteins are also associated with various cellular components and complexes, such as the SWI/SNF complex, brahma complex, npBAF complex, nBAF complex, and GBAF complex. The system plays a role in multiple biological processes, including cell differentiation, cell population proliferation, and DNA repair. Based on the cellular location and function of these proteins, we propose the name "Chromatin Remodeling and Transcription Regulation System" for this protein system.
232
Query:
I have paragraph
Paragraph:
The human protein system analyzed here consists of 20 proteins, most of which are involved in chromatin remodeling and transcription by RNA polymerase II. The proteins are also associated with various cellular components and complexes, such as the SWI/SNF complex, brahma complex, np

In [85]:
references_5_2

'### Validated References: \n[1] Caretti, Giuseppina, Schiltz, R Louis, Dilworth, F Jeffrey, Di Padova, Monica, Zhao, Po, Ogryzko, Vasily, Fuller-Pace, Frances V, Hoffman, Eric P, Tapscott, Stephen J, Sartorelli, Vittorio. "The RNA helicases p68/p72 and the noncoding RNA SRA are coregulators of MyoD and skeletal muscle differentiation." Developmental cell, 2006, pp. 547-60.\n[2] Wang, Jie, Yan, Hai-Bo, Zhang, Qian, Liu, Wei-Yan, Jiang, Ying-Hua, Peng, Gang, Wu, Fei-Zhen, Liu, Xin, Yang, Peng-Yuan, Liu, Feng. "Enhancement of E-cadherin expression and processing and driving of cancer cell metastasis by ARID1A deficiency." Oncogene, 2021, pp. 5468-5481.\n[3] Mazina, M Yu, Nikolenko, Yu V, Krasnov, A N, Vorobyeva, N E. "[SWI/SNF Protein Complexes Participate in the Initiation and Elongation Stages of Drosophila hsp70 Gene Transcription]." Genetika, 2016, pp. 164-9.\n'

In [70]:
from pages_io import write_system_page
from file_io import get_root_path
write_system_page(references_5_2,'md',model_name, version, system, "valid_references", get_root_path()) 
write_system_page(references,'md',model_name, version, 'Cluster1-5', "valid_references", get_root_path())

In [97]:
if os.path.exists(os.path.join(get_model_directory_path(model_name, version),
        system, f"{system}_valid_references.md")):
    print("File exists")

File exists


In [1]:
from file_io import get_model_directory_path
from model_nodes_edges import load_nodes_edges
import os
import json
import re
## Read an example response from the chatgpt_response (only the Summary)
with open('config.json') as config_file:
    data = json.load(config_file)

#`MODEL_ANNOTATION_ROOT` is the path to the root directory of the model annotation repository
os.environ['MODEL_ANNOTATION_ROOT'] = data["MODEL_ANNOTATION_ROOT"]

# load the API key
key = data["OPENAI_API_KEY"]
os.environ['MODEL_ANNOTATION_ROOT'] = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
model_name = data["MAP_NAME"]

version = data["MAP_V"]

file_name = data["MAP_FILE"]

nodes, edges = load_nodes_edges(model_name, version, file_name)
systems = nodes['term'].values.tolist()
# response_path = os.path.join(get_model_directory_path(model_name, version),
# system, f"{system}_chatgpt_response")


def get_summary(file_name):
    with open(file_name, "r") as file:
        content = file.read()
        
    # Regular expression pattern for matching the Summary section
    pattern = re.compile(r'(#+\s*)?(\*{1,2}|_{1,2})?Summary[:\s]*(.*?)(#+\s*)?References', re.DOTALL | re.IGNORECASE)
    match = pattern.search(content)

    if match:
        cleaned_text = re.sub(r'[\*#]', '', match.group(3))  # Remove remaining asterisks and hash symbols
        summary = cleaned_text.strip()  # Remove leading and trailing whitespace
        # print(summary)
        return summary
    else:
        print("No Summary found.")
        return None

#usage 

# paragraphs = get_summary(response_path+'.md')
# paragraphs = list(filter(lambda p: len(p.split()) > 5, paragraphs.split("\n")))

# paragraphs

/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/May2023_final/muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned.edges
/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/May2023_final/muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned.nodes


In [3]:
nodes = nodes.sort_values(by=['size'], ascending=True)
systems = nodes.loc[nodes['size']<=50]['term'].values.tolist()
len(systems)

227

In [19]:
## check if these systems all have the valid_references.md file
import os 
for system in systems:
    ref_page = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_valid_references.md")
    if not os.path.exists(ref_page):
        print(system + " does not have a valid_references.md file")
    # combine the valid ref to the gpt responses 
    else:
        print('add valid ref to the gpt response for ' + system)
        # read the valid ref file
        with open(ref_page, "r") as file:
            content = file.read()
            # if empty afte the title, then add None
            if content == '### Validated References: \n':
                content = '### Validated References: \nNone\n\n'
            # Add additional newlines BETWEEN REFERENCES
            content = content.replace("\n", "\n\n")
        # read the gpt response file
        gpt_response_path = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_chatgpt_response.md")
        with open(gpt_response_path, "r") as file:
            response_content = file.read()
        # combine the two files
        combined_content = response_content + '\n\n' + content
        # print(combined_content)
        # write the combined file to the gpt response file
        new_gpt_response_path = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_chatgpt_response_validref.md")
        with open(new_gpt_response_path, "w") as file:
            file.write(combined_content)


add valid ref to the gpt response for Cluster4-56
add valid ref to the gpt response for Cluster3-69
add valid ref to the gpt response for Cluster4-55
add valid ref to the gpt response for Cluster4-51
add valid ref to the gpt response for Cluster4-45
add valid ref to the gpt response for Cluster2-77
add valid ref to the gpt response for Cluster3-74
add valid ref to the gpt response for Cluster2-76
add valid ref to the gpt response for Cluster4-46
add valid ref to the gpt response for Cluster3-66
add valid ref to the gpt response for Cluster3-68
add valid ref to the gpt response for Cluster3-77
add valid ref to the gpt response for Cluster3-67
add valid ref to the gpt response for Cluster2-75
add valid ref to the gpt response for Cluster4-52
add valid ref to the gpt response for Cluster3-71
add valid ref to the gpt response for Cluster3-75
add valid ref to the gpt response for Cluster4-49
add valid ref to the gpt response for Cluster4-53
add valid ref to the gpt response for Cluster4-60


In [20]:
import os 
from pages_io import create_music_2_system_analysis_page
from file_io import get_root_path, read_system_tsv

for system in systems:
    ref_page = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_chatgpt_response_validref.md")
    if not os.path.exists(ref_page):
        print(system + " does not have a chatgpt_response_validref.md file")
    
    # read the response + valid ref content and update to the analysis file
    else:
        print('update analysis page for ' + system)
        response_w_valid_ref = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_chatgpt_response_validref.md")
        with open(response_w_valid_ref, "r") as file:
            content = file.read()
        summarized_tsv = read_system_tsv(model_name, version, system, "go_summary", get_root_path())
        analysis_pg = create_music_2_system_analysis_page(system, content, nodes, summarized_tsv)
        old_analysis_pg = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_analysis_page.md")
        #change old analysis page to analysis_page_old
        os.rename(old_analysis_pg, old_analysis_pg[:-3]+'_old.md')
        # write the new analysis page to analysis_page file 
        with open(old_analysis_pg, "w") as file:
            file.write(analysis_pg)
        

update analysis page for Cluster4-56
update analysis page for Cluster3-69
update analysis page for Cluster4-55
update analysis page for Cluster4-51
update analysis page for Cluster4-45
update analysis page for Cluster2-77
update analysis page for Cluster3-74
update analysis page for Cluster2-76
update analysis page for Cluster4-46
update analysis page for Cluster3-66
update analysis page for Cluster3-68
update analysis page for Cluster3-77
update analysis page for Cluster3-67
update analysis page for Cluster2-75
update analysis page for Cluster4-52
update analysis page for Cluster3-71
update analysis page for Cluster3-75
update analysis page for Cluster4-49
update analysis page for Cluster4-53
update analysis page for Cluster4-60
update analysis page for Cluster3-79
update analysis page for Cluster3-73
update analysis page for Cluster4-50
update analysis page for Cluster4-47
update analysis page for Cluster3-72
update analysis page for Cluster1-27
update analysis page for Cluster4-43
u

In [21]:
import subprocess
git_repo_dir = "/cellar/users/mhu/src/MUSIC2_systems/May2023_final"
for system in systems:
    analysis_page = os.path.join(get_model_directory_path(model_name, version), system, f"{system}_analysis_page.md")
    # print(analysis_page)
    if not os.path.exists(analysis_page):
        print(f"Analysis page for {system} does not exist")
    else:
        # Replace the existing file with the updated file
        subprocess.run(['cp', '-f', analysis_page, git_repo_dir])


# ARCHIVE CODE

In [8]:
from Bio import Entrez
import requests
from file_io import get_model_directory_path,get_root_path
from pages_io import write_system_page
from model_nodes_edges import load_nodes_edges
import os
import json
import re

## Read an example response from the chatgpt_response (only the Summary)
with open('config.json') as config_file:
    data = json.load(config_file)

#`MODEL_ANNOTATION_ROOT` is the path to the root directory of the model annotation repository
os.environ['MODEL_ANNOTATION_ROOT'] = data["MODEL_ANNOTATION_ROOT"]

# load the API key
key = data["OPENAI_API_KEY"]
os.environ['MODEL_ANNOTATION_ROOT'] = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
model_name = data["MAP_NAME"]

version = data["MAP_V"]

file_name = data["MAP_FILE"]
api_key = data["OPENAI_API_KEY"]
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

max_tokens = data["MAX_TOKENS"] # Set your max tokens here
rate_per_token = data["RATE_PER_TOKEN"]# Set your rate per token here 
model = data["GPT_MODEL"]
DOLLAR_LIMIT = data["DOLLAR_LIMIT"]  # Set your dollar limit here
logfile_name = "valid_ref_" #data["LOG_NAME"] # Set your log file name here
LOG_FILE = os.path.join(get_model_directory_path(model_name, version), f"{logfile_name}log.json")

def load_log(LOG_FILE):
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r") as f:
            return json.load(f)
    else:
        return {"tokens_used": 0, "dollars_spent": 0.0, "time_taken_last_run": 0.0, "time_taken_total": 0.0}

def save_log(LOG_FILE,log_data):
    with open(LOG_FILE, "w") as f:
        json.dump(log_data, f, indent=4)

def estimate_cost(tokens, rate_per_token):
    return tokens * rate_per_token


def get_keyword_from_paragraph(paragraph, gpt_model='gpt-4', verbose=False):
    log_data = load_log(LOG_FILE)
    tokens_estimate = len(paragraph) + max_tokens
    query = """I have paragraph\nParagraph:\n%s\nI would like to search PubMed to validate this abstract. give me a list of 5 keywords. Keywords must include gene symbols and their related functions. please order keywords by their importance in paragraph, from high important to low important. Also genes should be located first. Just tell me keywords only with comma seperated without spacing"""%paragraph

    # print(query)

    keyword_extraction_data = {
    "model": gpt_model,
        "temperature": 0,
        "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
    ] + [{"role": "user", "content": query}]}
    
    if estimate_cost(log_data["tokens_used"] + tokens_estimate, rate_per_token) > DOLLAR_LIMIT:
        print("The API call is estimated to exceed the dollar limit. Aborting.")
        return

    try:

        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=keyword_extraction_data)

        response_json = response.json()

        tokens_used = response_json["usage"]["total_tokens"]
        # Update and save the log
        log_data["tokens_used"] += tokens_used
        log_data["dollars_spent"] = estimate_cost(log_data["tokens_used"], rate_per_token)
        print(tokens_used)
        save_log(LOG_FILE,log_data)

        if 'choices' in response_json.keys():
            result = response_json["choices"][0]["message"]["content"]
            
        else:
            result = None
        if verbose: 
            print("Query:")
            print(query)
            print("Result:")
            print(result)
        if result is not None:
            print("Tokens used: %s"%tokens_used)
            return [keyword.strip() for keyword in result.split(",")]
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    

def get_mla_citation(doi):
    url = f'https://api.crossref.org/works/{doi}'
    headers = {'accept': 'application/json'}
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        #print(data)
        item = data['message']
        
        authors = item['author']
        formatted_authors = []
        for author in authors:
            formatted_authors.append(f"{author['family']}, {author.get('given', '')}")
        authors_str = ', '.join(formatted_authors)
        
        title = item['title'][0]
        container_title = item['container-title'][0]
        year = item['issued']['date-parts'][0][0]
        volume = item.get('volume', '')
        issue = item.get('issue', '')
        page = item.get('page', '')
        
        mla_citation = f"{authors_str}. \"{title}.\" {container_title}"
        if volume or issue:
            mla_citation += f", vol. {volume}" if volume else ''
            mla_citation += f", no. {issue}" if issue else ''
        mla_citation += f", {year}, pp. {page}."
        
        return mla_citation

def get_mla_citation_from_pubmed_id(paper_dict):
    article = paper_dict['MedlineCitation']['Article']
    #print(article.keys())
    authors = article['AuthorList']
    formatted_authors = []
    for author in authors:
        last_name = author['LastName'] if author['LastName'] is not None else ''
        first_name = author['ForeName'] if author['ForeName'] is not None else ''
        formatted_authors.append(f"{last_name}, {first_name}")
    authors_str = ', '.join(formatted_authors)

    title = article['ArticleTitle']
    journal = article['Journal']['Title']
    year = article['Journal']['JournalIssue']['PubDate']['Year']
    page = article['Pagination']['MedlinePgn']
    mla_citation = f"{authors_str}. \"{title}\" {journal}"
    if "Volume" in article['Journal']['JournalIssue']['PubDate']:
        volume = article['Journal']['JournalIssue']['PubDate']['Volume']
        mla_citation += f", vol. {volume}" if volume else ''
    elif "Issue" in article['Journal']['JournalIssue']['PubDate']:
        issue = article['Journal']['JournalIssue']['PubDate']['Issue']
        mla_citation += f", no. {issue}" if issue else ''
    mla_citation += f", {year}, pp. {page}."
    return mla_citation

def get_citation(paper):
    names = ",".join([author['name'] for author in paper['authors']])
    corrected_title = paper['title']
    journal = paper['journal']['name']
    pub_date = paper['publicationDate']
    if 'volume' in paper['journal'].keys(): 
        volume = paper['journal']['volume'].strip()
    else:
        volume = ''
    if 'pages' in paper['journal'].keys():
        pages = paper['journal']['pages'].strip()
    else:
        doi = paper['externalIds']['DOI']
        pages = doi.strip().split(".")[-1]
    citation = f"{names}. {corrected_title} {journal} {volume} ({pub_date[0:4]}):{pages}"
    return citation

def get_references(queried_papers, paragraph, gpt_model='gpt-4', n=10, verbose=False):
    citations = []
    for paper in queried_papers:
        abstract = paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
        message = """I have pharagraph\n Pharagraph:\n%s\nand abstract.\n Abstract:\n%s\nDoes this abstract support this paragraph? Please tell me yes or no"""%(paragraph, abstract)
        
        reference_check_data = {
            "model": gpt_model,
            "temperature": 0,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
            ] + [{"role": "user", "content": message }],
        }
        reference_check_data['messages'].append({"role":"user", "content":message})

        log_data = load_log(LOG_FILE)
        tokens_estimate = len(paragraph) + max_tokens

        if estimate_cost(log_data["tokens_used"] + tokens_estimate, rate_per_token) > DOLLAR_LIMIT:
            print("The API call is estimated to exceed the dollar limit. Aborting.")
            return

        try:
            response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=reference_check_data)

            response_json = response.json()
        except Exception as e:
            print(f"An error occurred: {e}")
            continue

        tokens_used = response_json["usage"]["total_tokens"]
        # Update and save the log
        log_data["tokens_used"] += tokens_used
        log_data["dollars_spent"] = estimate_cost(log_data["tokens_used"], rate_per_token)
        print(tokens_used)
        save_log(LOG_FILE,log_data)

        if 'choices' in response_json.keys():
            result = response_json['choices'][0]['message']['content']
            if result[:3].lower()=='yes':
                try:
                    citation = get_mla_citation_from_pubmed_id(paper)
                    if citation not in citations:
                        citations.append(citation)
                except Exception as e:
                    print("Cannot parse citation even though this paper support pargraph")
                    print("Error detail: ", e)
                    pass
                if len(citations)>=n:
                    return citations
        else:
            result = "No"    
        if verbose:
            print("Title: ", paper['MedlineCitation']['Article']['ArticleTitle'])
            print("Query: ")
            print(message)
            print("Result:")
            print(result)
            print("="*200)

    return citations
        
def search_pubmed(keywords, email, sort_by='citation_count', retmax=10):
    Entrez.email = email

    search_query = f"{keywords} AND (hasabstract[text])"
    search_handle = Entrez.esearch(db='pubmed', term=search_query, sort=sort_by, retmax=retmax)
    search_results = Entrez.read(search_handle)
    search_handle.close()

    id_list = search_results['IdList']

    if not id_list:
        print("No results found.")
        return []

    fetch_handle = Entrez.efetch(db='pubmed', id=id_list, retmode='xml')
    articles = Entrez.read(fetch_handle)['PubmedArticle']
    fetch_handle.close()

    return articles

def get_papers(keywords, n, email):
    total_papers = []
    while True:
        keyword_joined = " AND ".join(["("+keyword+"[Title/Abstract])" for keyword in keywords])
        if len(keywords)==2:
                return total_papers
        try:
            semantic_scholar_queried_keywords= search_pubmed(keyword_joined, email=email, retmax=n)
            total_papers += list(semantic_scholar_queried_keywords[:n])
        except:
            pass
        keywords = keywords[:-1]
        print("Reducing keywords to %s"%",".join(keywords))
                
def get_references_for_paragraphs(paragraphs, email, n=5, gpt_model='gpt-4', verbose=False):
    references_paragraphs = []
    for i, paragraph in enumerate(paragraphs):
        if verbose:
            print("""Extracting keywords from paragraph\nParagraph:\n%s"""%paragraph)
            print("="*75)
        keywords = get_keyword_from_paragraph(paragraph, gpt_model=gpt_model, verbose=verbose)
        #keywords = list(sorted(keywords, key=len))
        keyword_joined = ",".join(keywords)
        print("Keywords: ", keyword_joined)
        print("Serching paper with keywords...")
        semantic_scholar_queried_keywords = get_papers(keywords, n, email)
        if len(semantic_scholar_queried_keywords)==0:
            print("No paper searched!!")
            references_paragraphs.append([])
        print("In paragraph %d, %d references are queried"%(i+1, len(semantic_scholar_queried_keywords)))
        references = get_references(semantic_scholar_queried_keywords, paragraph, gpt_model=gpt_model, n=n, verbose=verbose)
        references_paragraphs.append(references)
        print("In paragraph %d, %d references are matched"%(i+1, len(references)))
        print("")
        print("")
    n_refs = sum([len(refs) for refs in references_paragraphs])
    print("Total %d references are queried"%n_refs)
    print(references_paragraphs)
    # i = 1
    # referenced_paragraphs = ""
    # footer = "="*200+"\n"
    # for paragraph, references in zip(paragraphs, references_paragraphs):
    #     referenced_paragraphs += paragraph
    #     for reference in references:
    #         referenced_paragraphs += "[%d]"%i
    #         footer += "[%d] %s"%(i, reference) + '\n'
    #         i+=1
    #     referenced_paragraphs += "\n"
    # return referenced_paragraphs + footer
    i = 1
    footer = "### Validated References: \n"
    for references in references_paragraphs:
        for reference in references:
            if reference:
                footer += "[%d] %s"%(i, reference) + '\n'
                i+=1
    return footer

def get_summary(file_name):
        with open(file_name, "r") as file:
            content = file.read()
            
        # Regular expression pattern for matching the Summary section
        pattern = re.compile(r'(#+\s*)?(\*{1,2}|_{1,2})?Summary[:\s]*(.*?)(#+\s*)?References', re.DOTALL | re.IGNORECASE)
        match = pattern.search(content)

        if match:
            cleaned_text = re.sub(r'[\*#]', '', match.group(3))  # Remove remaining asterisks and hash symbols
            summary = cleaned_text.strip()  # Remove leading and trailing whitespace
            # print(summary)
            return summary
        else:
            print("No Summary found.")
            return None



nodes, edges = load_nodes_edges(model_name, version, file_name)
# sort nodes by size 
nodes = nodes.sort_values(by=['size'], ascending=True)
systems = nodes['term'].values.tolist()[:-3] #remove the root and 2 huge organelle nodes
system = 'Cluster3-22'
response_path = os.path.join(get_model_directory_path(model_name, version),
system, f"{system}_chatgpt_response")

paragraphs = get_summary(response_path+'.md')
paragraphs = list(filter(lambda p: len(p.split()) > 5, paragraphs.split("\n")))
# check if the valid reference is already generated
if os.path.exists(os.path.join(get_model_directory_path(model_name, version),
system, f"{system}_valid_references.md")):
    print("Valid reference already generated for %s"%system)

else:
    print("Generating valid reference for %s"%system)
    reference = get_references_for_paragraphs(paragraphs, email = data['EMAIL'], n=3, gpt_model=model, verbose=True)
    write_system_page(reference,'md',model_name, version, system, "valid_references", get_root_path())
        

/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/May2023_final/muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned.edges
/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/May2023_final/muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned.nodes
Generating valid reference for Cluster3-22
Extracting keywords from paragraph
Paragraph:
The human protein system analyzed here consists of 24 proteins involved in protein binding, with a majority of them (20 proteins) localized in the nucleoplasm and nucleus. The system is primarily involved in transcription by RNA polymerase II, DNA-templated transcription, histone H3 acetylation, DNA repair, and monoubiquitinated histone deubiquitination. The SAGA complex, transcription factor TFIID complex, and RNA polymerase II general transcription initiation factor activity are among the cellular componen