In [4]:
import requests
import PyPDF2
from dotenv import load_dotenv
import openai
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import json
from langchain.text_splitter import CharacterTextSplitter
import sentence_transformers
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import DirectoryLoader, PyPDFDirectoryLoader, CSVLoader
from langchain.indexes import VectorstoreIndexCreator
import getpass
from IPython.display import display, HTML

In [3]:
def search_semantic_scholar(query,year,n=100):
    current_index = 0
    r = requests.get(
    'https://api.semanticscholar.org/graph/v1/paper/search?query='+query+'&year='+year+'-',
    params={'fields': 'title,year,abstract,url,openAccessPdf,tldr,isOpenAccess,publicationDate'}
    )
    #print(json.dumps(r.json(), indent=2))
    r = r.json()
    #total number of papers found 
    total = r['total']
    offset = r['offset']
    #nextindex = r['next']
    final_results = pd.json_normalize(r['data'])
    current_index = offset+n+1
    print('Total papers: ',total)
    print('Current index: ',current_index)
    while current_index < total:
        r = requests.get(
        'https://api.semanticscholar.org/graph/v1/paper/search?query='+query+'&offset='+str(current_index)+'&year='+year+'-'+'&limit='+str(n),
        params={'fields': 'title,year,abstract,url,openAccessPdf,tldr,isOpenAccess,publicationDate'}
        )
        #print(json.dumps(r.json(), indent=2))
        r = r.json()
        offset = r['offset']
        #nextindex = r['next']
        results = pd.json_normalize(r['data'])
        final_results = pd.concat([final_results, results], ignore_index=True)
        current_index = offset+n+1
        print('Current index: ',current_index)
    return final_results


instance = search_semantic_scholar(query="HER3 ErBb3",year="2022",n=100)

Total papers:  476
Current index:  101
Current index:  202
Current index:  303
Current index:  404
Current index:  505


In [7]:
display(instance.sort_values('publicationDate',ascending=False))

Unnamed: 0,paperId,url,title,abstract,year,isOpenAccess,publicationDate,openAccessPdf.url,openAccessPdf.status,tldr.model,tldr.text,openAccessPdf,tldr
192,a2adc9d9eb46537935bbaa40d38703d43391defa,https://www.semanticscholar.org/paper/a2adc9d9...,Stromal-derived NRG1 enables oncogenic KRAS by...,Activating KRAS mutations (KRAS*) in pancreati...,2023,False,2023-09-29,,,tldr@v2.0.0,It is found that CAFs can contribute toKRAS* i...,,
210,a2adc9d9eb46537935bbaa40d38703d43391defa,https://www.semanticscholar.org/paper/a2adc9d9...,Stromal-derived NRG1 enables oncogenic KRAS by...,Activating KRAS mutations (KRAS*) in pancreati...,2023,False,2023-09-29,,,tldr@v2.0.0,It is found that CAFs can contribute toKRAS* i...,,
312,da8a72d58dfeaee4b019fa63499f03bff54396a6,https://www.semanticscholar.org/paper/da8a72d5...,A multi-ancestry genome-wide association study...,Type 1 diabetes is a chronic autoimmune diseas...,2023,True,2023-09-18,https://www.medrxiv.org/content/medrxiv/early/...,GREEN,tldr@v2.0.0,A multi-ancestry GWAS enabled identification o...,,
218,679151a019a3ac96b4937863a42d0d05934019f5,https://www.semanticscholar.org/paper/679151a0...,Role of human epidermal growth factor receptor...,ABSTRACT Background ALK tyrosine kinase inhibi...,2023,True,2023-09-18,https://www.tandfonline.com/doi/pdf/10.1080/15...,GOLD,tldr@v2.0.0,Evidence is provided that HER3 may mediate TKI...,,
79,df947244810339cfddf7a075c1059e49aca19724,https://www.semanticscholar.org/paper/df947244...,HER3 functions as an effective therapeutic tar...,,2023,True,2023-09-16,https://cancerci.biomedcentral.com/counter/pdf...,GOLD,tldr@v2.0.0,The data demonstrate that increased HER3 is an...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,0e8f9d5527245866bfc68962b2e4934392f4380f,https://www.semanticscholar.org/paper/0e8f9d55...,Upstream Process Design of Antibody-drug Conju...,: Colorectal cancer is a common tumor in the d...,2022,True,,https://francis-press.com/uploads/papers/ca5w2...,BRONZE,tldr@v2.0.0,The antibody selected by the drug is patrituma...,,
275,a1dd42028996f9414c4d6677d1f3fc0ea5e09650,https://www.semanticscholar.org/paper/a1dd4202...,Enabling Precision Therapy in Oncology: Liquid...,MYD88 PIK3C2G RAD51B SMAD4 TP53 ACVR1 BRCA1 CR...,2022,False,,,,tldr@v2.0.0,PIK3C2G RAD54L SNCAIP TSC2 AKT3 BTG2 CSF3R ERC...,,
310,61a6ec6b7130b84f024c3f6508be4b76a6a43220,https://www.semanticscholar.org/paper/61a6ec6b...,TITLE: Dasatinib Resensitizes MAPK Inhibitor E...,Resistance to combination BRAF/MEK inhibitor (...,2023,False,,,,tldr@v2.0.0,It is proposed that dasatinib-based MAPKi ther...,,
322,48871c227f1da93fe6b2656f9772cd941c920f7c,https://www.semanticscholar.org/paper/48871c22...,A Microfluidic SERS Assay to Characterize the ...,,2023,False,,,,tldr@v2.0.0,ESCP is demonstrated for characterizing the ex...,,


In [4]:
loader = PyPDFDirectoryLoader(".", glob="**/[!.]*.pdf")

In [34]:
def load_pdf():
    text=""
    loader = PyPDFDirectoryLoader(".", glob="**/[!.]*.pdf")
    for page in loader.load():
       text+=page.page_content
    return text
load_pdf()

'/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045 /gid00001\n/gid00048/gid00043/gid00031/gid00028/gid00047/gid00032/gid00046Citation: Miano, C.; Morselli, A.;\nPontis, F.; Bongiovanni, C.; Sacchi, F.;\nDa Pra, S.; Romaniello, D.; Tassinari,\nR.; Sgarzi, M.; Pantano, E.; et al.\nNRG1/ERBB3/ERBB2 Axis Triggers\nAnchorage-Independent Growth of\nBasal-like/Triple-Negative Breast\nCancer Cells. Cancers 2022 ,14, 1603.\nhttps://doi.org/10.3390/\ncancers14071603\nAcademic Editors: Reiner Strick and\nRamona Erber\nReceived: 4 February 2022\nAccepted: 16 March 2022\nPublished: 22 March 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional afﬁl-\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license (https://\ncreativecommons.org/licenses/

In [36]:
loader = CSVLoader(file_path='key_points.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['abstract', 'keypoints', 'target']
})
documents = loader.load()
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings)

def retrieve_info(query):
    response = db.similarity_search(query, 
                                    k=3) #retrieves top 3
    contents = [doc.page_content for doc in response]
    return contents

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

template = """
You are an expert summarizer on breast cancer research papers. I am going to share with you an academic paper
and you will give me a summarization of the most relevant information in 10 bullet points or less, based on past
summaries.
You will follow all the rules below: 

1/ Response must be similar to past summaries, in terms of length, and tone of voice
2/ You must list all the targets (ADC, HER3, ADC etc.) that the paper has mentioned, after the bullet points are formed
3/ If the summaries are found to be irrelevant, you may mention that you are unsure

Below is a typical academic paper on cancer: 
{message}

Here is a list of how the summaries are like for a similar paper: 
{summary}

Please write the summary for the paper below:
"""

prompt = PromptTemplate(
    input_variables=["message","summary"],
    template=template

)
chain = LLMChain(llm=llm, prompt=prompt)

def load_pdf():
    text=""
    loader = PyPDFDirectoryLoader(".", glob="**/[!.]*.pdf")
    for page in loader.load():
       text+=page.page_content
    return text

def generate_response(message):
    summary=retrieve_info(message)
    response = chain.run(message=message, summary=summary)
    return response

message = """
Abstract: ERBB3, also known as HER3, is a tyrosine kinase transmembrane receptor of the ERBB
family. Upon binding to neuregulin 1 (NRG1), ERBB3 preferentially dimerizes with HER2 (ERBB2),
in turn inducing aggressive features in several cancer types. The analysis of a dataset of breast
cancer patients unveiled that higher ERBB3 mRNA expression correlates with shorter relapse-free
survival in basal-like breast cancers, despite low ERBB3 expression in this breast cancer subtype.
Administration of neuregulin 1 beta (NRG1β) significantly affected neither cellular proliferation nor
the basal migratory ability of basal-like/triple-negative quasi-normal MCF10A breast cells, cultured
in mono-layer conditions. Furthermore, no significant regulation in cell morphology or in the
expression of basal/myoepithelial and luminal markers was observed upon stimulation with NRG1β.
In non-adherent conditions, NRG1βadministration to MCF10A cells did not significantly influence
cell survival; however, it robustly induced cell growth as spheroids (3D growth). Intriguingly, a
remarkable upregulation of ERBB3 and ERBB2 protein abundance was observed in 3D compared
to 2D cell cultures, and NRG1β-induced 3D cell growth was efficiently prevented by the anti-
HER2 monoclonal antibody pertuzumab. Similar results were obtained by the analysis of basal-
like/triple-negative breast cancer cellular models, MDA-MB-468 and MDA-MB-231 cells, in which
NRG1βinduced anchorage-independent cell growth that in turn was prevented or reduced by the
simultaneous administration of anti-HER2 neutralizing antibodies. Finally, the ability of pertuzumab
in suppressing NRG1β-induced 3D growth was also evaluated and confirmed in MCF10A engineered
with HER2-overexpression. We suggest that the NRG1/ERBB3/ERBB2 pathway promotes the
anchorage-independent growth of basal-like breast cancer cells. Importantly, we provide evidence
that ERBB2 neutralization, in particular by pertuzumab, robustly inhibits this process. Our results
pave the way towards the development of novel anticancer strategies for basal-like breast cancer
patients based on the interception of the NRG1/ERBB3/ERBB2 signaling axis"""
generate_response(message) 

'- Higher ERBB3 mRNA expression is correlated with shorter relapse-free survival in basal-like breast cancers.\n- Neuregulin 1 beta (NRG1β) does not significantly affect cellular proliferation or basal migratory ability in basal-like breast cells.\n- NRG1β administration induces 3D cell growth and upregulates ERBB3 and ERBB2 protein abundance.\n- The anti-HER2 monoclonal antibody pertuzumab prevents NRG1β-induced 3D cell growth.\n- Similar results were observed in basal-like/triple-negative breast cancer cellular models.\n- ERBB2 neutralization, particularly by pertuzumab, inhibits the anchorage-independent growth of basal-like breast cancer cells.\n- The NRG1/ERBB3/ERBB2 pathway promotes the anchorage-independent growth of basal-like breast cancer cells.\n- The interception of the NRG1/ERBB3/ERBB2 signaling axis could lead to novel anticancer strategies for basal-like breast cancer patients. \n\nTargets mentioned: ERBB3, HER2, NRG1β, pertuzumab.'