In [1]:
from Bio import Entrez
import pandas as pd

In [2]:
def fetch_papers(query: str, max_results: int = 10) -> list:
    #Fetch papers from PubMed with query
    Entrez.email = "zanwarpratham@gmail.com"  # Required by PubMed API
    try:
        # Search PubMed
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        search_results = Entrez.read(handle)
        handle.close()

        # Fetch details for each paper
        id_list = search_results["IdList"]
        handle = Entrez.efetch(db="pubmed", id=",".join(id_list), retmode="xml")
        papers = Entrez.read(handle)
        handle.close()

        return papers["PubmedArticle"]
    except Exception as e:
        print(f"Error fetching papers: {e}")
        return []

In [3]:
query = "cancer AND 2023"
papers = fetch_papers(query, max_results=5)
print(papers)

[{'MedlineCitation': DictElement({'CitationSubset': ['IM'], 'KeywordList': [ListElement([StringElement('I', attributes={'MajorTopicYN': 'N'}), StringElement('I0', attributes={'MajorTopicYN': 'N'}), StringElement('I1', attributes={'MajorTopicYN': 'N'}), StringElement('I10', attributes={'MajorTopicYN': 'N'}), StringElement('System dynamics modelling', attributes={'MajorTopicYN': 'N'}), StringElement('cancer control', attributes={'MajorTopicYN': 'N'}), StringElement('cancer diagnosis', attributes={'MajorTopicYN': 'N'}), StringElement('cancer screening', attributes={'MajorTopicYN': 'N'}), StringElement('cancer treatment', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'SpaceFlightMission': [], 'OtherID': [], 'GeneralNote': [], 'InvestigatorList': [], 'OtherAbstract': [DictElement({'AbstractText': ["This study systematically reviews the application of system dynamics modeling (SDM) in cancer control, aiming to assess the research quality and provide insights into the 

In [5]:
def filter_non_academic_authors(paper: dict) -> dict:
    non_academic_authors = []
    company_affiliations = []

    # Extract authors and affiliations
    authors = paper.get("MedlineCitation", {}).get("Article", {}).get("AuthorList", [])
    for author in authors:
        affiliations = author.get("AffiliationInfo", [])
        for aff in affiliations:
            affiliation = aff.get("Affiliation", "").lower()
            if "pharma" in affiliation or "biotech" in affiliation:
                non_academic_authors.append(author.get("LastName", "") + " " + author.get("ForeName", ""))
                company_affiliations.append(affiliation)

    return {
        "non_academic_authors": non_academic_authors,
        "company_affiliations": company_affiliations,
    }

In [6]:
for paper in papers:
    filtered = filter_non_academic_authors(paper)
    print(filtered)

{'non_academic_authors': ['Ung Carolina Oi Lam', 'Hu Hao'], 'company_affiliations': ['centre for pharmaceutical regulatory sciences, university of macau, macao.', 'centre for pharmaceutical regulatory sciences, university of macau, macao.']}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}


In [7]:
def generate_csv(papers: list, filename: str = None) -> None:
    data = []
    for paper in papers:
        filtered = filter_non_academic_authors(paper)
        if filtered["non_academic_authors"]:
            data.append({
                "PubmedID": paper.get("MedlineCitation", {}).get("PMID", ""),
                "Title": paper.get("MedlineCitation", {}).get("Article", {}).get("ArticleTitle", ""),
                "Publication Date": paper.get("MedlineCitation", {}).get("Article", {}).get("Journal", {}).get("JournalIssue", {}).get("PubDate", {}).get("Year", ""),
                "Non-academic Author(s)": ", ".join(filtered["non_academic_authors"]),
                "Company Affiliation(s)": ", ".join(filtered["company_affiliations"]),
                "Corresponding Author Email": ""  # Placeholder for now
            })

    df = pd.DataFrame(data)
    if filename:
        df.to_csv(filename, index=False)
    else:
        print(df.to_string(index=False))

In [8]:
generate_csv(papers, filename="output.csv")