In [1]:
from Bio import Entrez
import pandas as pd
from time import sleep
from datetime import datetime

In [2]:
def fetch_papers(query: str, max_results: int = 10) -> list:
    # Fetch papers from PubMed with query
    Entrez.email = "zanwarpratham@gmail.com"
    try:
        # Search PubMed
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        search_results = Entrez.read(handle)
        handle.close()

        # Fetch details for each paper
        id_list = search_results["IdList"]
        handle = Entrez.efetch(db="pubmed", id=",".join(id_list), retmode="xml")
        sleep(1)
        papers = Entrez.read(handle)
        handle.close()

        return papers["PubmedArticle"]
    except Exception as e:
        print(f"Error fetching papers: {e}")
        return []

In [9]:
# Adding query to get the filtered papers
query = "cancer"
papers = fetch_papers(query, max_results=10)
print(papers)

[{'MedlineCitation': DictElement({'KeywordList': [ListElement([StringElement('CpG oligodeoxynucleotide', attributes={'MajorTopicYN': 'N'}), StringElement('E6', attributes={'MajorTopicYN': 'N'}), StringElement('E7', attributes={'MajorTopicYN': 'N'}), StringElement('HPV DNA vaccine', attributes={'MajorTopicYN': 'N'}), StringElement('HSP70', attributes={'MajorTopicYN': 'N'}), StringElement('IL-28B gene adjuvant', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'OtherID': [], 'CitationSubset': ['IM'], 'InvestigatorList': [], 'SpaceFlightMission': [], 'OtherAbstract': [], 'GeneralNote': [], 'PMID': StringElement('39780219', attributes={'Version': '1'}), 'DateRevised': {'Year': '2025', 'Month': '01', 'Day': '09'}, 'Article': DictElement({'ArticleDate': [DictElement({'Year': '2025', 'Month': '01', 'Day': '08'}, attributes={'DateType': 'Electronic'})], 'ELocationID': [StringElement('10.1186/s12985-024-02604-7', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})], 'Language': 

In [10]:
def filter_non_academic_authors(paper: dict) -> dict:
    non_academic_authors = []
    company_affiliations = []

    # Extract authors and affiliations
    authors = paper.get("MedlineCitation", {}).get("Article", {}).get("AuthorList", [])
    for author in authors:
        affiliations = author.get("AffiliationInfo", [])
        for aff in affiliations:
            affiliation = aff.get("Affiliation", "").lower()
            if "pharma" in affiliation or "biotech" in affiliation:
                non_academic_authors.append(author.get("LastName", "") + " " + author.get("ForeName", ""))
                company_affiliations.append(affiliation)

    return {
        "non_academic_authors": non_academic_authors,
        "company_affiliations": company_affiliations,
    }

In [11]:
# Iterating through each paper
for paper in papers:
    filtered = filter_non_academic_authors(paper)
    print(filtered)

{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': ['Xia Yuqing'], 'company_affiliations': ['pharmaceutical sciences, massachusetts college of pharmacy and health science university, 179 longwood avenue, boston, massachusetts, 02115, usa.']}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}
{'non_academic_authors': [], 'company_affiliations': []}


In [12]:
def generate_csv(papers: list, filename: str = None) -> None:

    # Initialising data as list
    data = []

    # Iterating through each paper and  storing filtered paper data in list
    for paper in papers:
        filtered = filter_non_academic_authors(paper)
        if filtered["non_academic_authors"]:
            data.append({
                "PubmedID": paper.get("MedlineCitation", {}).get("PMID", ""),
                "Title": paper.get("MedlineCitation", {}).get("Article", {}).get("ArticleTitle", ""),
                "Publication Date": paper.get("MedlineCitation", {}).get("Article", {}).get("Journal", {}).get("JournalIssue", {}).get("PubDate", {}).get("Year", ""),
                "Non-academic Author(s)": ", ".join(filtered["non_academic_authors"]),
                "Company Affiliation(s)": ", ".join(filtered["company_affiliations"]),
                "Corresponding Author Email": ""
            })

    df = pd.DataFrame(data)
    if filename:
        df.to_csv(filename, index=False)
    else:
        print(df.to_string(index=False))

In [13]:
generate_csv(papers, filename=f"pubmed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")