In [1]:
import numpy as np
import pandas as pd
from Bio import Entrez

# TLDR
Build PubMed queries (which can be turned into RSS feeds) from a Google Sheet that describes the search parameters.

### Type 1: People search – return articles from people based on name/descriptors
### Type 2: Keyword search – return articles from keywords + journals

Example Google Sheet: https://docs.google.com/spreadsheets/d/108U4eq7zkwbqaygDMlCKe1cp0q0BlJ6Z0beI7Q9Egzs/edit#gid=0

# Rationale and Strategy

Typical approaches for following the literature aren't that good at differentiating the signal from the noise.

Examples of signal:

- Papers in and around the primary field (from any tier of journal if the paper is sufficiently close, but it needs to be more relevant as the journal's impact factor (IF) decreases)
- Exciting papers of broad relevance to the field (e.g. cell engineering / cell therapies writ large)
- Papers by key people, such as personal connections and famous scientists

Examples of noise:

- Intractable numbers of papers to screen
- Papers in predatory journals or from obscure people/places
- Non research/review articles (e.g. news, notes, etc.)

## Pubmed search helpful tips

* use `hasabstract` to filter out news, notes, errata, etc.
* use `NOT review[pt]` to filter out reviews; see [HERE](https://pubmed.ncbi.nlm.nih.gov/help/#publication-types)
* use `last X years[dp]` to filter by recency; see [HERE](https://pubmed.ncbi.nlm.nih.gov/help/#filter-strategy-pubdate)

Also –

* Wildcards can be used, like `Nat Rev*[Journal]` for all Nat Rev family journals
* The `[1au]` and `[lastau]` tags only work with truncated names such as `Bhargava HK`, not with full names



# HELPER FUNCTIONS

In [2]:
def pmids_for_query(query):
    """
    Return the PMIDs resulting from a PubMed query.

    Parameters
    ----------
    query : str
        PubMed search term, passed unchanged to ``Entrez.esearch``.

    Returns
    -------
    list
        The ``IdList`` entry of the parsed search result. At most 10000 IDs
        are requested (``retmax=10000``).
    """
    # NOTE(review): Entrez.email is not set anywhere in the visible part of this
    # notebook - confirm it is configured before running against NCBI.

    # Search pubmed for the query, returning the PMIDs of all results (up to 1e4 results)
    handle = Entrez.esearch(db='pubmed', retmax=10000, retmode='xml', term=query)
    try:
        search_results = Entrez.read(handle)
    finally:
        # Release the underlying HTTP response even if parsing fails.
        handle.close()

    return search_results['IdList']

# Column order of the frame returned by pubmed_articles_for_query.
_ARTICLE_COLUMNS = ['PMID', 'Journal', 'Title', 'Abstract', 'Year', 'doi', 'Types']


def _first_available_year(citation):
    """Return the publication year as an int from the first location that has one, else None."""
    lookups = (
        lambda c: c["Article"]["Journal"]["JournalIssue"]["PubDate"]["Year"],
        lambda c: c["Article"]["ArticleDate"][0]["Year"],
        lambda c: c["DateRevised"]["Year"],
    )
    for lookup in lookups:
        try:
            return int(lookup(citation))
        except (KeyError, IndexError, TypeError, ValueError):
            continue
    return None


def _article_record(article):
    """Flatten one parsed PubmedArticle entry into a dict keyed by _ARTICLE_COLUMNS."""
    # MedlineCitation contains all the data of interest except the article IDs.
    citation = article['MedlineCitation']
    record = {}

    # Read the PMID from the record itself rather than by position in the search
    # result, so a row stays correctly labelled even if the fetched article list
    # differs in length or order from the list of searched IDs.
    try:
        record['PMID'] = str(citation['PMID'])
    except KeyError:
        record['PMID'] = None

    record['Journal'] = citation['Article']['Journal']['Title']
    record['Title'] = citation['Article']['ArticleTitle']

    # Only the first AbstractText element is kept (as before).
    try:
        record['Abstract'] = citation['Article']['Abstract']['AbstractText'][0]
    except (KeyError, IndexError, TypeError):
        record['Abstract'] = None

    record['Year'] = _first_available_year(citation)

    # Get the DOI (the last entry tagged as a doi wins, as before).
    doi = None
    for entry in article['PubmedData']['ArticleIdList']:
        if getattr(entry, 'attributes', {}).get("IdType") == "doi":
            doi = str(entry)
    record['doi'] = doi

    # Get the article types, dropping the "Research Support ..." entries.
    try:
        type_list = citation['Article']['PublicationTypeList']
        record['Types'] = [str(t) for t in type_list if "Research Support" not in str(t)]
    except (KeyError, TypeError):
        record['Types'] = None

    return record


def pubmed_articles_for_query(query):
    """
    Return a dataframe of articles resulting from a pubmed query.

    Parameters
    ----------
    query : str
        PubMed search term; the matching PMIDs come from ``pmids_for_query``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched ``PubmedArticle`` with the columns
        PMID, Journal, Title, Abstract, Year, doi, Types.
        ``Year`` uses the nullable ``Int64`` dtype so that articles without a
        usable year are kept (as ``<NA>``) instead of making the conversion fail.
        A query with no hits returns an empty frame with the same columns.
    """
    pmids = pmids_for_query(query)

    records = []
    if len(pmids) > 0:
        # Get the articles from the PMIDs
        handle = Entrez.efetch(db='pubmed', retmode='xml', id=pmids)
        try:
            fetched = Entrez.read(handle)
        finally:
            handle.close()
        records = [_article_record(article) for article in fetched['PubmedArticle']]

    df = pd.DataFrame(records, columns=_ARTICLE_COLUMNS)
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
    return df

def avg_articles_per_year_last5(query, years=None):
    """
    Return the average number of articles per year for `query`.

    Parameters
    ----------
    query : str
        PubMed search term. A publication-date filter is appended to it.
    years : iterable of int, optional
        Calendar years to average over. Defaults to the five most recent
        complete calendar years, so the result does not go stale the way a
        fixed 2017-2021 list does once those years leave the date filter.

    Returns
    -------
    float
        Mean of the per-year article counts, or ``np.nan`` if the lookup fails.
        The value is also printed (as is the failing query and the error).

    Raises
    ------
    ValueError
        If `years` is given but empty.
    """
    from datetime import date  # local import: only this function needs it

    this_year = date.today().year
    if years is None:
        years = range(this_year - 5, this_year)
    years = list(years)
    if not years:
        raise ValueError('years must contain at least one year')

    # Only fetch as far back as needed: 6 years covers the default window
    # (five complete years plus the current partial one); widen it when the
    # caller asks for older years.
    window = max(6, this_year - min(years) + 1)
    query += (" AND \"last %d years\"[dp]" % window)

    try:
        data = pubmed_articles_for_query(query)
        counts = [int((data['Year'] == year).sum()) for year in years]
        average = np.mean(counts)
        print(average)
        return average
    except Exception as e:
        # Best effort: report the failing query and keep going with NaN.
        print('failed on: %s'%query)
        print(e)
        return np.nan
    
def truncated_name(name):
    """
    Convert a string name of the form "Firstname Minitial Lastname" or "Firstname Lastname"
    to "Lastname FinitialMinitial".

    e.g. Hersh K Bhargava -> Bhargava HK; Hersh Bhargava -> Bhargava H

    Parameters
    ----------
    name : str
        Full name made of two or three whitespace-separated parts.

    Returns
    -------
    str or None
        The truncated name, or None (after printing a message) when the name
        does not have exactly two or three parts.
    """
    # split() with no argument ignores leading/trailing whitespace and treats
    # any run of whitespace as one separator, so "Hersh  Bhargava" (double
    # space) no longer produces an empty middle part.
    parts = name.split()

    if len(parts) == 3:
        first, middle, last = parts
        return '%s %s%s' % (last, first[0], middle[0])
    if len(parts) == 2:
        first, last = parts
        return '%s %s' % (last, first[0])

    print('invalid result from split, suspect invalid name. INptu was : %s'%name)
    return None

def author_query(fullname, altnames=None, affiliations=None, author_position='any', orcid=None):
    """
    Generate a pubmed query string to match:

        ((ANY name+author_position) AND ((ANY affiliation) OR orcid))

    Very annoyingly, Pubmed doesn't support [1au] or [lastau] tags with full author names
    So would need to do:

        (Hersh K Bhargava[FAU]) AND (Bhargava HK[1au])

    Also can't search by ORCID position

    Parameters
    ----------
    fullname : str
        Primary search term for the name of the author. Must be of the form 'Hersh K Bhargava' or 'Hersh Bhargava'
    altnames : list[str], optional
        Alternative names (each matched with OR). They are searched with the
        same [FAU] tag and, for 'first'/'last', run through `truncated_name`
        exactly like `fullname`.
    affiliations : list[str], optional
        List of affiliations, will be matched for at least one. An empty list
        is treated like None.
    author_position : str
        Identifier for the author position. Can be 'any', 'first', or 'last'
        (case and surrounding whitespace are ignored).
    orcid : str, optional
        ORCID identifier for the author

    Returns
    -------
    str
        The parenthesised query string.

    Raises
    ------
    ValueError
        If `author_position` is not one of 'any', 'first', 'last'.
    """
    position_tags = {'any': '[au]', 'first': '[1au]', 'last': '[lastau]'}
    position_key = str(author_position).strip().lower()
    if position_key not in position_tags:
        raise ValueError(
            "author_position must be 'any', 'first' or 'last', got %r" % (author_position,)
        )
    poshandle = position_tags[position_key]

    # Assemble author names
    names = [fullname] + list(altnames or [])

    # First build one clause per name
    name_clauses = []
    for _name in names:
        clause = '%s[FAU]' % _name

        # If searching by author position, must use truncated query format (Bhargava HK)
        if poshandle != '[au]':
            trunc_name = truncated_name(_name)
            # truncated_name returns None (and prints a message) for names it
            # cannot parse; leave the position constraint off for that name
            # rather than emitting a literal "None[lastau]" term.
            if trunc_name is not None:
                clause += ' AND %s%s' % (trunc_name, poshandle)

        name_clauses.append('(%s)' % clause)

    name_query = ' OR '.join(name_clauses)

    # Add affiliation tags and ORCID if present
    identity_terms = ['%s[affil]' % affil for affil in (affiliations or [])]
    if identity_terms:
        if orcid is not None:
            identity_terms.append('%s[auid]' % orcid)
        identity_query = '(%s)' % ' OR '.join(identity_terms)
    elif orcid is not None:
        # No affiliations, just ORCID
        identity_query = '%s[auid]' % orcid
    else:
        identity_query = None

    if identity_query is None:
        return '(%s)' % name_query

    # Group several names explicitly so the AND applies to the whole OR-group
    # instead of relying on PubMed's left-to-right operator evaluation.
    if len(name_clauses) > 1:
        name_query = '(%s)' % name_query

    return '(%s AND %s)' % (name_query, identity_query)

def _split_list_cell(cell):
    """Split a comma-separated sheet cell into stripped, non-empty items (None if there are none)."""
    items = [item.strip() for item in cell.split(',')]
    items = [item for item in items if item]
    return items or None


def author_query_from_row(row):
    """
    Build the author query for one row of the people sheet.

    Parameters
    ----------
    row : mapping
        Must provide the string fields 'Name', 'Position', 'ORCID',
        'Affiliations' and 'Alt Names'. An empty string means "not set".
        'Affiliations' and 'Alt Names' are comma-separated lists.

    Returns
    -------
    str
        The result of `author_query` for this row. A blank 'Position' falls
        back to 'last'.
    """
    # Blank cells are empty strings; stripping first also treats
    # whitespace-only cells as blank.
    author_position = row['Position'].strip() or 'last'
    orcid = row['ORCID'].strip() or None

    # Empty items (e.g. from a trailing comma) are dropped so they cannot turn
    # into empty "[affil]" / "[FAU]" search terms.
    affiliations = _split_list_cell(row['Affiliations'])
    altnames = _split_list_cell(row['Alt Names'])

    return author_query(
        fullname=row['Name'].strip(),
        altnames=altnames,
        affiliations=affiliations,
        author_position=author_position,
        orcid=orcid,
    )

# Read Data from Google Sheet
Google sheet here: https://docs.google.com/spreadsheets/d/108U4eq7zkwbqaygDMlCKe1cp0q0BlJ6Z0beI7Q9Egzs/edit#gid=0

In [5]:
sheet_id = '108U4eq7zkwbqaygDMlCKe1cp0q0BlJ6Z0beI7Q9Egzs'


def read_sheet_tab(sheet_name, sheet_id=sheet_id):
    """
    Download one tab of the Google Sheet as a DataFrame.

    Parameters
    ----------
    sheet_name : str
        Name of the tab, inserted as-is into the CSV export URL.
    sheet_id : str, optional
        Spreadsheet ID; defaults to the notebook's sheet.

    Returns
    -------
    pandas.DataFrame
        The tab's contents. Blank cells are read as empty strings
        (``keep_default_na=False``), which is what the row helpers test for.
    """
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    return pd.read_csv(url, keep_default_na=False)


people_df = read_sheet_tab('people')

# Fail early, with a readable message, if the tab does not have the columns
# that author_query_from_row reads.
_missing_people_columns = {'Name', 'Position', 'ORCID', 'Affiliations', 'Alt Names'} - set(people_df.columns)
if _missing_people_columns:
    raise ValueError("'people' sheet is missing columns: %s" % sorted(_missing_people_columns))

keywords = read_sheet_tab('cell-engineering-keywords')['Keyword'].to_list()
journals = read_sheet_tab('journal-whitelist')['Journal'].to_list()

# 1. Build the queries

## 1A. People

For each person: `((ANY name+author_position) AND ((ANY affiliation) OR orcid))`

In [7]:
people_df['query'] = people_df.apply(lambda row: author_query_from_row(row), axis=1)

people_query = ''
for i, row in people_df.iterrows():
    people_query += row['query'] 
    if i < len(people_df)-1:
        people_query += ' OR '

# Copy to clipboard
print(people_query)
!echo "$people_query" | pbcopy

((Wendell A Lim[FAU])) OR ((Hana El-Samad[FAU])) OR ((Kole T Roybal[FAU])) OR ((Alexander Marson[FAU])) OR ((Marcela Maus[FAU])) OR ((Carl H June[FAU] AND June CH[lastau])) OR ((Crystal L Mackall[FAU])) OR ((Michel Sadelain[FAU])) OR ((Justin Eyquem[FAU])) OR ((Wilson W Wong[FAU] AND Wong WW[lastau])) OR ((Steven A Rosenberg[FAU] AND Rosenberg SA[lastau])) OR ((Rogelio A Hernandez-Lopez[FAU])) OR ((Kyle G Daniels[FAU])) OR ((Yvonne Y Chen[FAU] AND Chen YY[lastau])) OR ((Stanley R Riddell[FAU] AND Riddell SR[lastau])) OR ((Darrell J Irvine[FAU] AND Irvine DJ[lastau])) OR ((Gregoire Altan-Bonnet[FAU] AND Altan-Bonnet G[lastau])) OR ((Art Weiss[FAU] AND Weiss A[lastau])) OR ((Ronald D Vale[FAU] AND Vale RD[lastau])) OR ((Alfred Singer[FAU] AND Singer A[lastau])) OR ((Ira Mellman[FAU] AND Mellman I[lastau])) OR ((Matthew H Spitzer[FAU] AND Spitzer MH[lastau])) OR ((Hideho Okada[FAU] AND Okada H[lastau])) OR ((Matthew F Krummel[FAU] AND Krummel MF[lastau])) OR ((Jeffrey A Bluestone[FAU] AND

## 1B. Topics + Journals
`(ANY keyword) AND (ANY journal) AND hasabstract`

In [6]:
# (ANY keyword): every keyword as a quoted phrase, OR-ed together
kwquery = '(' + ' OR '.join('"%s"' % kw for kw in keywords) + ')'

# (ANY journal): every journal as a quoted name carrying the [journal] tag
jquery = '(' + ' OR '.join('"%s"[journal]' % jn for jn in journals) + ')'

# Both groups must match, and the record must have an abstract
query = ' AND '.join([kwquery, jquery, 'hasabstract'])

# Copy to clipboard
# !echo $query | pbcopy
print(query)

("chimeric antigen receptor" OR "car-t" OR "car t" OR "synnotch" OR "synthetic notch" OR "cell therapy" OR "cell engineering") AND ("Cell"[journal] OR "Science"[journal] OR "Nature*"[journal] OR "N Engl J Med"[journal] OR "Lancet*"[journal] OR "JAMA*"[journal] OR "Blood Adv"[journal] OR "Blood"[journal] OR "Proc Natl Acad Sci U S A"[journal] OR "J Clin Oncol"[journal] OR "PLoS Computational Biology"[journal] OR "PLoS Biology"[journal] OR "Science Advances"[journal] OR "Neuron"[journal] OR "ACS Synthetic Biology"[journal] OR "Cell Reports"[journal] OR "Cell Systems"[journal] OR "Science Signaling"[journal] OR "Science Immunology"[journal] OR "Front Immunol"[journal] OR "Science Translational Medicine"[journal] OR "J Clin Invest"[journal] OR "J Immunol"[journal] OR "Elife"[journal] OR "bioRxiv"[journal] OR "Cancer Research"[journal] OR "Clin Cancer Res"[journal] OR "Immunity"[journal] OR "J Theor Biol"[journal] OR "Cancer Cell"[journal] OR "Annu Rev*"[journal] OR "Blood Rev"[journal]) AN

# Analyze PEOPLE queries

### Check avg N Papers per year for each query (last 5 years)

In [None]:
# Look up the yearly average for each person's query. The rows come from
# people_df (the original referenced an undefined `df`, which only worked with
# leftover kernel state).
people_df['papers_per_year'] = people_df['query'].apply(avg_articles_per_year_last5)

In [None]:
# Show the people with the most papers per year first
people_df.sort_values('papers_per_year', ascending=False)

# Analyze Cell Therapies Query

## Check avg N results per year (last 5 years)

In [98]:
# `query` is the keyword + journal string assembled in section 1B; the function
# prints the average and also returns it, so the value shows up twice below.
avg_articles_per_year_last5(query)

327.6


327.6