In [1]:
%run notebook_setup.ipynb

## Define search terms

In [2]:
from search_terms import primary_terms, secondary_terms, descriptive_terms

In [3]:
primary_terms

{'multi-omics': '("multi-omic"[Text Words]) OR ("multiomic"[Text Words]) OR ("multi omic"[Text Words]) OR ("multi-omics"[Text Words]) OR ("multiomics"[Text Words]) OR ("multi omics"[Text Words])',
 'pan-omics': '("pan-omic"[Text Words]) OR ("panomic"[Text Words]) OR ("pan omic"[Text Words]) OR ("pan-omics"[Text Words]) OR ("panomics"[Text Words]) OR ("pan omics"[Text Words])',
 'trans-omics': '("trans-omic"[Text Words]) OR ("transomic"[Text Words]) OR ("trans omic"[Text Words]) OR ("trans-omics"[Text Words]) OR ("transomics"[Text Words]) OR ("trans omics"[Text Words])',
 'poly-omics': '("poly-omic"[Text Words]) OR ("polyomic"[Text Words]) OR ("poly omic"[Text Words]) OR ("poly-omics"[Text Words]) OR ("polyomics"[Text Words]) OR ("poly omics"[Text Words])'}

In [4]:
secondary_terms

{'multi-table omics': '(("multi-table"[Text Words]) AND (omic[Text Words])) OR (("multi-table"[Text Words]) AND (omics[Text Words])) OR (("multitable"[Text Words]) AND (omic[Text Words])) OR (("multitable"[Text Words]) AND (omics[Text Words])) OR (("multi table"[Text Words]) AND (omic[Text Words])) OR (("multi table"[Text Words]) AND (omics[Text Words])) OR (("multi-tables"[Text Words]) AND (omic[Text Words])) OR (("multi-tables"[Text Words]) AND (omics[Text Words])) OR (("multitables"[Text Words]) AND (omic[Text Words])) OR (("multitables"[Text Words]) AND (omics[Text Words])) OR (("multi tables"[Text Words]) AND (omic[Text Words])) OR (("multi tables"[Text Words]) AND (omics[Text Words]))',
 'multi-source omics': '(("multi-source"[Text Words]) AND (omic[Text Words])) OR (("multi-source"[Text Words]) AND (omics[Text Words])) OR (("multisource"[Text Words]) AND (omic[Text Words])) OR (("multisource"[Text Words]) AND (omics[Text Words])) OR (("multi source"[Text Words]) AND (omic[Text W

In [5]:
descriptive_terms

{'integrative omics': '"integrative omic"[Text Words] OR "integrative omics"[Text Words]',
 'integrated omics': '"integrated omic"[Text Words] OR "integrated omics"[Text Words]'}

## Perform search in PubMed

In [6]:
from easy_entrez import EntrezAPI
from config import ENTREZ_API_NAME, ENTREZ_API_EMAIL

entrez_api = EntrezAPI(
    tool=ENTREZ_API_NAME,
    email=ENTREZ_API_EMAIL,
    minimal_interval=2
)

In [7]:
from tqdm import tqdm

search_terms = {
    **primary_terms,
    **secondary_terms,
    **descriptive_terms
}

In [8]:
%%cache search_results pubmed_results

pubmed_results = {}

MAX_RESULTS = 10_000

for term in tqdm(search_terms):
    result = entrez_api.search(
        search_terms[term],
        database='pubmed',
        max_results=MAX_RESULTS
    )
    esearch = result.data['esearchresult']
    count = int(esearch['count'])
    assert count >= 0
    assert count < MAX_RESULTS

    pubmed_results[term] = result

Reusing the results from cache/search_results.pickle


In [9]:
all_papers = sorted(set(sum(
    [
        result.data['esearchresult']['idlist']
        for result in pubmed_results.values()
    ],
    []
)))

In [10]:
len(all_papers)

3206

In [11]:
%%cache pubmed_documents_data documents

documents_by_batch = (
    entrez_api
    .in_batches_of(size=100)
    .fetch(all_papers, max_results=10_000, return_type='xml')
)

documents = sum(
    (
        list(result.data)
        for result in documents_by_batch.values()
    ),
    []
)

Reusing the results from cache/pubmed_documents_data.pickle


In [12]:
from utils import xml_element_to_json
documents = [xml_element_to_json(document) for document in list(documents)]

In [13]:
assert len(documents) == len(all_papers)

## Create a data frame with PubMed documents and covariates

In [14]:
from pandas import Series, DataFrame, read_csv, to_datetime

In [15]:
# create a frame with 0 columns and UID of each paper on the index
literature = Series(all_papers).to_frame('uid').set_index('uid')
# add columns for the occurrences of the terms
for term, result in pubmed_results.items():
    literature[term] = False
    for uid in result.data['esearchresult']['idlist']:
        literature.loc[uid, term] = True
literature

Unnamed: 0_level_0,multi-omics,pan-omics,trans-omics,poly-omics,multi-table omics,multi-source omics,multi-view omics,multi-modal omics,multi-block omics,integrative omics,integrated omics
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15687700,False,False,True,False,False,False,False,False,False,False,False
15687839,False,False,False,False,False,False,False,False,False,True,False
15763567,True,False,False,False,False,False,False,False,False,False,False
16338138,False,True,False,False,False,False,False,False,False,False,False
16479594,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
32528899,True,False,False,False,False,False,False,False,False,False,False
32529434,True,False,False,False,False,False,False,False,False,False,False
32532869,True,False,False,False,False,False,False,False,False,False,False
32533167,True,False,False,False,False,False,False,False,False,False,False


## Parse the PubMed metadata on articles

Reference:
  - Medline: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
  - Publication types: https://www.nlm.nih.gov/mesh/pubtypes.html (fun fact: includes "Wit and Humor" type)

In [16]:
from warnings import warn
from parse_pubmed import listify, extract_abstract, parse_date, parse_doi

missing_abstract = []
authors = []
affiliations = []

publication_types = []

for document in documents:
    # TODO review or original

    kind = None
    date = None
    doi = None

    if 'PubmedBookArticle' in document:
        kind = 'article in book'
        book_document = document['PubmedBookArticle']['BookDocument']
        pmid = book_document['PMID']['#text']

        title = book_document['ArticleTitle']['#text']
        abstract = extract_abstract(book_document)

        #book_document['PublicationType']
        #book_document['KeywordList']

    if 'PubmedArticle' in document:
        pubmed_article = document['PubmedArticle']
        assert not kind
        kind = 'article'
        medline_citation = pubmed_article['MedlineCitation']
        pmid = medline_citation['PMID']['#text']
        article = medline_citation['Article']
        literature.loc[pmid, 'journal'] = article['Journal']['Title']

        if 'ELocationID' in article:
            doi = parse_doi(article['ELocationID'])

        issue = article['Journal']['JournalIssue']
        if 'PubDate' in issue:
            date = parse_date(issue['PubDate'])

        for author in listify(article['AuthorList']['Author'] if 'AuthorList' in article else None):
            author_id = len(authors)
            authors.append(
                {
                    'ID': author_id,
                    'ForeName': author.get('ForeName'),
                    'LastName': author.get('LastName'),
                    'CollectiveName': author.get('CollectiveName'),
                    'PMID': pmid
                }
            )
            for affiliation in listify(author.get('AffiliationInfo')):
                affiliations.append({
                    'Affiliation': affiliation['Affiliation'],
                    'PMID': pmid,
                    'AuthorID': author_id
                })

        for publication_type in listify(article['PublicationTypeList']['PublicationType'] if 'PublicationTypeList' in article else None):
            type_name = publication_type['#text']
            publication_types.append(type_name)
            literature.loc[pmid, f'Is {type_name}'] = True

        try:
            literature.loc[pmid, 'journal_issn'] = article['Journal']['ISSN']['#text']
        except KeyError:
            warn(f'{article["Journal"]} had no ISSN assigned')
        if 'ArticleTitle' in article:
            title = article['ArticleTitle']
            if isinstance(title, dict):
                title = title['#text']

        abstract = extract_abstract(article)

    if not abstract:
        missing_abstract.append(pmid)

    assert kind

    literature.loc[pmid, 'kind'] = kind
    literature.loc[pmid, 'doi'] = doi
    literature.loc[pmid, 'title'] = title
    literature.loc[pmid, 'abstract'] = abstract
    literature.loc[pmid, 'date'] = date

publication_types = Series(publication_types)

  warn(f'{article["Journal"]} had no ISSN assigned')
  warn(f'{article["Journal"]} had no ISSN assigned')
  warn(f'{article["Journal"]} had no ISSN assigned')


In [17]:
affiliations = DataFrame(affiliations)
authors = DataFrame(authors)

authors['JointName'] = authors['ForeName'] + ' ' + authors['LastName']

In [18]:
literature['has_doi'] = ~literature.doi.isnull()
literature.date = to_datetime(literature.date)
literature['year'] = literature.date.dt.year

In [19]:
terms = list(pubmed_results.keys())

In [20]:
def which_term(term):
    term = list(term[term].index)
    if len(term) == 1:
        return term[0]
    else:
        return 'multiple'

In [21]:
literature['term'] = literature[terms].apply(which_term, axis=1)

In [22]:
from pandas import Categorical
literature['term'] = Categorical(literature['term'], ordered=True, categories=list(literature['term'].value_counts().index))

## Add PubmedCentral mapping

In [23]:
%%cache pubmed_central_metadata pmc_metadata
# approx 2GB in RAM, best to subset early
pmc_metadata_all = read_csv('PMC-ids.csv.gz')
pmid_of_interest = set(literature.index)
pmc_metadata = pmc_metadata_all[pmc_metadata_all.PMID.isin(pmid_of_interest)]
del pmc_metadata_all

Reusing the results from cache/pubmed_central_metadata.pickle


In [24]:
len(pmc_metadata)

1921

In [25]:
pmc_metadata.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date
817169,J Virol,0022-538X,1098-5514,2006,80,9,4356,10.1128/JVI.80.9.4356-4362.2006,PMC1472023,16611894.0,,live
1155415,Proc Natl Acad Sci U S A,0027-8424,1091-6490,2007,104,15,6478,10.1073/pnas.0611629104,PMC1849962,17420480.0,,live
1212422,J Bacteriol,0021-9193,1098-5530,2007,189,13,4635,10.1128/JB.00128-07,PMC1913438,17449607.0,,live
1430120,Osteoarthritis Cartilage,1063-4584,1522-9653,2007,15,12,1367,10.1016/j.joca.2007.04.011,PMC2153443,17604656.0,NIHMS34878,live
1532139,Br J Pharmacol,0007-1188,1476-5381,2008,153,Suppl 1,S133,10.1038/sj.bjp.0707658,PMC2268044,18193070.0,,live


In [26]:
literature['PMC'] = pmc_metadata.set_index('PMID').reindex(literature.index.astype(float))['PMCID']
assert len(pmc_metadata) == sum(~literature['PMC'].isnull())

literature['has_pmc'] = (~literature['PMC'].isnull())

Note can also try to find missing PMCs in the summaries:

In [27]:
# result = entrez_api.search(primary_terms['poly-omics'], max_results=10_000)
# summary = entrez_api.summarize(result.data['esearchresult']['idlist'][:5], max_results=10_000)
# summary.data

### Download full texts as XML

In [28]:
pmc_ids = literature[literature['has_pmc']]['PMC'].tolist()
pmc_ids[:4]

['PMC1472023', 'PMC1849962', 'PMC1913438', 'PMC2153443']

In [29]:
%%cache pubmed_central_xml pmc_xmls
pmc_full_texts = entrez_api.in_batches_of(size=100).fetch(pmc_ids, max_results=5_000, database='pmc', return_type='xml')

pmc_xmls = sum(
    [
        list(response.data)
        for response in pmc_full_texts.values()
    ],
    []
)

Reusing the results from cache/pubmed_central_xml.pickle


In [30]:
len(pmc_xmls)

1921

In [31]:
ignore_text = {'xref', 'table', 'thead', 'th', 'td', 'tr', 'graphic'}


def extract_text(body) -> str:
    fragments = []
    for i in body.iter():
        if i.tag in ignore_text:
            continue
        text = i.text
        if i.tag == 'label' and text and text.startswith('Figure'):
            continue
        if text:
            fragments.append(text)
    return '\n'.join(fragments)

In [32]:
for xml in pmc_xmls:
    pmid = xml.find('front/article-meta/article-id[@pub-id-type="pmid"]').text
    body = xml.find('body')
    has_full_text = body is not None
    literature.loc[pmid, 'has_full_text'] = has_full_text
    literature.loc[pmid, 'full_text'] = extract_text(body) if has_full_text else None

In [33]:
sum(literature['has_full_text'] == True)

1493

In [34]:
# from utils import display_xml
# display_xml(pmc_xmls[-2].find('body'))

## Code archives and repositories extraction

Limitations:
 - does not count gitlab in custom domains

In [35]:
from re import escape


def collapse_lists(lists):
    return sorted(set(sum(lists, [])))


fields = ['abstract', 'full_text']
platforms = {
    'github': r'(github\.com/\S+/\S+)',
    'gitlab': r'(gitlab\.com/\S+/\S+)',
    'sourceforge': r'(sourceforge\.net/\S+)',
    'bitbucket': r'(bitbucket\.org/\S+)',
    r'.git': r'(\S+:\S+\.git\S*)',
}

for platform in platforms:
    for field in fields:
        literature[f'{field}_mentions_{platform}'] = literature[field].str.lower().str.contains(escape(platform)) == True
        matches = literature[field].astype(str).str.findall(platforms[platform])
        literature[f'{field}_{platform}_matches'] = matches
    literature[f'mentions_{platform}'] = literature[[f'{field}_mentions_{platform}' for field in fields]].any(axis=1)
    literature[f'{platform}_matches'] = literature[[f'{field}_{platform}_matches' for field in fields]].apply(collapse_lists, axis=1)

TODO extract "ext-link", as in:

In [36]:
literature[[f'abstract_mentions_{platform}' for platform in platforms]].sum()

abstract_mentions_github         50
abstract_mentions_gitlab          2
abstract_mentions_sourceforge     4
abstract_mentions_bitbucket       3
abstract_mentions_.git            2
dtype: int64

In [37]:
literature[[f'full_text_mentions_{platform}' for platform in platforms]].sum()

full_text_mentions_github         205
full_text_mentions_gitlab           5
full_text_mentions_sourceforge     44
full_text_mentions_bitbucket       15
full_text_mentions_.git            54
dtype: int64

The sourceforge mentions might be uses of tools.

In [38]:
literature[[f'mentions_{platform}' for platform in platforms]].sum()

mentions_github         241
mentions_gitlab           5
mentions_sourceforge     46
mentions_bitbucket       17
mentions_.git            56
dtype: int64

In [39]:
literature[[f'{platform}_matches' for platform in platforms]].sum()

github_matches         [github.com/Magdoll//ECE, github.com/wizardfan...
gitlab_matches         [gitlab.com/Gustafsson-lab/lassim, gitlab.com/...
sourceforge_matches    [sourceforge.net/., sourceforge.net/projects/a...
bitbucket_matches      [bitbucket.org/hbc/galaxy-central-hbc, bitbuck...
.git_matches           [http://networkx.github.io/, http://broadinsti...
dtype: object

In [40]:
Series(literature['.git_matches'].sum()).value_counts()

http://broadinstitute.github.io/picard/                                                                                              11
http://broadinstitute.github.io/picard                                                                                                4
http://cole-trapnell-lab.github.io/cufflinks/                                                                                         3
https://broadinstitute.github.io/picard/                                                                                              3
http://zwdzwd.github.io/InfiniumAnnotation                                                                                            2
https://trinotate.github.io                                                                                                           2
https://rrshieldscutler.github.io/splinectomeR/                                                                                       1
https://gist.github.com/yannabraham/c1f9de9b23fb

### Abstrace cleaning

Many abstracts contains sections/organising headers, such as:

In [41]:
['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']

['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']

By convention those are upper case in PubMed. Here We filter those out:

In [42]:
from re import findall


def extract_upper_case(abstract: str, min_len: int = 3):
    if abstract:
        return findall('([A-Z]{' + str(min_len) + ',})', abstract)
    return []


def count_upper_case_phrases(data: Series, min_len: int = 3) -> Series:
    return Series(sum(data.apply(extract_upper_case, min_len=min_len), [])).value_counts()

In [43]:
potential_headers = count_upper_case_phrases(literature['abstract'])
potential_headers[potential_headers > 100]

RNA            2173
DNA             907
RESULTS         428
BACKGROUND      321
CONCLUSIONS     268
METHODS         228
HCC             189
TCGA            189
SNP             154
GWAS            153
CONCLUSION      136
QTL             128
CRC             125
IBD             114
GBM             109
dtype: int64

There are many disease abbreviations making the list too long to browse:

In [44]:
len(potential_headers[potential_headers > 3])

1096

So we will look at longer words:

In [45]:
potential_headers[potential_headers > 3].index.map(len).value_counts()

3     663
4     273
5      79
6      31
7      16
12      9
10      7
8       7
9       6
14      2
11      2
13      1
dtype: int64

In [46]:
potential_headers_long = count_upper_case_phrases(literature['abstract'], min_len=5)
potential_headers_long.head(20)

RESULTS           428
BACKGROUND        321
CONCLUSIONS       268
METHODS           228
CONCLUSION        136
NAFLD              63
PURPOSE            56
OBJECTIVE          50
NSCLC              47
HNSCC              46
AVAILABILITY       44
CRISPR             42
MOTIVATION         40
INFORMATION        37
OMICS              37
IMPLEMENTATION     36
FINDINGS           36
SUPPLEMENTARY      36
LASSO              32
MALDI              32
dtype: int64

I manually chosen headers from among top 100 hits:

In [47]:
ABSTRACT_HEADERS = [
    # manually added to prevent hanging "OF"
    'PURPOSE OF REVIEW',
    # chosen from top 100 most frequent
    'RESULTS',
    'BACKGROUND',
    'CONCLUSIONS',
    'METHODS',
    'CONCLUSION',
    'PURPOSE',
    'OBJECTIVE',
    'AVAILABILITY',
    'MOTIVATION',
    'INFORMATION',
    'SUPPLEMENTARY',
    'FINDINGS',
    'SIGNIFICANCE',
    'INTRODUCTION',
    'DESIGN',
    'OBJECTIVES',
    'REVIEW',
    'SUMMARY',
    'MATERIALS',
    'STUDY',
    'EXPERIMENTAL',
    'DISCUSSION',
    'REGISTRATION',
    'METHOD',
    'CONTACT',
    'FUTURE',
    'INTERPRETATION',
]

In [48]:
literature['abstract_clean'] = literature['abstract'].str.replace('|'.join(ABSTRACT_HEADERS), '')

In [50]:
%vault store literature in pubmed_derived_data

Stored `literature` (None → 6B6823C9) at Monday, 15. Jun 2020 01:28