<a href="https://colab.research.google.com/github/jansoe/covid-chestxray-dataset/blob/master/PRISMAPubMed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PubMed Scraping

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np

import collections
import time

#### Google authorization

In [4]:
# Authenticate the current user through Colab's auth helper.
# NOTE(review): presumably required so that the later
# GoogleCredentials.get_application_default() calls succeed - confirm.
from google.colab import auth
auth.authenticate_user()

In [5]:
import gspread
from oauth2client.client import GoogleCredentials

#### PubMed scraping functions

In [6]:
!pip install biopython

from Bio import Entrez

def search(query, retmax=500):
    """Run a PubMed ``esearch`` for ``query`` and return the parsed result.

    Parameters
    ----------
    query : str
        PubMed search term, e.g. ``'covid AND x-ray AND dataset'``.
    retmax : int, optional
        Maximum number of ids requested from PubMed (default 500).

    Returns
    -------
    The record produced by ``Entrez.read``; the matching ids are read
    from its ``'IdList'`` entry by the calling cell.
    """
    Entrez.email = 'jan.soelterl@uni.lu'
    handle = Entrez.esearch(
        db='pubmed',
        sort='relevance',
        retmax=str(retmax),
        retmode='xml',
        term=query
    )
    try:
        return Entrez.read(handle)
    finally:
        # Always release the underlying connection, even if parsing fails.
        handle.close()

def fetch_details(id_list):
    """Fetch the PubMed records for the given ids and return the parsed result.

    Parameters
    ----------
    id_list : iterable
        PubMed ids (strings or integers); they are joined with commas
        into a single ``efetch`` request.

    Returns
    -------
    The record produced by ``Entrez.read``; the calling cell iterates
    over its ``'PubmedArticle'`` entry.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    Entrez.email = 'jan.soelterl@uni.lu'
    handle = Entrez.efetch(
        db='pubmed',
        retmode='xml',
        id=ids
    )
    try:
        return Entrez.read(handle)
    finally:
        # Always release the underlying connection, even if parsing fails.
        handle.close()

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/3a/cd/0098eaff841850c01da928c7f509b72fd3e1f51d77b772e24de9e2312471/biopython-1.78-cp37-cp37m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.8MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.78


## Identification

In [7]:
# Search phrases; each one is sent to PubMed as a separate query.
terms = [
    'covid AND x-ray AND dataset',
    'covid AND x-ray AND data set',
    'covid AND x-ray AND machine learning',
    'covid AND x-ray AND deep learning',
    'covid-19 AND x-ray AND dataset',
    'covid-19 AND x-ray AND data set',
    'covid-19 AND x-ray AND machine learning',
    'covid-19 AND x-ray AND deep learning',
]

# known_ids: every id seen so far.
# id_list:   term -> ids that this term contributed first.
known_ids = set()
id_list = {}
for term in terms:
    results = search(term)
    obtained_ids = set(results['IdList'])
    new_ids = obtained_ids - known_ids
    print(f'{len(new_ids)} out of {len(obtained_ids)} newly added by {term}')
    if new_ids:
        id_list[term] = new_ids
    known_ids = known_ids | new_ids

189 out of 189 newly added by covid AND x-ray AND dataset
20 out of 41 newly added by covid AND x-ray AND data set
170 out of 275 newly added by covid AND x-ray AND machine learning
85 out of 360 newly added by covid AND x-ray AND deep learning
8 out of 196 newly added by covid-19 AND x-ray AND dataset
1 out of 42 newly added by covid-19 AND x-ray AND data set
3 out of 280 newly added by covid-19 AND x-ray AND machine learning
3 out of 366 newly added by covid-19 AND x-ray AND deep learning


#### Paper details

In [8]:
# PubMed month abbreviations in calendar order (list index + 1 == month number).
MONTH_ABBREVIATIONS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# One dict per fetched paper: term, doi (if any), title, status, PMID and date.
info = []

for term, ids in id_list.items():
    papers = fetch_details(ids)

    for paper in papers['PubmedArticle']:
        article = paper['MedlineCitation']['Article']
        data = {'term': term}

        # The DOI is stored among the electronic location ids; it may be absent.
        for location_id in article['ELocationID']:
            if location_id.attributes['EIdType'] == 'doi':
                data['doi'] = str(location_id)
        data['title'] = article['ArticleTitle']
        data['status'] = paper['PubmedData']['PublicationStatus']
        data['PMID'] = [
            int(article_id)
            for article_id in paper['PubmedData']['ArticleIdList']
            if article_id.attributes['IdType'] == 'pubmed'
        ][0]

        # Prefer the article date; at most one is expected per paper.
        date = article['ArticleDate']
        assert len(date) < 2

        if len(date) == 0:
            # Fall back to the journal issue date when no article date exists.
            pub_date = article['Journal']['JournalIssue']['PubDate']
            year = int(pub_date['Year'])
            # A month of 0 marks "month unknown".
            month = pub_date.get('Month', 0)
            if isinstance(month, str):
                if month.isnumeric():
                    month = int(month)
                else:
                    month = MONTH_ABBREVIATIONS.index(month) + 1
            date_type = 'Journal'
        else:
            year = int(date[0]['Year'])
            month = int(date[0]['Month'])
            date_type = date[0].attributes['DateType']
        data['year'] = year
        data['month'] = month
        data['datetype'] = date_type

        info.append(data)

#### Set time interval

In [9]:
# Sort chronologically; a month of 0 marks journal dates without a month.
identified = pd.DataFrame(info).sort_values(by=['year', 'month'])

# Keep only papers dated January 2020 through March 2021 (inclusive).
# Later years are excluded as well, so re-running the notebook does not
# silently widen the review interval.
in_2020 = identified.year == 2020
early_2021 = (identified.year == 2021) & (identified.month < 4)
# .copy() so the column assignments below act on an independent frame.
identified = identified[in_2020 | early_2021].copy()

# Fill missing DOIs first so that no 'https://doi.org/nan' links are built.
identified['doi'] = identified.doi.fillna('')
identified['doi_link'] = identified.doi.apply(
    lambda doi: 'https://doi.org/' + str(doi) if doi else ''
)

In [10]:
# Show the three earliest and the three latest identified papers.
display(identified.head(3), identified.tail(3))

Unnamed: 0,term,doi,title,status,PMID,year,month,datetype,doi_link
31,covid AND x-ray AND dataset,10.3233/XST-200715,Identification of COVID-19 samples from chest ...,ppublish,32773400,2020,0,Journal,https://doi.org/10.3233/XST-200715
112,covid AND x-ray AND dataset,10.3233/XST-200720,Detection of coronavirus disease from X-ray im...,ppublish,32804113,2020,0,Journal,https://doi.org/10.3233/XST-200720
173,covid AND x-ray AND dataset,10.3233/XST-200689,Differentiating pneumonia with and without COV...,ppublish,32568167,2020,0,Journal,https://doi.org/10.3233/XST-200689


Unnamed: 0,term,doi,title,status,PMID,year,month,datetype,doi_link
435,covid AND x-ray AND deep learning,10.1016/j.compbiomed.2021.104356,An automated COVID-19 detection based on fused...,ppublish,33799219,2021,3,Electronic,https://doi.org/10.1016/j.compbiomed.2021.104356
442,covid AND x-ray AND deep learning,10.1016/j.asoc.2021.107330,Federated learning for COVID-19 screening from...,ppublish,33776607,2021,3,Electronic,https://doi.org/10.1016/j.asoc.2021.107330
459,covid AND x-ray AND deep learning,10.1038/s41598-021-85694-5,Generalized chest CT and lab curves throughout...,epublish,33767213,2021,3,Electronic,https://doi.org/10.1038/s41598-021-85694-5


## Screening step

#### Load manual annotation

In [29]:
# Open the 'PubmedPaper' tab of the 'DatasetScraping' Google sheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('DatasetScraping').worksheet('PubmedPaper')

# The first sheet row holds the column names, all following rows are records.
rows = worksheet.get_all_values()
header, records = rows[0], rows[1:]
# 'done' flags every row that is already present in the sheet.
manual_annotated = pd.DataFrame.from_records(records, columns=header).assign(done=True)

In [30]:
# Columns taken over from the manual annotation sheet.
annotation_columns = ['doi', 'month_manual', 'relevant', 'extracted datasets', 'done']

# Left join on the DOI only: titles apparently change sometimes after
# publication, so they are not used as a join key.
annotated = identified.merge(
    manual_annotated[annotation_columns],
    how='left',
    on='doi',
)

#### Check if all scraped data is already annotated

In [31]:
# Rows without a 'done' flag found no match in the annotation sheet.
non_annotated = annotated.loc[annotated['done'].isna()]

In [32]:
# Restrict to the sheet's columns (all but the trailing 'done' flag), in sheet order.
sheet_columns = manual_annotated.columns[:-1]
non_annotated = non_annotated.loc[:, sheet_columns]
non_annotated

Unnamed: 0,doi,doi_link,year,month,month_manual,title,relevant,extracted datasets,term


#### Add all not yet annotated results to google doc
(then annotate manually and rerun the notebook)

In [33]:
# Safety switch: set to True to write the not-yet-annotated papers to the sheet.
add_newfound = False

if add_newfound:
    gc = gspread.authorize(GoogleCredentials.get_application_default())
    worksheet = gc.open('DatasetScraping').worksheet('PubmedPaper')
    old_len = manual_annotated.shape[0]
    # Sheet rows are 1-based and row 1 is the header, hence the offset of 2.
    first_free_row = old_len + 2
    for ix in range(len(non_annotated)):
        # Short pause before every insert.
        time.sleep(.5)
        record = non_annotated.iloc[ix].fillna('')
        cells = [str(value) for value in record.values.tolist()]
        worksheet.insert_row(cells, first_free_row + ix)

### Screening results
ct (computed tomography) and us (ultrasound) are mapped to nochestxraydata

In [34]:
# An empty 'relevant' cell means the paper passed screening; all reasons that
# boil down to "no chest x-ray data" are merged into one label.
screening_labels = {
    '': 'PASSED',
    'nochestimagingdata': 'nochestxraydata',
    'ct': 'nochestxraydata',
    'us': 'nochestxraydata',
}
annotated['relevant'].replace(screening_labels).value_counts()

PASSED             201
nochestxraydata    190
nocovid             10
noenglish            2
Name: relevant, dtype: int64

In [35]:
# Papers with an empty 'relevant' entry passed the screening step.
chestxray_paper = annotated.loc[annotated['relevant'].eq('')]

### Dataset Extraction

Check that all screened papers are annotated with a dataset id

In [36]:
# Screened papers whose dataset annotation is still empty (expected: none).
has_no_dataset = chestxray_paper['extracted datasets'].str.len().eq(0)
chestxray_paper.loc[has_no_dataset]

Unnamed: 0,term,doi,title,status,PMID,year,month,datetype,doi_link,month_manual,relevant,extracted datasets,done


Extract the set of unique datasets
 * exclude private
 * exclude not_identified

In [37]:
# Flatten the ', '-separated dataset ids of all screened papers into one list.
# A nested comprehension avoids the quadratic cost of sum(list_of_lists, []).
all_datasets = [
    dataset
    for datasets in chestxray_paper['extracted datasets'].dropna().str.split(', ')
    for dataset in datasets
]

# Count the public datasets: skip empty entries, private data and
# references that could not be identified.
public_counts = collections.Counter(
    dataset
    for dataset in all_datasets
    if dataset and ('privat' not in dataset) and ('not_identified' not in dataset)
)

# Passing the column names to the constructor also works for an empty result.
set_unique = pd.DataFrame(public_counts.most_common(), columns=['name', 'count'])

In [38]:
# Summary: all references vs. references to public datasets vs. distinct public datasets.
n_references = len(all_datasets)
n_public = set_unique["count"].sum()
n_unique = set_unique.shape[0]
print(f'Found {n_references} dataset references, containing {n_public} public with {n_unique} thereof unique')

Found 512 dataset references, containing 463 public with 50 thereof unique


In [39]:
# One row of set_unique per distinct public dataset.
print(f'{len(set_unique)} unique datasets extracted')

50 unique datasets extracted


## Manual annotation of Dataset eligibility

Load annotations

In [52]:
# Open the 'PaperIdentifiedDatasets' tab of the 'DatasetScraping' Google sheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('DatasetScraping').worksheet('PaperIdentifiedDatasets')

# The first sheet row holds the column names, all following rows are records.
rows = worksheet.get_all_values()
header, records = rows[0], rows[1:]
datasets_eligibility = pd.DataFrame.from_records(records, columns=header)

Check if all datasets are annotated

In [53]:
# Attach the manual eligibility annotation to every extracted dataset name.
is_anaylsed = set_unique.merge(datasets_eligibility, how='left', on='name')
# Datasets without any annotation row (expected: none).
is_anaylsed.loc[is_anaylsed['eligibility'].isna()]

Unnamed: 0,name,count,eligibility,paper_name,origin


In [54]:
# An empty eligibility cell means the dataset is eligible.
is_anaylsed['eligibility'].replace({'': 'ELIGIBLE'}).value_counts()

remix             18
nocovid           13
casecollection    10
ELIGIBLE           7
noinfo             2
Name: eligibility, dtype: int64

In [55]:
# List the eligible datasets (those with an empty eligibility cell).
is_anaylsed.loc[is_anaylsed['eligibility'].eq('')]

Unnamed: 0,name,count,eligibility,paper_name,origin
9,actualmed,10,,ACUTALMED,https://github.com/agchung/Actualmed-COVID-che...
21,bimcv,3,,BIMCV-COVID19,https://bimcv.cipf.es/bimcv-projects/bimcv-cov...
27,mlhannover,2,,ML HANNOVER,https://github.com/ml-workgroup/covid-19-image...
33,tcia_rural,1,,COVID-19-AR,https://wiki.cancerimagingarchive.net/pages/vi...
38,covidgr,1,,COVIDGR,https://dasci.es/transferencia/open-data/covid...
42,ricord,1,,RICORD,https://wiki.cancerimagingarchive.net/pages/vi...
49,brixia,1,,BRIXIA,https://brixia.github.io/
