In [19]:
from bs4 import BeautifulSoup
import requests
from pypdf import PdfReader
import io
from tabula import read_pdf
import pandas as pd
import spacy 
from spacy.training import Example
from spacy import displacy


## Extract 10 most recent submissions from medRxiv medical papers - put into dataframe

In [8]:
def extract_article_text():

    res = requests.get('https://www.medrxiv.org/archive')
    content = res.content
    soup = BeautifulSoup(content)
    
    a = soup.find_all('a', {"class": "highwire-cite-linked-title"})
    b = soup.find_all('span',{"class":'highwire-citation-authors'})
    c = soup.find_all('span', {"class": "highwire-cite-metadata-doi highwire-cite-metadata"})
    
    assert len(a) > 0, 'No Articles Found'
    assert len(a) == len(b) == len(c), 'Mismatched article metadata'

    list_dict = []
    for ix,(i,j,k) in enumerate(zip(a,b,c)):
        list_dict.append({'title':i.text,'authors':j.text,'href':k.text.partition('doi: ')[2].strip().replace('doi.org','www.medrxiv.org/content') +'v1','full_pdf':k.text.partition('doi: ')[2].strip().replace('doi.org','www.medrxiv.org/content') + 'v1.full.pdf'})

    return pd.DataFrame(list_dict)

In [7]:
extract_article_text()

Unnamed: 0,title,authors,href,full_pdf
0,PREVALENCE OF DEPRESSION AND ANXIETY IN COLOMB...,"Sandra Martínez-Cabezas, Mónica Pinilla-Roncan...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
1,Peruvian National Survey of Mental Health and ...,"Victor Orlando Cruz, Andres Pariamachi, Nataly...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
2,The impact of prolonged walking on fasting pla...,"Anxious J. Niwaha, Lauren R. Rodgers, Andrew T...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
3,A unique cytotoxic CD4+ T cells signature defi...,"Sarah Baird, Caroline L Ashley, Felix Marsh-Wa...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
4,Epidemiology and aetiology of moderate to seve...,"Siobhan Lindsay Johnstone, Linda Erasmus, Jun...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
5,Drug-combination wide association studies of c...,"Rachel D Melamed, Panagiotis Nikolaos Lalagkas",https://www.medrxiv.org/content/10.1101/2022.0...,https://www.medrxiv.org/content/10.1101/2022.0...
6,Vitamin B12 in pregnancy and its relationship ...,"Rameesha Muzaffar, Jameeha Khursheed, Anum Yousaf",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
7,Association of Cancer History with Structural ...,"Jingxuan Wang, Kendra D Sims, Sarah F Ackley, ...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
8,Training parameters and longitudinal adaptatio...,"Pierce Boyne, Allison Miller, Sarah M Schwab, ...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...
9,"Knowledge, Attitude and Practices Regarding Di...","Amey Ambike, Shirish Rao, Raghav Paranjape, Sh...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...


In [13]:
df1 = extract_article_text()

## Loop through above df preview papers and extract abstracts

In [25]:
def access_archive_listing(input_df):

    abstracts = []  
    for ix,href in enumerate(input_df['href']):
        res = requests.get(href)
        content = res.content
        soup = BeautifulSoup(content)
        for ix,i in enumerate(soup.find_all('div',{"class":"section abstract"})):
            abstracts.append(i.text.partition('Abstract')[2].replace('\n',''))

    abstracts = list(set(abstracts))

    try:
        assert len(abstracts) == len(input_df['href']),f"Warning: # Abstracts != # Links {len(abstracts),len(input_df['href'])}"
    except Exception as e:
        print(e)

    input_df['abstract'] = abstracts

    return input_df

In [26]:
access_archive_listing(df1)

Unnamed: 0,title,authors,href,full_pdf,abstracts,abstract
0,PREVALENCE OF DEPRESSION AND ANXIETY IN COLOMB...,"Sandra Martínez-Cabezas, Mónica Pinilla-Roncan...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,"AimsIn many low-income countries, fasting gluc...","AimsIn many low-income countries, fasting gluc..."
1,Peruvian National Survey of Mental Health and ...,"Victor Orlando Cruz, Andres Pariamachi, Nataly...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Background and objectives. SARS-CoV-2 infectio...,Background and objectives. SARS-CoV-2 infectio...
2,The impact of prolonged walking on fasting pla...,"Anxious J. Niwaha, Lauren R. Rodgers, Andrew T...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Diarrhoea is a recognised complication of HIV-...,Diarrhoea is a recognised complication of HIV-...
3,A unique cytotoxic CD4+ T cells signature defi...,"Sarah Baird, Caroline L Ashley, Felix Marsh-Wa...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Background: Peru is the worst affected country...,Background: Peru is the worst affected country...
4,Epidemiology and aetiology of moderate to seve...,"Siobhan Lindsay Johnstone, Linda Erasmus, Jun...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Introduction:Gestational diabetes mellitus (GD...,Introduction:Gestational diabetes mellitus (GD...
5,Drug-combination wide association studies of c...,"Rachel D Melamed, Panagiotis Nikolaos Lalagkas",https://www.medrxiv.org/content/10.1101/2022.0...,https://www.medrxiv.org/content/10.1101/2022.0...,Background & Objective: Digital Service Provid...,Background & Objective: Digital Service Provid...
6,Vitamin B12 in pregnancy and its relationship ...,"Rameesha Muzaffar, Jameeha Khursheed, Anum Yousaf",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,"Combinations of common drugs may, when taken t...","Combinations of common drugs may, when taken t..."
7,Association of Cancer History with Structural ...,"Jingxuan Wang, Kendra D Sims, Sarah F Ackley, ...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,The COVID-19 pandemic has impacted the well-be...,The COVID-19 pandemic has impacted the well-be...
8,Training parameters and longitudinal adaptatio...,"Pierce Boyne, Allison Miller, Sarah M Schwab, ...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Background: Locomotor high-intensity interval ...,Background: Locomotor high-intensity interval ...
9,"Knowledge, Attitude and Practices Regarding Di...","Amey Ambike, Shirish Rao, Raghav Paranjape, Sh...",https://www.medrxiv.org/content/10.1101/2023.0...,https://www.medrxiv.org/content/10.1101/2023.0...,Background and Objectives: Cancer survivors ar...,Background and Objectives: Cancer survivors ar...


In [16]:
df2 = access_archive_listing(df1)

## Extract Paper Abstract Information Using NER Model

In [23]:
# test_text2 = '''
# Background and Objectives: Cancer survivors are less likely than comparably-aged individuals without a cancer history to develop Alzheimer's disease and related dementias (ADRD). We investigated the association between cancer history and structural magnetic resonance imaging (MRI) markers for ADRD risk, using linear mixed-effects models to assess differences at the mean values of MRI markers and quantile regression to examine whether the association varies across the distribution of MRI markers of brain aging. Methods: Among UK Biobank participants with ≥1 brain MRI, we considered total gray matter volume, total brain volume, hippocampal volume, white matter hyperintensity volume, and mean cortical thickness in the Alzheimer's disease (AD) signature region. Cancer history was ascertained from national registry and self-report. We first specified linear mixed models with random intercepts to assess mean differences in MRI markers according to cancer history. Next, to examine whether effects of cancer history on these markers varies across the ADRD risk distribution, we specified quantile regression models to assess differences in quantile cut-points of the distribution of MRI markers according to cancer history. Models adjusted for demographics, APOE-ε4 status, and health behaviors. Results: The sample included 42,242 MRIs on 37,588 participants with no cancer history (mean age 64.1 years), and 6,073 MRIs on 5,514 participants with a cancer diagnosis prior to MRI (mean age 66.7 years). Cancer history was associated with smaller mean hippocampal volume (b=-19 mm3, 95% confidence interval [CI]=-36, -1) and lower mean cortical thickness in the AD signature region (b=-0.004 mm, 95% CI=-0.007, -0.000). Quantile regressions indicated cancer history had larger effects on high quantiles of white matter hyperintensities (10th percentile b=-49 mm3, 95% CI=-112, 19; 90th percentile b=552 mm3, 95% CI= 250, 1002) and low quantiles of cortical thickness (10th percentile b=-0.006 mm, 95% CI=-0.011, -0.000; 90th percentile b=0.003 mm3, 95% CI=-0.003, 0.007), indicating individuals most vulnerable to ADRD were more affected by cancer history. Discussion: We found no evidence that cancer history was associated with less ADRD-related neurodegeneration. To the contrary, adults with cancer history had worse MRI indicators of dementia risk. Adverse associations were largest in the highest-risk quantiles of neuroimaging markers.
# '''

In [27]:
abstract_ner_model = spacy.load('./abstract_ner_trainer/model/')

In [28]:
def extract_abstract_entities(input_abstracts):
    for i in input_abstracts:
        doc1 = abstract_ner_model(i)
        displacy.render(doc1,style = 'ent')
        

In [29]:
extract_abstract_entities(df2['abstract'])

## Extract PDF tables 

In [15]:
def extract_pdf_tables(input_pdfs):

    full_tables = []
    for i in input_pdfs:
        dfs = read_pdf(i, pages='all')
        for table in dfs:
            full_tables.append(table)

    return full_tables

In [17]:
extract_pdf_tables(df2['full_pdf'])

Got stderr: Feb 23, 2023 9:33:39 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode

Got stderr: Feb 23, 2023 9:34:30 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Feb 23, 2023 9:34:31 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14



[    139                                       Data sources
 0   140                                                NaN
 1   141                      National mental health survey
 2   142                                                NaN
 3   143  We used data from the National Mental Health S...
 4   144  national and regional levels of the country. T...
 5   145  older with a positive dementia screen using th...
 6   146  over 12 years of age with cognitive limitation...
 7   147  did not speak Spanish (18). The final sample i...
 8   148  aged 12 to 17 years and 10870 adults 18 years ...
 9   149  excluded children from 7 to11 years old becaus...
 10  150                                         caregiver.
 11  151                                                NaN
 12  152    Individual Registry of Health Services Delivery
 13  153                                                NaN
 14  154  The Individual Registry of Health Services Del...
 15  155  of healthcare providers in Col