In [1]:
import os, glob, shutil
import sys

#Import config file. Update config.py according to your environment
from config import path_to_data

import pandas as pd
import numpy as np
from scipy import stats

import requests, json, re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from urllib3.exceptions import ReadTimeoutError
from selenium.common.exceptions import TimeoutException, WebDriverException

from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextBox

from tqdm import tqdm
import time
from datetime import datetime
import multiprocessing

In [2]:
# Prefix of the parquet file names read by load_dataset(); the trailing comment suggests 'arxiv' is the other value in use.
dataset_source = 'semanticsscholar' #'arxiv'

In [3]:
def load_dataset(dataset_source='semanticsscholar', years=None, data_types=None):
    """Load the yearly metadata and/or text parquet files of a dataset.

    Files are looked up under ``path_to_data`` as
    ``<kind>/<year>/<dataset_source>_<kind>_<year>.parquet`` with ``kind`` being
    ``'metadata'`` or ``'text'``. Years whose file does not exist are skipped; the
    printed summary lists the years that were actually loaded.

    Parameters
    ----------
    dataset_source : str
        Prefix of the parquet file names.
    years : int, str, iterable of int/str, or None
        Year(s) to load. Any non-string iterable (list, tuple, range, array...) is
        accepted. A falsy scalar / ``None`` loads every all-digit sub-directory of
        ``<path_to_data>/metadata`` in ascending order.
    data_types : list of str, optional
        Any of ``'metadata'`` and ``'text'``; defaults to both.

    Returns
    -------
    generator
        Yields the concatenated metadata frame, then the concatenated text frame,
        leaving out any that is empty or was not requested. Unpacking into two
        names therefore only works when both frames are non-empty.

    Raises
    ------
    ValueError
        If both data types were requested and the two frames differ in row count.
    """
    if years is not None and not isinstance(years, (str, bytes)) and hasattr(years, '__iter__'):
        # Any collection of years, not just ``list`` (a tuple used to be stringified as a whole).
        years = [str(year) for year in years]
    elif years:
        years = [str(years)]
    else:
        metadata_root = os.path.join(path_to_data, 'metadata')
        # os.listdir() order is arbitrary: sort so that the concatenation order is reproducible.
        years = sorted(
            (year for year in os.listdir(metadata_root) if year.isdigit()),
            key=lambda year: (len(year), year),
        )
    data_types = data_types or ['metadata', 'text']

    load_metadata = 'metadata' in data_types
    load_textdata = 'text' in data_types

    metadata, textdata = [], []
    metadata_years, textdata_years = [], []

    for year in years:
        if load_metadata:
            metadata_path = os.path.join(path_to_data, 'metadata', year, f'{dataset_source}_metadata_{year}.parquet')
            if os.path.isfile(metadata_path):
                metadata.append(pd.read_parquet(metadata_path, engine="pyarrow"))
                metadata_years.append(year)

        if load_textdata:
            textdata_path = os.path.join(path_to_data, 'text', year, f'{dataset_source}_text_{year}.parquet')
            if os.path.isfile(textdata_path):
                textdata.append(pd.read_parquet(textdata_path, engine="pyarrow"))
                textdata_years.append(year)

    # Empty lists are kept as placeholders so that len() works in the checks below.
    metadata = pd.concat(metadata, axis=0) if metadata else []
    textdata = pd.concat(textdata, axis=0) if textdata else []

    msg_parts = []
    if load_metadata:
        msg_parts.append(f'metadata loaded for years: {metadata_years}')
    if load_textdata:
        msg_parts.append(f'text data loaded for years: {textdata_years}')

    if msg_parts:
        print("; ".join(msg_parts))

    if load_metadata and load_textdata and len(metadata) != len(textdata):
        raise ValueError("Metadata and text data don't have the same length.")

    # Kept as a generator for backward compatibility with existing callers.
    output = (data for data in [metadata, textdata] if len(data) > 0)
    return output

In [37]:
def pdf_main_fontsize(pdf_path):
    """Return the most frequent character font size in the PDF, or 0 when no character is found."""
    font_sizes = []
    for page in extract_pages(pdf_path):
        for box in page:
            if not isinstance(box, LTTextContainer):
                continue
            for line in box:
                if not isinstance(line, LTTextLine):
                    continue
                # Only real glyphs carry a size; other line items are skipped.
                font_sizes.extend(glyph.size for glyph in line if isinstance(glyph, LTChar))

    if not font_sizes:
        return 0
    dominant_sizes = stats.mode(font_sizes, keepdims=True)[0]
    return dominant_sizes[0]

def extract_pdf_sections(pdf_path, authorlist, paperId, possible_section_headings=None, size_threshold=None):
    """Split the body text of a paper PDF into tagged blocks and group them by section.

    Text boxes are read page by page. Boxes in the outer 5% page margins, page-number
    lines, short date headers, captions, the author block and everything from the
    acknowledgment/reference section onwards are discarded. The remaining boxes are
    tagged by a small state machine (author block -> abstract -> introduction ->
    headings found in the text).

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file.
    authorlist : str or None
        Comma-separated author names; the last word of each name is used to locate
        the author block on the first two pages. When it is not a usable string
        (None, NaN, empty) the author search is skipped and the abstract search
        starts immediately.
    paperId :
        Identifier copied unchanged into the returned record.
    possible_section_headings : collection of str, optional
        Lower-case heading fragments that mark a section title; a default set is
        used when omitted.
    size_threshold : float, optional
        A box is kept when its largest font size is at least this fraction of the
        document's dominant font size (or when it holds more than 50 words).
        Defaults to 0.9.

    Returns
    -------
    tuple
        ``(sections, text_blocks, tag_blocks)`` where ``sections`` is a dict with keys
        ``paperId``, ``pdf_abstract``, ``pdf_introduction``, ``pdf_results``,
        ``pdf_discussion``, ``pdf_methods``, ``pdf_text`` and ``author_list``, and
        the two lists hold the kept text boxes and their tags in reading order.
    """
    if isinstance(authorlist, str):
        author_last_names = [name.split()[-1].lower() for name in authorlist.split(',') if name.strip()]
    else:
        # Missing authors (None / NaN from a left merge) must not abort the extraction.
        author_last_names = []
    # Matched as word prefixes so that plural headings ("References", "Acknowledgments") are caught too.
    last_section_prefixes = ('acknowledgment', 'acknowledgement', 'acknowlegment', 'reference')
    month_markers = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october',
                     'november', 'december',
                     'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec'}
    year_markers = {str(y) for y in range(1900, 2030)}
    # Matched against the start of the first word only ("stable" or "configuration" are not captions).
    caption_prefixes = ('fig', 'table', 'image')
    section_list = possible_section_headings or {
        'introduction', 'results', 'discussion', 'conclusions', 'methods', 'materials', 'experimental',
        'materials and methods', 'experimental procedure', 'related work', 'i.', 'ii.', 'iii.', 'iv.', 'v.', 'vi.'
    }

    size_threshold = size_threshold if size_threshold is not None else 0.9

    # NOTE(review): margins assume A4 pages measured in points; other page sizes use the same limits.
    page_width, page_height = 595, 842
    nword_abstract_th, nword_sections_th = 30, 30

    #Determine the most common font size (mode)
    size_mode = pdf_main_fontsize(pdf_path)

    text_blocks, tag_blocks, nwords_in_blocks = [], [], []
    section_to_find = 'AUTHORS' if author_last_names else 'ABSTRACT'
    section_heading, tag = 'UNDEFINED', 'UNDEFINED'
    reached_end = False
    for p, page_layout in enumerate(extract_pages(pdf_path)):
        if reached_end:
            break

        for element in page_layout:
            if not isinstance(element, LTTextBox):
                continue

            x0, y0, x1, y1 = element.bbox
            if not (y0 > 0.05*page_height and y1 < 0.95*page_height and x0 > 0.05*page_width and x1 < 0.95*page_width):
                continue

            line_parts, sizes = [], []
            for text_line in element:
                if isinstance(text_line, LTTextLine):
                    sizes.extend(character.size for character in text_line if isinstance(character, LTChar))
                    line_str = "".join(character.get_text() for character in text_line).strip()
                    if not re.fullmatch(r"\d+", line_str): # Keeps lines that are NOT just numbers
                        line_parts.append(line_str)
                    line_parts.append("\n") # Preserve line breaks

            #joining lines in a single text while removing the "(cid:N)" markers generated by pdfminer;
            #stripping again afterwards so that the first character is never leftover whitespace
            filtered_text = re.sub(r'\(cid:[^\)]+\)', '', "".join(line_parts).strip()).strip()
            word_list = re.split(r'[\n\s]+', filtered_text.lower().strip())
            nwords = len(word_list)

            if any(word.startswith(last_section_prefixes) for word in word_list[:3]):
                # Nothing after this point is kept, so stop right away instead of letting the
                # remaining boxes of the page alter the blocks already stored.
                reached_end = True
                break

            #removing everything before the author block as well as the correspondence fields
            if p <= 1 and author_last_names:
                block_words = ' '.join(word_list)
                nauthors_detected = sum(lastname in block_words for lastname in author_last_names)
                if nauthors_detected >= 0.5 * len(author_last_names) and y0 > 0.3 * page_height:
                    text_blocks, tag_blocks, nwords_in_blocks = [], [], []
                    section_to_find = 'ABSTRACT'
                    continue
                if nauthors_detected > 0 and '@' in filtered_text:
                    continue

            #removing blocks likely headers with publication date
            if nwords < 10 and not month_markers.isdisjoint(word_list) and not year_markers.isdisjoint(word_list):
                continue

            #removing figure captions
            if word_list[0].startswith(caption_prefixes):
                continue

            if filtered_text and nwords_in_blocks and nwords_in_blocks[-1] <= 3:
                if not filtered_text[0].isupper():
                    #removing previous block if likely a header but not followed by capitalized paragraph
                    text_blocks.pop()
                    tag_blocks.pop()
                    nwords_in_blocks.pop()
                elif any(h in re.sub(r'[\n\s]+', ' ', text_blocks[-1].lower()) for h in section_list):
                    #previous short block is a known heading: remember it (digits and dots removed)
                    section_heading = re.sub(r'[\d.]', '', text_blocks[-1]).upper()
                    if nwords > nword_sections_th:
                        tag_blocks[-1] = section_heading

            if filtered_text and (max(sizes, default=0) >= size_threshold * size_mode or nwords > 50):
                if section_to_find == 'ABSTRACT' and nwords > nword_abstract_th:
                    tag = 'ABSTRACT'
                    #the abstract is considered complete once a block ends with a full stop
                    if word_list[-1].endswith('.'):
                        section_to_find = 'INTRODUCTION'
                elif section_to_find == 'INTRODUCTION':
                    if nwords > nword_sections_th:
                        tag = 'INTRODUCTION'
                        section_heading, section_to_find = 'INTRODUCTION', 'NEXTHEADING'
                    else:
                        tag = 'UNDEFINED'
                elif section_to_find == 'NEXTHEADING' and nwords > nword_sections_th:
                    tag = section_heading

                text_blocks.append(filtered_text)
                tag_blocks.append(tag)
                nwords_in_blocks.append(nwords)

    def _join_tagged(markers):
        """Join, in reading order, the blocks whose tag contains one of `markers`."""
        return '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if any(s in t.lower() for s in markers))

    # NOTE(review): headings have their dots removed above, so the 'i.'/'ii.'/'v.'/'vi.' markers
    # below cannot match any tag; they are kept unchanged pending a decision on numbered headings.
    sections = {'paperId': paperId,
                'pdf_abstract': _join_tagged({'abstract'}),
                'pdf_introduction': _join_tagged({'introduction', 'related work', 'i.', 'ii.'}),
                'pdf_results': _join_tagged({'results', 'experiment', 'i.', 'ii.'}),
                'pdf_discussion': _join_tagged({'discussion', 'conclusion', 'v.', 'vi.'}),
                'pdf_methods': _join_tagged({'methods', 'materials', 'experimental', 'materials and methods',
                                             'experimental procedure'}),
                'pdf_text': '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if 'undefined' not in t.lower()),
                'author_list': authorlist
                }

    return sections, text_blocks, tag_blocks

In [None]:
def _failed_pdf_record(paper, problem):
    """Return the placeholder record stored for `paper` when no PDF text could be extracted."""
    return {'paperId': paper['paperId'], 'author_list': paper['authorName'], 'pdf_text': None, 'pdf_abstract': None,
            'pdf_introduction': None, 'pdf_discussion': None, 'pdf_results': None, 'pdf_methods': None,
            'problem': problem}


def _wait_for_pdf(driver, download_dir, label, url, timeout_seconds, timeout_startdw_seconds, block_markers):
    """Poll `download_dir` until a finished PDF shows up; return its path, or None when the download failed."""
    end_time = time.time() + timeout_seconds
    end_time_startdw = time.time() + timeout_startdw_seconds
    while time.time() < end_time:
        time.sleep(0.1)
        pdf_paths = glob.glob(os.path.join(download_dir, '*.pdf'))
        if pdf_paths:
            # The folder is dedicated to this paper; if the site delivered several files keep the newest one.
            return max(pdf_paths, key=os.path.getmtime)
        page_source = driver.page_source.lower()
        if any(marker in page_source for marker in block_markers):
            return None
        # Chrome writes a *.crdownload file while a download is in progress.
        if time.time() > end_time_startdw and not glob.glob(os.path.join(download_dir, '*.crdownload')):
            print(f"{label}: Download not started after {timeout_startdw_seconds} seconds for {url}.")
            return None
    print(f"{label}: Download not finished after {timeout_seconds} seconds for {url}.")
    return None


def download_pdfs(inputs):
    """Download the open-access PDF of each paper with Chrome and extract its sections.

    Requires ChromeDriver to be installed (``/usr/bin/chromedriver``, or any
    ``chromedriver`` found on PATH when that file does not exist).

    Parameters
    ----------
    inputs : tuple
        ``(txtdata, downloads_dir)``. ``txtdata`` is a list of dicts with the keys
        ``'paperId'``, ``'openAccessPdf'`` and ``'authorName'``; ``downloads_dir`` is
        the scratch folder for this batch (``'./data/pdf/downloads'`` when None).
        The folder is deleted before returning. A single tuple argument is used so
        that the function can be mapped over a multiprocessing pool.

    Returns
    -------
    list of dict
        One record per input paper, in input order, with the keys produced by
        ``extract_pdf_sections`` plus ``'problem'``: ``None`` on success,
        ``'captcha'`` when no file was obtained (block page or download timeout),
        ``'broken link'`` when the URL is missing or the page failed to load, and
        ``'pdf parsing error'`` when the downloaded file could not be parsed.
    """
    txtdata, downloads_dir = inputs
    # Chrome needs an absolute download path, and glob() results must be usable as-is.
    downloads_dir = os.path.abspath(downloads_dir or './data/pdf/downloads')
    os.makedirs(downloads_dir, exist_ok=True)

    timeout_seconds = 10
    timeout_startdw_seconds = 5
    block_markers = {"captcha", "verify you are human", "forbidden", "this site can’t be reached"}

    chromedriver_path = "/usr/bin/chromedriver"
    if not os.path.isfile(chromedriver_path):
        chromedriver_path = shutil.which("chromedriver") or chromedriver_path
    service = Service(executable_path=chromedriver_path)

    options = webdriver.ChromeOptions()
    prefs = {
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
        "download.default_directory": downloads_dir,
        "download.directory_upgrade": True
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(service=service, options=options)
    text_pdfs = []
    fraction_print = 0

    try:
        for k, paper in enumerate(txtdata):
            pdf_url = paper['openAccessPdf']
            if not isinstance(pdf_url, str) or not pdf_url.strip():
                # No usable URL (None / NaN / empty string): nothing to download.
                pdf_data = _failed_pdf_record(paper, 'broken link')
            else:
                clean_url = re.sub(r'^http://', 'https://', pdf_url)
                custom_download_dir = os.path.join(downloads_dir, paper['paperId'])
                try:
                    os.makedirs(custom_download_dir, exist_ok=True)

                    # Set Chrome's download directory dynamically
                    driver.execute_cdp_cmd(
                        "Page.setDownloadBehavior",
                        {"behavior": "allow",
                         "downloadPath": custom_download_dir}
                    )
                    driver.set_page_load_timeout(timeout_seconds)
                    driver.get(clean_url)
                    filepath = _wait_for_pdf(driver, custom_download_dir, f"Paper {k}", clean_url,
                                             timeout_seconds, timeout_startdw_seconds, block_markers)
                except (TimeoutException, WebDriverException, ReadTimeoutError):
                    print(f"Paper {k}: Error loading {clean_url}")
                    pdf_data = _failed_pdf_record(paper, 'broken link')
                else:
                    if filepath:
                        try:
                            pdf_data, _, _ = extract_pdf_sections(filepath, paper['authorName'], paper['paperId'])
                            pdf_data['problem'] = None
                        except Exception as error:
                            # One unreadable file must not abort the whole batch.
                            print(f"Paper {k}: Error parsing {filepath}: {error!r}")
                            pdf_data = _failed_pdf_record(paper, 'pdf parsing error')
                        shutil.rmtree(custom_download_dir, ignore_errors=True)
                    else:
                        # The folder is left in place: a slow download may still complete and is
                        # picked up by the final pass below.
                        # NOTE(review): timeouts are also reported as 'captcha'; kept for compatibility.
                        pdf_data = _failed_pdf_record(paper, 'captcha')

            text_pdfs.append(pdf_data)
            fraction_done = (k + 1) / len(txtdata) * 100
            if fraction_done > fraction_print:
                print(f"{fraction_done:.0f}% of papers processed in {downloads_dir}")
                fraction_print = fraction_print + 20
    finally:
        # Always release the browser, even when an unexpected error interrupts the batch.
        driver.quit()

    #Extracting pdfs from downloads that took too long to be processed immediately
    if os.path.isdir(downloads_dir):
        records_by_id = {}
        for record in text_pdfs:
            records_by_id.setdefault(record['paperId'], record)

        for dirname in os.listdir(downloads_dir):
            pdf_paths = glob.glob(os.path.join(downloads_dir, dirname, '*.pdf'))
            paper_data = records_by_id.get(dirname)
            if not pdf_paths or paper_data is None:
                continue
            try:
                pdf_data, _, _ = extract_pdf_sections(pdf_paths[0], paper_data['author_list'], paper_data['paperId'])
            except Exception as error:
                print(f"Paper {dirname}: Error parsing {pdf_paths[0]}: {error!r}")
                continue
            paper_data.update(pdf_data)
            paper_data['problem'] = None

        shutil.rmtree(downloads_dir, ignore_errors=True)

    return text_pdfs

In [12]:
# Load the 2005 metadata and text tables; the unpacking relies on load_dataset() yielding exactly two frames.
metadata, textdata = load_dataset(dataset_source, years=2005)

metadata loaded for years: ['2005']; text data loaded for years: ['2005']


In [None]:
# TODO: implement extraction of cited articles before running the batch
start_year = 2020
end_year = 2025
num_processes = 10
batch_size = 100 # papers handled by one browser session
# Scratch folders for the downloads live next to the rest of the data instead of a user-specific absolute path.
download_basedir = os.path.join(path_to_data, 'pdf')

for year in range(start_year, end_year+1):
    loaded = list(load_dataset(dataset_source, years=year))
    if len(loaded) != 2:
        # load_dataset() leaves out missing/empty tables; skip the year instead of failing on unpacking.
        print(f'Skipping {year}: metadata and/or text data not available.')
        continue
    metadata, textdata = loaded

    data4pdf = pd.merge(textdata[['paperId','openAccessPdf']], metadata[['paperId','authorName']], on='paperId', how='left')
    # One (records, scratch folder) pair per batch; sliced on the merged frame so that no row is left out.
    textdata_list = [
        (data4pdf.iloc[start:start + batch_size].to_dict(orient='records'), os.path.join(download_basedir, str(k)))
        for k, start in enumerate(range(0, len(data4pdf), batch_size))
    ]
    if not textdata_list:
        print(f'Skipping {year}: no paper to process.')
        continue

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(pool.imap(download_pdfs, textdata_list))

    pdf_texts = pd.DataFrame([pdf for result in results for pdf in result])
    textdata_pdf = pd.merge(left=textdata, right=pdf_texts, on='paperId', how='left')

    filepath = os.path.join(path_to_data, 'text', str(year), f'{dataset_source}_textpdf_{year}.parquet')
    textdata_pdf.to_parquet(filepath, engine="pyarrow", compression="snappy", index=True)

In [34]:
# Flatten the per-batch extraction records into one table and left-join it onto the text table.
pdf_texts = pd.DataFrame([record for batch in results for record in batch])
textdata_pdf = textdata.merge(pdf_texts, how='left', on='paperId')
# textdata_pdf.to_parquet(file_path + '_text_pdf.parquet', engine="pyarrow", compression="snappy", index=True)

In [42]:
# Show the text table, then the extraction records flattened from `results` (both are displayed in full).
display(textdata)
pd.DataFrame([pdf for result in results for pdf in result])

Unnamed: 0,paperId,title,abstract,referenceTitles,openAccessPdf
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,Disruption of Fibroblast Growth Factor Signal ...,Purpose: Synovial sarcoma is a soft tissue sar...,,https://aacrjournals.org/clincancerres/article...
1,000372290caae5482dcdd7954feb895ccbab921d,Shuffled iterative decoding,Shuffled versions of iterative decoding of low...,Low-Density Parity-Check Codes Turbo decoder a...,http://www.merl.com/publications/docs/TR2005-0...
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,Diphasic effects of Astragalus membranaceus BU...,This study was designed to investigate the eff...,,https://www.jstage.jst.go.jp/article/bpb/28/8/...
3,0003a5747ee3676be1b6d2c72c13a04e41811274,Critical roles of CD146 in zebrafish vascular ...,"In this report, we use zebrafish as a model sy...",,https://onlinelibrary.wiley.com/doi/pdfdirect/...
4,00043e28cffe744909bf69cde70e6dac0bce0a10,Differential Contribution of Troponin I Phosph...,Cardiac troponin I is a phosphorylation target...,,http://www.jbc.org/article/S0021925820589394/pdf
...,...,...,...,...,...
69983,fffb5a27f191307c8f2fed3a7d1162f93cbb8cb6,Irregular cycles and steroid hormones in polyc...,BACKGROUND\nThis cross-sectional study was und...,Predictors for treatment failure after laparos...,https://academic.oup.com/humrep/article-pdf/20...
69984,fffbc9b37f20515de2ff4fe60353d52dc37e65f5,An organizational grid of federated MOSIX clus...,MOSIX is a cluster management system that uses...,From sandbox to playground: dynamic virtual en...,http://www.cs.virginia.edu/~jl3aq/courses/CS85...
69985,fffd2d6d54b4023ece28bfdebffdfb00502a09ec,Mutations in the RNA Polymerase III Subunit Rp...,ABSTRACT Termination by RNA polymerase III (Po...,Nuclear surveillance and degradation of hypomo...,https://europepmc.org/articles/pmc543423?pdf=r...
69986,ffff785a92f7cacc7ecb6f3ac3f20ba91d5213b3,The Core Histone N-terminal Tail Domains Funct...,Salt-dependent oligomerization of nucleosomal ...,Histone modifications: combinatorial complexit...,http://www.jbc.org/article/S0021925820789519/pdf


Unnamed: 0,paperId,pdf_abstract,pdf_introduction,pdf_results,pdf_discussion,pdf_methods,pdf_text,author_list,problem
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,Abstract Purpose: Synovial sarcoma is a soft t...,Synovial sarcoma is the most frequent soft-tis...,Results\nSynovial sarcoma cell lines expressed...,Discussion\nFGFs may activate genetic programs...,Materials and Methods\nTumor tissues were obta...,Abstract Purpose: Synovial sarcoma is a soft t...,"T. Ishibe,T. Nakayama,T. Okamoto,T. Aoyama,K. ...",
1,000372290caae5482dcdd7954feb895ccbab921d,Replica shufﬂed versions of interactive decode...,This work may not be copied or reproduced in w...,D. Simulation results\nIII. ITERATIVE DECODING...,,,Replica shufﬂed versions of interactive decode...,"Juntan Zhang,M. Fossorier",
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,This study was designed to investigate the eff...,The roots of Astragalus membranaceus BUNGE...,M) induced a similar sustained contractio...,DISCUSSION\nThe present study ﬁrst and detaile...,MATERIALS AND METHODS\nPlant Material and P...,This study was designed to investigate the eff...,"Bi-qi Zhang,Shen-Jiang Hu,Lihong Qiu,Q. Shan,J...",
3,0003a5747ee3676be1b6d2c72c13a04e41811274,"In this report, we use zebraﬁsh as a model sys...","INTRODUCTION\nCD146, also known as MUC18, A32,...","A cDNA clone (clone number, 3512;\nGenBank acc...",,Total RNA was collected from ze-\nbraﬁsh embry...,"In this report, we use zebraﬁsh as a model sys...","Barden Chan,S. Sinha,Dan Cho,R. Ramchandran,V....",
4,00043e28cffe744909bf69cde70e6dac0bce0a10,Cardiac troponin I is a phosphorylation target...,Protein kinase C (PKC)2 activation is an impor...,EXPERIMENTAL PROCEDURES\nMutagenesis Strategy—...,DISCUSSION\nResults from the present study dem...,EXPERIMENTAL PROCEDURES\nMutagenesis Strategy—...,Cardiac troponin I is a phosphorylation target...,"M. Westfall,A. M. Lee,D. Robinson",
...,...,...,...,...,...,...,...,...,...
95,004d7fddf20a6932e6391fc9dcf9dca8c52abcaa,There are several factors that inﬂuence the ra...,"INTRODUCTION\nIn the recent years, a lot of ef...",RESULTS\nThe estimated relative uncertainty in...,DISCUSSION\nMany interesting results can be fo...,Introductory remarks\nIn the paper by Ha˚kanss...,There are several factors that inﬂuence the ra...,"M. Båth,M. Håkansson,S. Börjesson,S. Kheddache...",
96,004dbb1be00568e82684981115029880f24f1967,,,,,,,"Y. Dewoody,J. A. Dewoody",captcha
97,004e863417f890203b7a5464569bfe5ea0b7b35c,The innate immune system serves\nas an initial...,"At first glance, NKT cells may\nbe easily mist...",,,,The innate immune system serves\nas an initial...,"L. Kaer,S. Joyce",
98,005032c8f2b4ae916505f4fb43c2d638b6b72b78,"Abstract Purpose: t(12;21)(p13; q22), present ...",The t(12;21)(p13;q22) occurs in f25% of childh...,"Results\nThe mRNA expression levels of TEL, AM...","Discussion\nIn the present study, we examined ...",Materials and Methods\nPatient samples. Bone m...,"Abstract Purpose: t(12;21)(p13; q22), present ...","W. Stams,M. D. den Boer,H. Beverloo,J. Meijeri...",


In [35]:
# Show the metadata table and the text table merged with the PDF extraction columns.
display(metadata)
display(textdata_pdf)

Unnamed: 0,paperId,openAccessPdf,fieldsOfStudy,venue,authorName,authorId,citationCount,influentialCitationCount,publicationDate,publicationYear,referenceIds,referenceTitles
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,https://aacrjournals.org/clincancerres/article...,"Biology,Medicine",Clinical Cancer Research,"T. Ishibe,T. Nakayama,T. Okamoto,T. Aoyama,K. ...","14839966,10239302,2000448727,3201672,7147569,2...",88,6.0,2005-04-01,2005,,
1,000372290caae5482dcdd7954feb895ccbab921d,http://www.merl.com/publications/docs/TR2005-0...,"Computer Science,Mathematics",IEEE Transactions on Communications,"Juntan Zhang,M. Fossorier",319335048891339,319,26.0,2005-03-07,2005,"17d0afe4f814e5abc6252e572e3b039082bf6f07,02464...",Low-Density Parity-Check Codes Turbo decoder a...
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,https://www.jstage.jst.go.jp/article/bpb/28/8/...,Medicine,Biological and Pharmaceutical Bulletin,"Bi-qi Zhang,Shen-Jiang Hu,Lihong Qiu,Q. Shan,J...","11809650,6617669,123635715,11305673,2152146096...",24,2.0,2005-08-01,2005,,
3,0003a5747ee3676be1b6d2c72c13a04e41811274,https://onlinelibrary.wiley.com/doi/pdfdirect/...,"Biology,Medicine",Developmental Dynamics,"Barden Chan,S. Sinha,Dan Cho,R. Ramchandran,V....",356909232114125286211325257421347733658233,53,2.0,2005-01-01,2005,,
4,00043e28cffe744909bf69cde70e6dac0bce0a10,http://www.jbc.org/article/S0021925820589394/pdf,"Chemistry,Medicine",Journal of Biological Chemistry,"M. Westfall,A. M. Lee,D. Robinson",50259834924991148706510,70,3.0,2005-12-16,2005,,
...,...,...,...,...,...,...,...,...,...,...,...,...
69983,fffb5a27f191307c8f2fed3a7d1162f93cbb8cb6,https://academic.oup.com/humrep/article-pdf/20...,"Medicine,Biology",Human Reproduction,"S. Doi,M. Al-Zaid,P. Towers,C. Scott,K. A. Al-...",40151220140419556737556460787452101403335938,46,0.0,2005-09-01,2005,"3d2aaa390e58ff4d10cb2b6bfb4813e48565998c,8abd1...",Predictors for treatment failure after laparos...
69984,fffbc9b37f20515de2ff4fe60353d52dc37e65f5,http://www.cs.virginia.edu/~jl3aq/courses/CS85...,Computer Science,CCGrid 2005. IEEE International Symposium on C...,"A. Barak,A. Shiloh,Lior Amar",1442149964039977234848329,42,1.0,2005-05-09,2005,"1713c96f258a9925b3ca0d814fdd0641c07bb3f1,3342b...",From sandbox to playground: dynamic virtual en...
69985,fffd2d6d54b4023ece28bfdebffdfb00502a09ec,https://europepmc.org/articles/pmc543423?pdf=r...,"Biology,Medicine",Molecular and Cellular Biology,"Ying Huang,R. Intine,Amy M Mozlin,Samuel A. Ha...",4835581165340851500362756628433935489,64,1.0,2005-01-01,2005,"49bb10549894f0596e527139cf4a3bfb28fd6f2c,ced74...",Nuclear surveillance and degradation of hypomo...
69986,ffff785a92f7cacc7ecb6f3ac3f20ba91d5213b3,http://www.jbc.org/article/S0021925820789519/pdf,"Biology,Medicine",Journal of Biological Chemistry,"Faye Gordon,K. Luger,J. Hansen",48200695386471034597814,143,3.0,2005-10-07,2005,"83a02dee50f2f83987d5490c8c9501609fd8f2df,c2150...",Histone modifications: combinatorial complexit...


Unnamed: 0,paperId,title,abstract,referenceTitles,openAccessPdf,pdf_abstract,pdf_introduction,pdf_results,pdf_discussion,pdf_methods,pdf_text,problem
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,Disruption of Fibroblast Growth Factor Signal ...,Purpose: Synovial sarcoma is a soft tissue sar...,,https://aacrjournals.org/clincancerres/article...,Abstract Purpose: Synovial sarcoma is a soft t...,Synovial sarcoma is the most frequent soft-tis...,Results\nSynovial sarcoma cell lines expressed...,Discussion\nFGFs may activate genetic programs...,Materials and Methods\nTumor tissues were obta...,Abstract Purpose: Synovial sarcoma is a soft t...,
1,000372290caae5482dcdd7954feb895ccbab921d,Shuffled iterative decoding,Shuffled versions of iterative decoding of low...,Low-Density Parity-Check Codes Turbo decoder a...,http://www.merl.com/publications/docs/TR2005-0...,Replica shufﬂed versions of interactive decode...,This work may not be copied or reproduced in w...,D. Simulation results\nIII. ITERATIVE DECODING...,,,Replica shufﬂed versions of interactive decode...,
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,Diphasic effects of Astragalus membranaceus BU...,This study was designed to investigate the eff...,,https://www.jstage.jst.go.jp/article/bpb/28/8/...,This study was designed to investigate the eff...,The roots of Astragalus membranaceus BUNGE...,M) induced a similar sustained contractio...,DISCUSSION\nThe present study ﬁrst and detaile...,MATERIALS AND METHODS\nPlant Material and P...,This study was designed to investigate the eff...,
3,0003a5747ee3676be1b6d2c72c13a04e41811274,Critical roles of CD146 in zebrafish vascular ...,"In this report, we use zebrafish as a model sy...",,https://onlinelibrary.wiley.com/doi/pdfdirect/...,"In this report, we use zebraﬁsh as a model sys...","INTRODUCTION\nCD146, also known as MUC18, A32,...","A cDNA clone (clone number, 3512;\nGenBank acc...",,Total RNA was collected from ze-\nbraﬁsh embry...,"In this report, we use zebraﬁsh as a model sys...",
4,00043e28cffe744909bf69cde70e6dac0bce0a10,Differential Contribution of Troponin I Phosph...,Cardiac troponin I is a phosphorylation target...,,http://www.jbc.org/article/S0021925820589394/pdf,Cardiac troponin I is a phosphorylation target...,Protein kinase C (PKC)2 activation is an impor...,EXPERIMENTAL PROCEDURES\nMutagenesis Strategy—...,DISCUSSION\nResults from the present study dem...,EXPERIMENTAL PROCEDURES\nMutagenesis Strategy—...,Cardiac troponin I is a phosphorylation target...,
...,...,...,...,...,...,...,...,...,...,...,...,...
69983,fffb5a27f191307c8f2fed3a7d1162f93cbb8cb6,Irregular cycles and steroid hormones in polyc...,BACKGROUND\nThis cross-sectional study was und...,Predictors for treatment failure after laparos...,https://academic.oup.com/humrep/article-pdf/20...,,,,,,,
69984,fffbc9b37f20515de2ff4fe60353d52dc37e65f5,An organizational grid of federated MOSIX clus...,MOSIX is a cluster management system that uses...,From sandbox to playground: dynamic virtual en...,http://www.cs.virginia.edu/~jl3aq/courses/CS85...,,,,,,,
69985,fffd2d6d54b4023ece28bfdebffdfb00502a09ec,Mutations in the RNA Polymerase III Subunit Rp...,ABSTRACT Termination by RNA polymerase III (Po...,Nuclear surveillance and degradation of hypomo...,https://europepmc.org/articles/pmc543423?pdf=r...,,,,,,,
69986,ffff785a92f7cacc7ecb6f3ac3f20ba91d5213b3,The Core Histone N-terminal Tail Domains Funct...,Salt-dependent oligomerization of nucleosomal ...,Histone modifications: combinatorial complexit...,http://www.jbc.org/article/S0021925820789519/pdf,,,,,,,


In [None]:
# Reference URL only (the cell just echoes the string); presumably the source of the
# 'PIIS009286740500098X' PDF parsed in the next cell — TODO confirm.
'https://www.cell.com/article/S009286740500098X/pdf'

In [None]:
# Manual check of the extraction on one locally saved PDF, with its author list given explicitly
# and 0 passed as a dummy paperId.
pdf_data, pdf_sections, pdf_tags = extract_pdf_sections(os.path.join(path_to_data, 'pdf', '0', 'PIIS009286740500098X (1).pdf'),'I. Krylova,E. Sablin,Jamie M. R. Moore,R. Xu,G. Waitt,J. A. MacKay,D. Juzumiene,J. Bynum,K. Madauss,V. Montana,L. Lebedeva,M. Suzawa,Jon D. Williams,Shawn P. Williams,R. Guy,J. W. Thornton,R. Fletterick,T. Willson,H. Ingraham', 0)

In [24]:
# Preview the last names derived from the first paper's comma-separated author string
# (last space-separated token of each name).
[name.split(' ')[-1] for name in data4pdf['authorName'].iloc[0].split(',')]

['Krylova',
 'Sablin',
 'Moore',
 'Xu',
 'Waitt',
 'MacKay',
 'Juzumiene',
 'Bynum',
 'Madauss',
 'Montana',
 'Lebedeva',
 'Suzawa',
 'Williams',
 'Williams',
 'Guy',
 'Thornton',
 'Fletterick',
 'Willson',
 'Ingraham']

In [27]:
# Print every extracted block under its tag, separated by a blank line.
for txt, tag in zip(pdf_sections, pdf_tags):
    print(tag, '\n', txt, end='\n\n')

UNDEFINED 
 Summary

ABSTRACT 
 Vertebrate members of the nuclear receptor NR5A sub-
family, which includes steroidogenic factor 1 (SF-1) and
liver receptor homolog 1 (LRH-1), regulate crucial as-
pects of development, endocrine homeostasis, and
metabolism. Mouse LRH-1 is believed to be a ligand-
independent transcription factor with a large and
empty hydrophobic pocket. Here we present struc-
tural and biochemical data for three other NR5A mem-
bers—mouse and human SF-1 and human LRH-1—
which reveal that these receptors bind phosphatidyl
inositol second messengers and that ligand binding
is required for maximal activity. Evolutionary analysis
of structure-function relationships across the SF-1/
LRH-1 subfamily indicates that ligand binding is the
ancestral state of NR5A receptors and was uniquely
diminished or altered in the rodent LRH-1 lineage. We
propose that phospholipids regulate gene expression
by directly binding to NR5A nuclear receptors.

INTRODUCTION 
 Introduction

INTRODUC

In [None]:
# Only `path_to_data` is imported from config (the `config` module itself is not), so use it directly.
pdf_text = extract_text(os.path.join(path_to_data, 'pdf', '002dea70291555c99aa2d88b87db7e8d3ccad66a.pdf'))

def process_pdf(pdf_text):
    """Extract abstract, introduction, discussion and a catch-all text from raw PDF text.

    Each part is obtained with `find_section`, called with a start marker, a list of
    guessed next headings and a `max_chars` limit.
    NOTE(review): `find_section` is not defined in the visible part of this notebook.

    Returns a 4-tuple ``(abstract_text, introduction_text, discussion_text, all_text)``.
    """
    # (start marker, guessed next headings, max_chars) for each part, in output order
    section_specs = (
        ("abstract", ["\n\n"], 3000),
        ("introduction", ["methods", "methodology", "materials and methods", "results", "discussion"], 5000),
        ("discussion", ["conclusion", "references", "acknowledgements"], 5000),
        (" ", ["conclusion", "references", "acknowledgements"], 5000),
    )
    return tuple(
        find_section(pdf_text, start_marker, next_headings, max_chars=char_limit)
        for start_marker, next_headings, char_limit in section_specs
    )

# Run the section finder on the text extracted above and unpack its four parts.
abstract_text, introduction_text, discussion_text, all_text = process_pdf(pdf_text)

In [620]:
# Inspect the extraction for one paper of the loaded metadata table.
idx = 19
print(metadata['authorName'].iloc[idx])
# Updated to the current extract_pdf_sections() API: `path_to_data` is the imported name,
# `paperId` is a required argument and the abstract is stored under 'pdf_abstract'.
sections, text_blocks, tag_blocks = extract_pdf_sections(
    os.path.join(path_to_data, 'pdf', metadata['paperId'].iloc[idx] + '.pdf'),
    metadata['authorName'].iloc[idx],
    metadata['paperId'].iloc[idx],
)
print(metadata['paperId'].iloc[idx])
# [print(tag + '\n', txt, end='\n\n') for txt, tag in zip(text_blocks, tag_blocks)];

print(sections['pdf_abstract'])



L. Poirel,M. Lartigue,J. Decousser,P. Nordmann
0016746ad49041402c38da533049ebe0fe9539d1
Several expanded-spectrum -lactamase blaCTX-M genes are associated with ISEcp1-like elements in Enter-
obacteriaceae. We found that ISEcp1B was able to mobilize the adjacent blaCTX-M-19 gene by a transpositional
mechanism in Escherichia coli by recognizing a variety of DNA sequences as right inverted repeats.


In [594]:
# extract_pdf_sections() stores the full extracted text under 'pdf_text' (there is no 'all' key).
print(sections['pdf_text'])

Abstract Purpose: Synovial sarcoma is a soft tissue sarcoma, the growth regulatory mechanisms of which
are unknown.We investigated the involvement of fibroblast growth factor (FGF) signals in synovial
sarcoma and evaluated the therapeutic effect of inhibiting the FGF signal.
Experimental Design:The expression of 22 FGF and 4 FGF receptor (FGFR) genes in18 prima-
ry tumors and five cell lines of synovial sarcoma were analyzed by reverse transcription-PCR.
Effects of recombinant FGF2, FGF8, and FGF18 for the activation of mitogen-activated protein
kinase (MAPK) and the growth of synovial sarcoma cell lines were analyzed. Growth inhibitory
effects of FGFR inhibitors on synovial sarcoma cell lines were investigated in vitro and in vivo.
Results: Synovial sarcoma cell lines expressed multiple FGF genes especially those expressed
in neural tissues, among which FGF8 showed growth stimulatory effects in all synovial sarcoma
cell lines. FGF signals in synovial sarcoma induced the phosphorylatio