## Extract the reference list from the given paper PDF, and look for the referenced papers as well as their metadata on PubMed

### Extract the text from the paper's PDF and build the reference list from it

In [1]:
import os
import pandas as pd
import re
from pypdf import PdfReader

# Function to extract references from a PDF file using a context-aware pattern
def parse_references(pdf_path):
    """Extract the numbered reference list of a PDF as an Author/Title table.

    Pages are read in order. Starting with the first page that contains the
    word "References", entries shaped like ``<n>. <authors>. <title>.`` are
    collected, but only when ``<n>`` continues the sequence 1, 2, 3, ...;
    every other numbered match (years, page numbers, ...) is ignored.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A pandas DataFrame with the columns 'Author' and 'Title', one row per
        reference in order of appearance (row 0 holds reference 1). Newlines
        are replaced by spaces and surrounding whitespace is stripped. When no
        reference is found, an empty DataFrame with the same two columns is
        returned.
    """
    reader = PdfReader(pdf_path)
    # "<number>. <text containing at least one dot>." -- DOTALL lets an entry
    # span several lines. Compiled once instead of once per page.
    ref_pattern = re.compile(r'(\d+)\. (.*?\..*?)\.', re.DOTALL)
    refs = []
    ref_section = False  # Becomes True once the references section is reached
    last_ref_num = 0  # Number of the last reference that was accepted

    for page in reader.pages:
        # Extract text on a page by page basis (guard against pages without text)
        text = page.extract_text() or ''
        # When the "References" bit is found, cut out the text preceding it.
        # NOTE: this cut is applied on every page containing the word, not
        # only on the first one.
        if "References" in text:
            ref_section = True
            text = text.split("References", 1)[1]
        if not ref_section:
            continue
        for match in ref_pattern.finditer(text):
            current_ref_num = int(match.group(1))
            # Accept the entry only if its number directly follows the last one
            if current_ref_num == last_ref_num + 1:
                refs.append(match.group(2))
                last_ref_num = current_ref_num

    if not refs:
        # Nothing matched: return an empty table instead of failing on the split
        return pd.DataFrame(columns=['Author', 'Title'])

    # Use the first dot as column separator for Author and Title (as in the
    # PDF); the pattern guarantees every entry contains at least one dot.
    references = pd.Series(refs).str.split('.', n=1, expand=True)
    references.columns = ['Author', 'Title']
    # Replace newline characters with space and strip surrounding whitespace
    for column in references.columns:
        references[column] = (
            references[column].str.replace('\n', ' ', regex=False).str.strip()
        )
    return references

# Relative path of the paper whose reference list is parsed
pdf = os.path.join('..', 'paper', 's40798-019-0202-3.pdf')

# Parse the complete reference list out of the PDF
references_full = parse_references(pdf)

# Store the full list as CSV, leaving out the row index
paper_refs_csv = os.path.join('..', 'results', 'paper_refs.csv')
references_full.to_csv(paper_refs_csv, index=False)

references_full.head()

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...


#### Only include the studies that were relevant to the systematic review (i.e., exclude papers that were used as methodology references)

In [2]:
# The paper states that 58 studies were included in the systematic review:
# references [15, 20–76]. Positions below are zero-based, hence 14 and 19..75.
included_rows = [14]
included_rows.extend(range(19, 76))

# Keep only the rows of the studies included in the systematic review
references = references_full.iloc[included_rows]

refs_systematic_csv = os.path.join('..', 'results', 'refs_systematic.csv')
references.to_csv(refs_systematic_csv, index=False)

references.head()

Unnamed: 0,Author,Title
14,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...
19,Li C,Predict the neural network mathematical model ...
20,Lu G,Evaluation model of young basketball players ’...
21,Wu L,The participating team ’s technical analysis o...
22,Zhang Q,Prediction based on basketball competition vid...


### Query PubMed's database (through the Entrez API) to find the study IDs corresponding to our references

In [7]:
# https://medium.com/@felipe.odorcyk/scrapping-data-from-pubmed-database-78a9b53de8ca

from Bio import Entrez

def search_pubmed(query, email):
    """Search PubMed for `query` and return the single most relevant hit.

    Args:
        query: Search term passed to PubMed as-is.
        email: Contact address stored in `Entrez.email` before the request
            (this is a module-wide setting of Bio.Entrez).

    Returns:
        The parsed esearch result as produced by `Entrez.read`; the matching
        PubMed IDs (at most one, because retmax is '1') are under 'IdList'.
    """
    # The Pubmed API requires an email to be used
    Entrez.email = email
    # Set how the search should be performed
    handle = Entrez.esearch(
        db='pubmed',
        sort='relevance',
        retmax='1',
        retmode='xml',
        term=query,
    )
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails
        handle.close()

mazza = 'matteo.mazzarelli@gmail.com'

# Collect (reference row label, PubMed ID) pairs for every reference that
# yields a search hit; references without a hit are skipped
studyids = []
for ref_label, ref in references.iterrows():
    # Only the first author (text before the first comma) goes into the query
    first_author = ref['Author'].split(',')[0].strip()
    # Only the first five words of the title go into the query
    title_words = ref['Title'].split()[:5]
    title_start = ' '.join(title_words)

    # Run the search with "<first author> <start of title>"
    hits = search_pubmed(f"{first_author} {title_start}", mazza)
    # Keep at most the first returned PubMed ID
    top_hit = hits['IdList'][:1]
    if not top_hit:
        continue
    studyids.append((ref_label, top_hit[0]))

studyids[:5]

[(14, '29283933'),
 (19, '37182376'),
 (23, '24993662'),
 (35, '28692649'),
 (37, '24034723')]

#### Using the study IDs we found, fetch the information we are looking for from PubMed's DB (in particular, abstracts)

In [10]:
def fetch_details(id_tuples, email):
    """Fetch the PubMed records for a list of (index, PubMed ID) tuples.

    Args:
        id_tuples: Iterable of 2-tuples whose second item is a PubMed ID
            string; the first item is ignored here.
        email: Contact address stored in `Entrez.email` before the request
            (this is a module-wide setting of Bio.Entrez).

    Returns:
        The parsed efetch result as produced by `Entrez.read`.
    """
    # Keep only the PubMed ID of each (index, PubMed ID) tuple
    id_list = [pubmed_id for _, pubmed_id in id_tuples]
    # Join the IDs with commas so all records are fetched in one request
    ids = ','.join(id_list)
    # The Pubmed API requires an email to be used
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails
        handle.close()

# Create a dictionary to quickly find the reference row label by PubMed ID
index_dict = {pubmed_id: index for index, pubmed_id in studyids}

# One entry per fetched article, kept in the order PubMed returns them
index_list = []
author_list = []
title_list = []
abstract_list = []
journal_list = []
year_list = []

papers = fetch_details(studyids, mazza)

for paper in papers['PubmedArticle']:
    citation = paper['MedlineCitation']
    article = citation['Article']

    pubmed_id = str(citation['PMID'])
    # Row label of the reference this article was found for; 'No Index' marks
    # an article whose PMID is not among the IDs we searched for
    index = index_dict.get(pubmed_id, 'No Index')
    index_list.append(index)

    # Records without an author list get an empty author string instead of
    # aborting the whole loop
    authors = article.get('AuthorList', [])
    author_names = []
    for author in authors:
        # Some authors might not have a LastName or Initial, handle these cases
        last_name = author.get('LastName', '')
        initials = author.get('Initials', '')
        author_names.append(f"{last_name} {initials}".strip())

    # Join all authors' names with commas
    all_authors = ', '.join(author_names)
    author_list.append(all_authors)

    title_list.append(article['ArticleTitle'])

    # Fetch abstract, journal & publication year. If not found, store ''.
    # AbstractText is a list of sections; keep all of them (a structured
    # abstract would otherwise be cut down to its first section).
    abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
    abstract_list.append(' '.join(str(part) for part in abstract_parts))

    journal = article.get('Journal', {})
    journal_list.append(journal.get('Title', ''))

    pub_date = journal.get('JournalIssue', {}).get('PubDate', {})
    year_list.append(pub_date.get('Year', ''))

# Create a pandas df with the results
abstracts = pd.DataFrame(
    list(zip(index_list, abstract_list, journal_list, year_list)),
    columns=['Ref #', 'Abstract', 'Journal', 'Year'],
)

abstracts.head()

Unnamed: 0,Ref #,Abstract,Journal,Year
0,14,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018
1,19,Network science provides a set of tools for th...,Physics of life reviews,2023
2,23,"The soaring amount of data, especially spatial...",European journal of sport science,2015
3,35,This paper describes models for detecting indi...,PloS one,2017
4,37,Cardiovascular autonomic neuropathy (CAN) is a...,Computers in biology and medicine,2013


#### Merge the references data frame with the data frame produced by scraping PubMed

In [11]:
# Left join: keep every reference of the systematic review and attach the
# PubMed fields of the row whose 'Ref #' equals the reference's row label.
# The helper column is dropped afterwards and the rows are renumbered from 0.
ref_abs = (
    references
    .merge(abstracts, how='left', left_index=True, right_on='Ref #')
    .drop(columns=['Ref #'])
    .reset_index(drop=True)
)

# Save it to csv for future use
refs_abstracts_csv = os.path.join('..', 'results', 'refs_abstracts_sys.csv')
ref_abs.to_csv(refs_abstracts_csv, index=False)

ref_abs.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0
1,Li C,Predict the neural network mathematical model ...,Network science provides a set of tools for th...,Physics of life reviews,2023.0
2,Lu G,Evaluation model of young basketball players ’...,,,
3,Wu L,The participating team ’s technical analysis o...,,,
4,Zhang Q,Prediction based on basketball competition vid...,,,
