Import pandas and publication metadata fetching function

In [1]:
import pandas as pd
from request_functions import pub_met

Read queries

In [2]:
# Load the text of the first search query (passed to the 'pubmed' search below).
# The `with` block closes the file even if read() raises, which the manual
# open()/close() pair did not guarantee.
with open('query_1.txt', 'r') as q1_txt:
    query_1 = q1_txt.read()

In [3]:
# Load the text of the second search query (passed to the 'pmc' search below).
# The `with` block closes the file even if read() raises, which the manual
# open()/close() pair did not guarantee.
with open('query_2.txt', 'r') as q2_txt:
    query_2 = q2_txt.read()

Call the publication-metadata fetching function for PubMed and PMC, with a cut-off date at the end of March 2023

In [4]:
# Fetch publication metadata for query_1 from the 'pubmed' database.
# '2023/03/31' is the cut-off date described in the notebook text above.
# NOTE(review): the meaning of the final argument (100) is defined inside
# pub_met and is not visible here -- presumably a batch/limit size; confirm.
pmd_df = pub_met(query_1, 'pubmed', '2023/03/31', 100)

In [5]:
# Fetch publication metadata for query_2 from the 'pmc' database, using the
# same '2023/03/31' cut-off date as the PubMed call.
# NOTE(review): the meaning of the final argument (400) is defined inside
# pub_met and is not visible here -- presumably a batch/limit size; confirm.
pmc_df = pub_met(query_2, 'pmc', '2023/03/31', 400)

Rename the PMC metadata columns so that they match the names of the corresponding PubMed columns <br>
When the two dataframes are concatenated, corresponding fields will then end up in the same columns

In [6]:
# New labels for the PMC table. The assignment is positional, so this list
# must contain exactly one name per existing column, in the order in which
# pub_met returned them (NOTE(review): that order is not visible here --
# confirm against pub_met's PMC output).
pmc_column_names = [
    'uid', 'pubdate', 'epubdate', 'printpubdate', 'source', 'authors',
    'title', 'volume', 'issue', 'pages', 'articleids', 'fulljournalname',
    'sortpubdate', 'pmclivedate', 'pubmed', 'doi', 'pmc', 'mid',
]
pmc_df.columns = pmc_column_names

Save intermediate metadata tables to output folder

In [7]:
# Write each per-database table to its own CSV file in the output folder
# before the two are merged.
for frame, csv_path in (
    (pmd_df, '../output/pmd_result.csv'),
    (pmc_df, '../output/pmc_result.csv'),
):
    frame.to_csv(csv_path)

Concatenate the two tables and remove duplicate publications by titles and IDs

In [8]:
# Stack the PubMed rows on top of the PMC rows; ignore_index=True renumbers
# the combined rows 0..n-1 so that row labels are unique.
both_df = pd.concat([pmd_df, pmc_df], ignore_index=True)

In [9]:
# Replace missing values with empty strings: the cells below detect absent
# IDs by comparing against '' and build links by string concatenation.
both_df = both_df.fillna('')

In [10]:
# Give every missing ID a unique placeholder of the form 'none_<row label>',
# so that de-duplicating on an ID column does not treat rows that merely lack
# that ID as duplicates of one another.
# Done column-wise instead of row by row: the previous loop read values by
# position (iloc) but wrote them by label (at), and built a full row Series
# for every single lookup. Row labels are 0..n-1 here (the index was reset
# when the tables were concatenated), so the placeholders are unchanged.
placeholder_ids = 'none_' + both_df.index.to_series().astype(str)
for id_column in ['pubmed', 'pmc', 'doi']:
    # Keep existing IDs; substitute the row's placeholder where the ID is ''.
    both_df[id_column] = both_df[id_column].where(
        both_df[id_column] != '', placeholder_ids
    )

In [11]:
# Remove repeated publications one key at a time, keeping the first
# occurrence of each title / PubMed ID / PMC ID / DOI.
# Rows whose key is empty are never counted as duplicates: missing values
# were replaced by '' earlier, so without this exemption every record that
# lacks a title would be collapsed into a single row.
for key in ['title', 'pubmed', 'pmc', 'doi']:
    is_repeat = both_df.duplicated(key) & (both_df[key] != '')
    both_df = both_df[~is_repeat]
# Renumber rows 0..n-1 so later cells can address rows by position.
both_df = both_df.reset_index(drop=True)

In [12]:
# Turn the temporary 'none_<n>' placeholders back into empty strings now that
# de-duplication is finished.
# Only values that consist entirely of 'none_' followed by digits are
# cleared. The previous substring test ('none_' in value) would also have
# wiped a genuine identifier that merely contains that text, and raised a
# TypeError on any non-string value.
for id_column in ['pubmed', 'pmc', 'doi']:
    is_placeholder = (
        both_df[id_column].astype(str).str.fullmatch(r'none_\d+').astype(bool)
    )
    both_df[id_column] = both_df[id_column].where(~is_placeholder, '')

Keep only the relevant columns and give them human-readable names

In [13]:
# Keep only the fields wanted in the final table, in output order.
kept_columns = [
    'pubdate',
    'authors',
    'title',
    'fulljournalname',
    'pubmed',
    'doi',
    'pmc',
]
both_df = both_df[kept_columns]

In [14]:
# Human-readable headers, assigned by position: the order must match the
# column selection made in the previous cell
# (pubdate, authors, title, fulljournalname, pubmed, doi, pmc).
readable_names = ['Date', 'Author', 'Title', 'Publication Title', 'PMID', 'DOI', 'PMCID']
both_df.columns = readable_names

Create link attachments using available publication IDs <br>
Parse publication authors

In [15]:
def _publication_link(pmcid, pmid, doi):
    """Return a URL for one publication, or '' when it has no usable ID.

    The first non-empty identifier wins, in the order PMCID, PMID, DOI.
    Returning '' when all three are empty avoids emitting the bare
    'https://doi.org/' prefix as if it were a link to the publication.
    """
    if pmcid != '':
        return 'https://www.ncbi.nlm.nih.gov/pmc/articles/' + pmcid
    if pmid != '':
        return 'https://pubmed.ncbi.nlm.nih.gov/' + pmid
    if doi != '':
        return 'https://doi.org/' + doi
    return ''


def _author_names(authors):
    """Join the 'name' entry of each author record into 'A, B, C'.

    A value that is already a string is returned unchanged. That covers the
    '' used for missing author lists and makes re-running this cell harmless
    (previously a second run raised TypeError while indexing into a string).
    """
    if isinstance(authors, str):
        return authors
    return ', '.join(author['name'] for author in authors)


# Build both columns in one pass each instead of mixing positional reads
# (iloc) with label-based writes (at) inside a row-by-row loop.
both_df['Link Attachments'] = [
    _publication_link(pmcid, pmid, doi)
    for pmcid, pmid, doi in zip(both_df['PMCID'], both_df['PMID'], both_df['DOI'])
]
both_df['Author'] = [_author_names(authors) for authors in both_df['Author']]

Save resulting metadata table to output folder

In [16]:
# Write the merged, de-duplicated table to the output folder. No index=False
# is passed, so pandas' default of also writing the row index applies.
both_df.to_csv('../output/merged_result.csv')