In [3]:
import pandas as pd
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and pandas warnings
# are hidden as well — consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

In [4]:
# Load the two processed bibliographic exports (MEDLINE/PubMed and Scopus).
MEDLINE_PARQUET = "../data/processed/medline/medline_data_processed.parquet"
SCOPUS_PARQUET = "../data/processed/scopus/scopus_data_processed.parquet"

df_pubmed, df_scopus = map(pd.read_parquet, (MEDLINE_PARQUET, SCOPUS_PARQUET))

In [4]:
def _pmid_doi_lookup(df):
    """Build a PubMed-ID -> DOI lookup from the rows of ``df`` that have both.

    Args:
        df: DataFrame with ``doi`` and ``pubmed_id`` columns.

    Returns:
        Tuple ``(complete_rows, lookup)``. ``complete_rows`` is an independent
        copy of the rows where neither column is null, with ``pubmed_id`` cast
        to ``int``. ``lookup`` maps each integer PubMed ID to its DOI; when a
        PubMed ID occurs on several rows, the last row wins.
    """
    # .copy() so the column assignment below writes to a standalone frame
    # instead of a possible view of ``df`` (chained-assignment pitfall).
    complete_rows = df.dropna(subset=["doi", "pubmed_id"]).copy()
    complete_rows["pubmed_id"] = complete_rows["pubmed_id"].astype(int)
    lookup = complete_rows.set_index("pubmed_id")["doi"].to_dict()
    return complete_rows, lookup


# Dictionaries keyed by PubMed ID (int) with the DOI as value.
pubmed_no_nans, pubmed_dict = _pmid_doi_lookup(df_pubmed)
scopus_no_nans, scopus_dict = _pmid_doi_lookup(df_scopus)

# DOIs present in both sources (exact, case-sensitive string comparison).
common_dois = set(pubmed_dict.values()) & set(scopus_dict.values())

# PubMed IDs present in both sources.
common_pmid = set(pubmed_dict) & set(scopus_dict)

print(f"Number of common DOIs: {len(common_dois)}")
print(f"Number of common PubMed IDs: {len(common_pmid)}")

Number of common DOIs: 11
Number of common PubMed IDs: 0


# PubMed


In [5]:
def _fill_doi_from_lookup(df, pmid_to_doi):
    """Return ``df["doi"]`` with missing values filled from ``pmid_to_doi``.

    The PubMed ID of every row is first normalised to a number, so the lookup
    succeeds whether the column holds ints, floats (``123.0``) or digit
    strings (``"123"``); the lookup dictionaries are keyed by ``int``.

    Args:
        df: DataFrame with ``doi`` and ``pubmed_id`` columns.
        pmid_to_doi: dict mapping integer PubMed IDs to DOIs.

    Returns:
        A Series aligned with ``df`` in which an existing DOI is never
        overwritten and a missing DOI is replaced only when the row's PubMed
        ID is found in ``pmid_to_doi``.
    """
    # Unparseable or missing IDs become NaN and are skipped below.
    pmids = pd.to_numeric(df["pubmed_id"], errors="coerce")
    looked_up = pmids.map(
        lambda pmid: pmid_to_doi.get(int(pmid)) if pd.notna(pmid) else None
    )
    fillable = df["doi"].isna() & looked_up.notna()
    return df["doi"].mask(fillable, looked_up)


# Fill missing DOIs in df_pubmed using the mapping from df_scopus
pm_nodoi = df_pubmed["doi"].isna().sum()

df_pubmed["doi"] = _fill_doi_from_lookup(df_pubmed, scopus_dict)

pm_nodoi_after = df_pubmed["doi"].isna().sum()
print(
    f"The number of missing DOIs in df_pubmed decreased from {pm_nodoi} to {pm_nodoi_after}"
)

# Fill missing DOIs in df_scopus using the mapping from df_pubmed
sc_nodoi = df_scopus["doi"].isna().sum()

df_scopus["doi"] = _fill_doi_from_lookup(df_scopus, pubmed_dict)

sc_nodoi_after = df_scopus["doi"].isna().sum()
print(
    f"The number of missing DOIs in df_scopus decreased from {sc_nodoi} to {sc_nodoi_after}"
)

The number of missing DOIs in df_pubmed decreased from 1341 to 1341
The number of missing DOIs in df_scopus decreased from 4463 to 4463


In [8]:
from src.data.DoiFetcherFromPMID import DoiFetcherFromPMID

For records that have a PMID but no DOI, use the DOI fetcher to look up the DOI.


In [7]:
# Look up DOIs online for PubMed records that still lack one.
doi_fetcher = DoiFetcherFromPMID()

pm_nodoi_prior = df_pubmed["doi"].isna().sum()

# Only rows without a DOI *and* with a PubMed ID can be resolved; a null ID
# would otherwise be sent to the remote service.
pm_needs_doi = df_pubmed["doi"].isna() & df_pubmed["pubmed_id"].notna()
pm_pmids = df_pubmed.loc[pm_needs_doi, "pubmed_id"]

# Query each distinct PMID once so repeated IDs do not trigger extra requests.
# NOTE(review): fetch_doi presumably returns a null value when no DOI is
# found — confirm against DoiFetcherFromPMID.
pm_fetched = {
    pmid: doi_fetcher.fetch_doi(pmid) for pmid in pm_pmids.drop_duplicates().tolist()
}

# .to_numpy() makes the assignment positional, independent of index labels.
df_pubmed.loc[pm_needs_doi, "doi"] = pm_pmids.map(pm_fetched).to_numpy()

pm_nodoi_after = df_pubmed["doi"].isna().sum()

print(
    f"The number of missing DOIs in df_pubmed decreased from {pm_nodoi_prior} to {pm_nodoi_after}"
)

The number of missing DOIs in df_pubmed decreased from 1341 to 456


In [8]:
# Persist the PubMed frame, including the DOIs added above, to a new file.
PUBMED_DOIS_FETCHED_PARQUET = (
    "../data/processed/medline/medline_data_processed_doisfetched.parquet"
)
df_pubmed.to_parquet(PUBMED_DOIS_FETCHED_PARQUET)

# Scopus


In [9]:
# Render the Scopus PubMed IDs as text. Missing values become the literal
# string "nan" after astype(str).
df_scopus["pubmed_id"] = df_scopus["pubmed_id"].astype(str)
# Strip only a trailing ".0" left over from float formatting. The pattern is
# anchored and regex is set explicitly: an unanchored ".0" removes matches in
# the middle of an ID, and under regex semantics (the default before pandas
# 2.0) "." matches any character, e.g. "12030.0" would collapse to "1".
df_scopus["pubmed_id"] = df_scopus["pubmed_id"].str.replace(
    r"\.0$", "", regex=True
)

In [10]:
# same for scopus
# Look up DOIs online for Scopus records that still lack one.
doi_fetcher = DoiFetcherFromPMID()

sc_nodoi_prior = df_scopus["doi"].isna().sum()

# pubmed_id is expected to be text here, so a missing ID shows up as a
# placeholder string such as "nan"; real nulls are excluded as well.
sc_pmid_text = df_scopus["pubmed_id"]
sc_has_pmid = sc_pmid_text.notna() & ~sc_pmid_text.isin(["nan", "None", "<NA>", ""])
sc_needs_doi = df_scopus["doi"].isna() & sc_has_pmid
sc_pmids = sc_pmid_text[sc_needs_doi]

# Query each distinct PMID once: fewer requests means less exposure to the
# remote service's rate limiting.
# NOTE(review): fetch_doi presumably returns a null value when no DOI is
# found — confirm against DoiFetcherFromPMID.
sc_fetched = {
    pmid: doi_fetcher.fetch_doi(pmid) for pmid in sc_pmids.drop_duplicates().tolist()
}

# .to_numpy() makes the assignment positional, independent of index labels.
df_scopus.loc[sc_needs_doi, "doi"] = sc_pmids.map(sc_fetched).to_numpy()

sc_nodoi_after = df_scopus["doi"].isna().sum()

print(
    f"The number of missing DOIs in df_scopus decreased from {sc_nodoi_prior} to {sc_nodoi_after}"
)

ERROR:root:HTTP error fetching DOI from E-utilities for PMID 8220653: 429 Client Error: Too Many Requests for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=8220653&retmode=xml
ERROR:root:Error fetching DOI from pmidcite for PMID 1998131803: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/1998131803


ERROR:root:Error fetching DOI from pmidcite for PMID 2001143319: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2001143319


Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



ERROR:root:Error fetching DOI from pmidcite for PMID 2002150357: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2002150357


ERROR:root:Error fetching DOI from pmidcite for PMID 2002163431: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2002163431


ERROR:root:Error fetching DOI from pmidcite for PMID 2002163426: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2002163426


ERROR:root:HTTP error fetching DOI from E-utilities for PMID 12429073: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
ERROR:root:HTTP error fetching DOI from E-utilities for PMID 12955904: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
ERROR:root:Error fetching DOI from pmidcite for PMID 15055748: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



ERROR:root:Error fetching DOI from pmidcite for PMID 2003051576: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2003051576


ERROR:root:Error fetching DOI from pmidcite for PMID 14608242: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', OSError(9, 'Bad file descriptor'))



ERROR:root:Error fetching DOI from pmidcite for PMID 15160261: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', OSError(9, 'Bad file descriptor'))



ERROR:root:Error fetching DOI from pmidcite for PMID 2005026906: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2005026906


ERROR:root:HTTP error fetching DOI from E-utilities for PMID 15075045: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
ERROR:root:Error fetching DOI from pmidcite for PMID 2004154443: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2004154443


ERROR:root:Error fetching DOI from pmidcite for PMID 2004082844: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2004082844


ERROR:root:Error fetching DOI from pmidcite for PMID 2004137986: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2004137986


ERROR:root:Error fetching DOI from pmidcite for PMID 2004200763: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2004200763


ERROR:root:Error fetching DOI from pmidcite for PMID 15083699: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



ERROR:root:HTTP error fetching DOI from E-utilities for PMID 15944745: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
ERROR:root:Error fetching DOI from pmidcite for PMID 2005095290: 'NoneType' object has no attribute 'dct'


404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2005095290


Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

404 Not Found URL[44]: https://icite.od.nih.gov/api/pubs/2006111347


Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

**ERROR: ConnectionError = ('Connection aborted.', OSError(9, 'Bad file descriptor'))



ERROR:root:Error fetching DOI from pmidcite for PMID 19035066: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



ERROR:root:HTTP error fetching DOI from E-utilities for PMID 19112394: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
ERROR:root:Error fetching DOI from pmidcite for PMID 18392071: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



ERROR:root:HTTP error fetching DOI from E-utilities for PMID 23690183: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
ERROR:root:Error fetching DOI from pmidcite for PMID 24434345: 'NoneType' object has no attribute 'dct'


**ERROR: ConnectionError = ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



Traceback (most recent call last):
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jlq293/Projects/Study-1-Bibliometrics/.conda/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=serve

The number of missing DOIs in df_scopus decreased from 4463 to 4043


In [11]:
# save both dataframes
# Persist the Scopus frame, including the DOIs added above, to a new file.
SCOPUS_DOIS_FETCHED_PARQUET = (
    "../data/processed/scopus/scopus_data_processed_doisfetched.parquet"
)
df_scopus.to_parquet(SCOPUS_DOIS_FETCHED_PARQUET)