# Fetch PMIDs from Medline


In [1]:
from src.data.QueryMedlineForPMIDs import PMIDsFromQuery

In [2]:
email = "lukas.westphal@sund.ku.dk"
fetcher = PMIDsFromQuery(email)
pubmed_ids = fetcher.get_pubmed_ids_from_query()

Using default query:
('selective serotonin reuptake inhibitor'[Title/Abstract] OR ('ssri'[Title/Abstract] OR 'fluvoxamine'[Title/Abstract] OR 'fluoxetine'[Title/Abstract] OR 'citalopram'[Title/Abstract] OR 'paroxetine'[Title/Abstract] OR 'sertraline'[Title/Abstract] OR 'escitalopram'[Title/Abstract]) AND 1983/01/01:2022/12/31[Date - Publication] AND 'english'[Language] )
In the range 1983 to 1987, we found 453 articles. We fetched 453 articles.
In the range 1988 to 1992, we found 1464 articles. We fetched 1464 articles.
In the range 1993 to 1997, we found 2823 articles. We fetched 2823 articles.
In the range 1998 to 2002, we found 3876 articles. We fetched 3876 articles.
In the range 2003 to 2007, we found 4983 articles. We fetched 4983 articles.
In the range 2008 to 2012, we found 5594 articles. We fetched 5594 articles.
In the range 2013 to 2017, we found 6258 articles. We fetched 6258 articles.
In the range 2018 to 2022, we found 6509 articles. We fetched 6509 articles.
In the range

In [4]:
flow0_all_pmids_from_query = pubmed_ids
print("Number of PMIDs from query: ", len(flow0_all_pmids_from_query))

Number of PMIDs from query:  31306


#### save raw pmids


In [5]:
# save the list of pubmed ids
with open("../data/raw/medline/pubmed_ids.txt", "w") as f:
    f.write("\n".join(pubmed_ids))

# remove pmids already in Scopus data


In [6]:
import pandas as pd

In [8]:
df_scopus = pd.read_parquet("../data/processed/scopus/scopus_data_processed.parquet")

pmids_from_scopus = df_scopus["pubmed_id"].dropna().astype(int).astype(str).tolist()

print("Articles in Scopus:", len(df_scopus))
print("Articles in Medline:", len(pubmed_ids))

print(
    "Articles in Scopus with a PMID: ", len(df_scopus[df_scopus["pubmed_id"].notnull()])
)

Articles in Scopus: 37776
Articles in Medline: 31306
Articles in Scopus with a PMID:  29730


In [9]:
def intersection_difference_union(pmids_in_pubmed, pmids_in_scopus):
    print(f"Total number of articles in PubMed: {len(pmids_in_pubmed)}")
    # create sets
    pmids_in_pubmed_set = set(pmids_in_pubmed)
    pmids_in_scopus_set = set(pmids_in_scopus)
    print("Intersection (represented in both sets):")
    intersection_pmids = pmids_in_pubmed_set & pmids_in_scopus_set
    print(f"{len(intersection_pmids)}")
    print("Union (represented in either set):")
    union_pmids = pmids_in_pubmed_set | pmids_in_scopus_set
    print(f"{len(union_pmids)}")
    print("Difference (represented in one set but not the other):")
    pmids_in_scopus_only = pmids_in_scopus_set - pmids_in_pubmed_set
    print(f"Only in scopus: {len(pmids_in_scopus_only)}")
    pmids_in_pubmed_only = pmids_in_pubmed_set - pmids_in_scopus_set
    print(f"Only in pubmed: {len(pmids_in_pubmed_only)}")
    # print("returned those only in pubmed and the intersection.")

    return list(pmids_in_pubmed_only), list(intersection_pmids)

In [11]:
pmids_in_pubmed_only, intersection_pmids = intersection_difference_union(
    pubmed_ids, pmids_from_scopus
)

flow2_all_pmids_from_query_not_in_scopus = pmids_in_pubmed_only

Total number of articles in PubMed: 31306
Intersection (represented in both sets):
26515
Union (represented in either set):
34515
Difference (represented in one set but not the other):
Only in scopus: 3209
Only in pubmed: 4791


# Fetch article details from Medline


In [12]:
from src.data.QueryMedlineForArticleDetails import ArticleDetailsFromPMID

In [13]:
email = "lukas.westphal@sund.ku.dk"

article_details_fetcher = ArticleDetailsFromPMID(email)

article_details_fetcher.fetch_details_and_create_dataframe(pmids_in_pubmed_only)

Fetching details for 4791 articles in batches of 100...


100%|██████████| 48/48 [01:25<00:00,  1.78s/it]


Unnamed: 0,title,year,url,abstract,doi,journal,authors,keywords,pubmed_id,publication_type,raw_xml
0,A comparison of citalopram and paroxetine in t...,2001,https://pubmed.ncbi.nlm.nih.gov/11434404/,Serotonin Selective Re-uptake Inhibitors (SSRI...,,Pharmacopsychiatry,"[Perna, G, Bertani, A, Caldirola, D, Smeraldi,...",[],11434404,Clinical Trial,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
1,"Chronic schizophrenia: response to clozapine, ...",1997,https://pubmed.ncbi.nlm.nih.gov/9090343/,,,The American journal of psychiatry,"[Patel, J K, Salzman, C, Green, A I, Tsuang, M T]",[],9090343,Case Reports,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
2,Multigram-scale flow synthesis of the chiral k...,2019,https://pubmed.ncbi.nlm.nih.gov/32206263/,The catalytic enantioselective synthesis of th...,10.1039/c9sc04752b,Chemical science,"[Ötvös, Sándor B, Pericàs, Miquel A, Kappe, C ...",[],32206263,Journal Article,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
3,Preliminary trial of photic stimulation for pr...,1997,https://pubmed.ncbi.nlm.nih.gov/15511777/,"In an open study 17 women with confirmed, seve...",,Journal of obstetrics and gynaecology : the jo...,"[Anderson, D J, Legg, N J, Ridout, D A]",[],15511777,Journal Article,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
4,GSK is fined £38m for delaying generic paroxet...,2016,https://pubmed.ncbi.nlm.nih.gov/26873503/,,10.1136/bmj.i917,BMJ (Clinical research ed.),"[Dyer, Clare]",[],26873503,News,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
...,...,...,...,...,...,...,...,...,...,...,...
4712,The Effect of Selective Serotonin Releasing Ag...,1997,https://pubmed.ncbi.nlm.nih.gov/9787258/,Chronic exposure to mild unpredictable stress ...,,"Stress (Amsterdam, Netherlands)","[Marona-Lewicka, D, Nichols, DE]",[],9787258,Journal Article,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
4713,Electroacupuncture as a rapid-onset and safer ...,2022,https://pubmed.ncbi.nlm.nih.gov/36684018/,Electroacupuncture (EA) is a promising therapy...,10.3389/fpsyt.2022.1012606,Frontiers in psychiatry,"[Zhang, Zhinan, Cai, Xiaowen, Liang, Yuying, Z...","[depression, electroacupuncture, meta-analysis...",36684018,Systematic Review,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
4714,Comparison of sertraline to fluoxetine with re...,1999,https://pubmed.ncbi.nlm.nih.gov/10633973/,"In the present study, two different serotonin ...",,Archivos espanoles de urologia,"[Murat Başar, M, Atan, A, Yildiz, M, Baykam, M...",[],10633973,Clinical Trial,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."
4715,Predicting Treatment Outcome in Major Depressi...,2020,https://pubmed.ncbi.nlm.nih.gov/32792991/,Between 30 and 50% of patients with major depr...,10.3389/fpsyt.2020.00641,Frontiers in psychiatry,"[Köhler-Forsberg, Kristin, Jorgensen, Anders, ...","[biomarker, cognition, electroencephalogram, f...",32792991,Journal Article,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl..."


In [14]:
article_details_fetcher.clean_dataframe()
df_pubmed_clean = article_details_fetcher.df

Cleaning dataframe...
Removed on year 2023: 198
Document types removed: ['Bibliography', 'Lecture', 'News', 'Newspaper Article', 'Consensus Development Conference', 'Congress', 'Interview', 'Published Erratum', 'Biography', 'Preprint', 'Patient Education Handout']
Removed on document types: 41
New length: 4478


In [15]:
df_pubmed_clean[["pubmed_id", "raw_xml"]].to_parquet(
    "../data/raw/medline/pmid_raw_xml_mapping.parquet", index=False
)

In [18]:
# drop the raw xml column
df_pubmed_clean = df_pubmed_clean.drop(columns=["raw_xml"])

# save the dataframe
df_pubmed_clean.to_parquet("../data/processed/medline/medline_data_processed.parquet")