## Acessing Pub Med data

First check to see if a library has a 

Libraries

In [1]:
import requests
import pandas as pd
import pickle



In [2]:

def search_pubmed(keyword):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": keyword,
        "retmode": "json"
    }
    
    response = requests.get(base_url, params=params)
    data = response.json()
    
    # Check if 'idlist' exists in the response
    pmids = data["esearchresult"].get("idlist", [])
    return pmids

def fetch_details(pmids):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=params)
    return response.text

def extract_details_from_xml(xml_data):
    import xml.etree.ElementTree as ET
    root = ET.fromstring(xml_data)
    articles = []
    for article in root.findall(".//PubmedArticle"):
        title = article.find(".//ArticleTitle").text
        abstract = article.find(".//AbstractText")
        authors = article.findall(".//Author")
        pmid = article.find(".//PMID").text  # Extracting the PMID
        link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"  # Constructing the link
        author_list = []
        for author in authors:
            lastname = author.find("LastName")
            forename = author.find("ForeName")
            if lastname is not None and forename is not None:
                author_list.append(f"{forename.text} {lastname.text}")
        articles.append({
            "title": title,
            "abstract": abstract.text if abstract is not None else None,
            "authors": ", ".join(author_list),
            "link": link  # Adding the link to the dictionary
        })
    return articles




In [3]:
country_list = pickle.load(open("arrays.pkl", "rb"))
country_list

[['Papua New Guinea',
  'Singapore',
  'Sao Tome and Principe',
  'Czechia',
  'Djibouti',
  'Sri Lanka',
  'Burkina Faso',
  'Brunei Darussalam',
  'British Caribbean Territories - Montserrat'],
 ['Nicaragua',
  'Brazil',
  'Kuwait',
  'Ireland',
  'Poland',
  'Pakistan',
  'Bolivia (Plurinational State of)',
  'North Macedonia',
  'Tuvalu'],
 ['Morocco',
  'Kiribati',
  'Seychelles',
  'Uzbekistan',
  'Egypt',
  'Curaçao and Sint Maarten',
  "Democratic People's Republic of Korea",
  'Chad',
  'Azerbaijan'],
 ['Chile',
  'Zambia',
  'British Caribbean Territories - Anguilla',
  'Latvia',
  'Panama',
  'Ukraine',
  'Macao, China',
  'Antigua and Barbuda',
  'Saint Lucia'],
 ['Cameroon',
  'Croatia',
  'Hong Kong, China',
  'British Caribbean Territories',
  'Malta',
  'Kazakhstan',
  'United States of America',
  'Norway',
  'Madagascar'],
 ['Georgia',
  'Thailand',
  'Cambodia',
  'Guyana',
  'Afghanistan',
  'Bangladesh',
  'Iraq',
  'Slovakia',
  'Costa Rica'],
 ['Ghana',
  'Algeri

In [4]:
df_collector = pd.DataFrame()

for list in country_list:
    df_data = []
    for country in list:
        # print(country)
        keyword = f"(\"heatwave warning system\" OR \"heat alert system\" OR \"heatwave alert system\" OR \"heatwave response system\") AND \"{country}\""
        pmids = search_pubmed(keyword)
        
        # Insert the code here to print countries with no results
        if not pmids:
            print(f"No results for: {country}")  # This line is the new addition
            df_data.append({
                "country": country,
                "title": "Not found",
                "authors": "Not found",
                "abstract": "Not found",
                "link": "Not found"
            })
        else:
            xml_data = fetch_details(pmids)
            articles = extract_details_from_xml(xml_data)
            for article in articles:
                df_data.append({
                    "country": country,
                    "title": article["title"],
                    "authors": article["authors"],
                    "abstract": article["abstract"],
                    "link": article["link"]  # Assuming you've added the link extraction in the extract_details_from_xml function
                })
        df = pd.DataFrame(df_data)
        df_collector = pd.concat([df_collector, df], ignore_index=True)



No results for: Papua New Guinea
No results for: Sao Tome and Principe
No results for: Czechia
No results for: Djibouti
No results for: Burkina Faso
No results for: Brunei Darussalam
No results for: Nicaragua
No results for: Bolivia (Plurinational State of)
No results for: North Macedonia
No results for: Tuvalu
No results for: Morocco
No results for: Kiribati
No results for: Seychelles
No results for: Uzbekistan
No results for: Curaçao and Sint Maarten
No results for: Democratic People's Republic of Korea
No results for: Chad
No results for: Azerbaijan
No results for: Zambia
No results for: British Caribbean Territories - Anguilla
No results for: Latvia
No results for: Panama
No results for: Macao, China
No results for: Antigua and Barbuda
No results for: Saint Lucia
No results for: Cameroon
No results for: Hong Kong, China
No results for: Madagascar
No results for: Cambodia
No results for: Guyana
No results for: Afghanistan
No results for: Slovakia
No results for: Côte d'Ivoire
No res

In [5]:
# df_collector = pd.DataFrame()

# for list in country_list:
#     df_data = []
#     for country in list:
#         print(country)
#         keyword = f"(\"heatwave warning system\" OR \"heat health warning system\") AND \"{country}\""
#         pmids = search_pubmed(keyword)
#         if not pmids:  # If pmids is empty
#             df_data.append({
#                 "country": country,
#                 "title": "Not found",
#                 "authors": "Not found",
#                 "abstract": "Not found",
#                 "link": "Not found"
#             })
#         else:
#             xml_data = fetch_details(pmids)
#             articles = extract_details_from_xml(xml_data)
#             for article in articles:
#                 df_data.append({
#                     "country": country,
#                     "title": article["title"],
#                     "authors": article["authors"],
#                     "abstract": article["abstract"],
#                     "link": article["link"]  # Assuming you've added the link extraction in the extract_details_from_xml function
#                 })
#         df = pd.DataFrame(df_data)
#         df_collector = pd.concat([df_collector, df], ignore_index=True)

# df_collector.head()

In [6]:
country_variations = {
    "United States of America": ["USA", "United States", "U.S.", "U.S.A."],
    "United Kingdom of Great Britain and Northern Ireland": ["UK", "Britain", "Great Britain", "U.K.", "United Kingdom"],
    "United Arab Emirates": ["UAE", "U.A.E."],
    "Russian Federation": ["Russia"],
    "Republic of Korea": ["South Korea"],
    "Democratic People's Republic of Korea": ["North Korea"],
    "Bolivia (Plurinational State of)": ["Bolivia"],
    "Iran (Islamic Republic of)": ["Iran"],
    "Venezuela, Bolivarian Republic of": ["Venezuela"],
    "Taiwan, Province of China": ["Taiwan"],
    # ... add more as needed
}

In [7]:
# Step 1: Iterate through the df_collector dataframe
for index, row in df_collector.iterrows():
    # Step 2: Check if the country is a key in country_variations and its status is "Not found"
    if row["country"] in country_variations and row["title"] == "Not found":
        print(f"Trying to find alternative names for {row['country']}")
        # Step 3: Use the alternative names to search for articles
        for variation in country_variations[row["country"]]:
            print(f"Trying variation: {variation}")
            keyword = f"(\"heatwave warning system\" OR \"heat health warning system\") AND \"{variation}\""
            pmids = search_pubmed(keyword)
            
            if pmids:
                xml_data = fetch_details(pmids)
                articles = extract_details_from_xml(xml_data)
                if articles:
                    # Step 4: Update the dataframe with the details of the found article
                    df_collector.at[index, "title"] = articles[0]["title"]
                    df_collector.at[index, "authors"] = articles[0]["authors"]
                    df_collector.at[index, "abstract"] = articles[0]["abstract"]
                    df_collector.at[index, "link"] = articles[0]["link"]  # Adding the link
                    break  # Exit the loop once an article is found

# Step 5: If no articles are found after trying all variations, it will keep the "Not found" status in the dataframe

print(df_collector)

Trying to find alternative names for Bolivia (Plurinational State of)
Trying variation: Bolivia
Trying to find alternative names for Bolivia (Plurinational State of)
Trying variation: Bolivia
Trying to find alternative names for Bolivia (Plurinational State of)
Trying variation: Bolivia
Trying to find alternative names for Democratic People's Republic of Korea
Trying variation: North Korea
Trying to find alternative names for Democratic People's Republic of Korea
Trying variation: North Korea
Trying to find alternative names for Democratic People's Republic of Korea
Trying variation: North Korea
Trying to find alternative names for Venezuela, Bolivarian Republic of
Trying variation: Venezuela
Trying to find alternative names for Venezuela, Bolivarian Republic of
Trying variation: Venezuela
Trying to find alternative names for Russian Federation
Trying variation: Russia
Trying to find alternative names for Russian Federation
Trying variation: Russia
Trying to find alternative names for 

In [8]:
df_collector[df_collector['abstract'] == "Not found"].nunique()

country     123
title         1
authors       1
abstract      1
link          1
dtype: int64

In [9]:
df_collector[df_collector['abstract'] != "Not found"].nunique()

country      75
title       120
authors     120
abstract    119
link        120
dtype: int64

we can create an x list shows the results 

In [10]:
# Save as CSV
df_collector.to_csv("pubmed.csv", index=False)

# Save as pickle
df_collector.to_pickle("pubmed.pkl")

In [11]:
len(df_collector)

1979