<a href="https://colab.research.google.com/github/karegapauline/Analysis_papers_and_media_GS/blob/main/metaanalysis_papers_alone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
#pip install tqdm

In [None]:
from Bio import Entrez
from tqdm import tqdm
import pandas as pd
import time
import re

# --------------------------------------------------------------------------------
# 0. Entrez Setup
# --------------------------------------------------------------------------------
# Contact address that Biopython attaches to every Entrez request made below.
# NOTE(review): consider reading this address (and any API key) from an
# environment variable instead of committing it to the notebook — confirm
# with the notebook owner.
Entrez.email = "paulinenyakairu.karega@postgrad.manchester.ac.uk"   # Required by NCBI
# Optional: set an NCBI API key as well, e.g.
# Optional: Entrez.api_key = "YOUR_NCBI_API_KEY"

# --------------------------------------------------------------------------------
# 1. Build PubMed Query (STRICT: air AND health AND geography)
# --------------------------------------------------------------------------------
def build_pubmed_query(country: str) -> str:
    """Assemble the strict PubMed search string for one country.

    The query requires a hit in each of three title/abstract term groups
    (air pollution AND health outcome AND geography), restricted to
    English-language records published from 1960 through 2025.

    Args:
        country: Key into the built-in place-name table ("Kenya",
            "South Africa" or "UK"). Any other value is used verbatim as
            the only geographic term.

    Returns:
        The complete query string, ready to pass as an esearch ``term``.
    """
    pollutant_phrases = (
        "air quality", "air pollution", "particulate matter", "PM2.5", "PM10",
        "outdoor air", "indoor air pollution", "ambient air", "household air pollution",
        "airborne particles", "atmospheric pollution", "ozone pollution",
        "NO2", "SO2", "carbon monoxide", "volatile organic compounds", "urban air",
    )
    outcome_phrases = (
        "health effects", "public health", "respiratory health", "respiratory disease",
        "asthma", "lung disease", "cardiovascular disease", "environmental health",
        "mortality", "exposure assessment", "hospital admissions",
    )
    places_by_country = {
        "Kenya": ["Kenya", "Nairobi", "Mombasa", "Kisumu"],
        "South Africa": ["South Africa", "Johannesburg", "Cape Town", "Durban", "Pretoria"],
        "UK": ["United Kingdom", "UK", "England", "Scotland", "Wales", "London", "Manchester"],
    }

    def any_of(phrases):
        # One parenthesised OR-group of quoted title/abstract ([tiab]) terms.
        alternatives = [f'"{phrase}"[tiab]' for phrase in phrases]
        return "(" + " OR ".join(alternatives) + ")"

    # Unknown countries fall back to searching for the country name itself.
    place_phrases = places_by_country.get(country, [country])

    # Every clause is mandatory, so they are chained with AND.
    clauses = [
        any_of(pollutant_phrases),
        any_of(outcome_phrases),
        any_of(place_phrases),
        '("1960/01/01"[PDAT] : "2025/12/31"[PDAT])',
        "english[lang]",
    ]
    return " AND ".join(clauses)

# --------------------------------------------------------------------------------
# 2. Fetch PubMed Studies with Robust Handling
# --------------------------------------------------------------------------------
def _read_and_close(handle):
    """Parse an Entrez response and close its handle even if parsing fails."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _summary_to_record(summary, country: str) -> dict:
    """Flatten one ESummary document into the row layout used by the pipeline."""
    # Authors: keep the original "Authors" lookup (list of mappings with a
    # "Name" key) and fall back to "AuthorList", the plain list of name strings
    # that ESummary v1 docsums (Biopython's default) provide.
    authors = None
    if "Authors" in summary and summary["Authors"]:
        authors = "; ".join([a["Name"] for a in summary["Authors"] if "Name" in a])
    elif "AuthorList" in summary and summary["AuthorList"]:
        authors = "; ".join(str(name) for name in summary["AuthorList"])

    # Year: first four-digit run in the free-text publication date.
    year = None
    if "PubDate" in summary:
        year_match = re.search(r"\d{4}", summary["PubDate"])
        if year_match:
            year = year_match.group(0)

    return {
        "pmid": summary.get("Id"),
        "title": summary.get("Title"),
        "authors": authors,
        "year": year,
        "journal": summary.get("Source"),
        "doi": summary.get("DOI"),
        # NOTE(review): ESummary docsums do not appear to include an "Abstract"
        # field, so this column is probably always empty; retrieving abstracts
        # would need Entrez.efetch — confirm before relying on it.
        "abstract": summary.get("Abstract"),
        "source": "PubMed",
        "country": country,
    }


def _fetch_batch(query: str, country: str, start: int, batch_size: int) -> list:
    """Download one page of search hits and return their rows (may be empty)."""
    search_results = _read_and_close(
        Entrez.esearch(db="pubmed", term=query, retmax=batch_size, retstart=start)
    )
    ids = search_results.get("IdList", [])
    if not ids:
        # Nothing on this page: a valid, finished result rather than a failure.
        return []
    summaries = _read_and_close(Entrez.esummary(db="pubmed", id=",".join(ids)))
    return [_summary_to_record(summary, country) for summary in summaries]


def _drop_repeats_keep_missing(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Drop rows repeating a value in `column`; rows where it is missing all stay."""
    # duplicated() counts missing values as equal to each other, so mask them
    # out; otherwise every row lacking the value except the first would vanish.
    repeated = df[column].notna() & df.duplicated(subset=[column], keep="first")
    return df[~repeated]


def fetch_pubmed_studies(country: str, batch_size: int = 200) -> pd.DataFrame:
    """Download and clean PubMed summary records for one country.

    Runs the strict air/health/geography query for `country`, pages through
    the hits `batch_size` at a time, and returns one row per article.

    Args:
        country: Country label passed to `build_pubmed_query` and stored in
            the `country` column.
        batch_size: Number of records requested per esearch/esummary round trip.

    Returns:
        DataFrame with columns pmid, title, authors, year, journal, doi,
        abstract, source and country. Rows without a title or a year in
        1960-2025 are removed, malformed DOIs are blanked, and repeated
        PMIDs/DOIs are dropped (records lacking a DOI are all kept).
        An empty DataFrame is returned when the initial search fails or
        finds nothing.
    """
    print(f"\nFetching PubMed articles for: {country}")
    query = build_pubmed_query(country)
    print("Using query:", query)

    # First, get count
    try:
        record = _read_and_close(Entrez.esearch(db="pubmed", term=query, retmax=0))
        count = int(record["Count"])
    except Exception as e:
        print("Error in initial search:", str(e))
        return pd.DataFrame()

    if count == 0:
        print(f"No results found for {country}")
        return pd.DataFrame()

    print(f"Found {count} records")

    # Fetch in batches
    max_retries = 3
    all_records = []
    for start in tqdm(range(0, count, batch_size), desc=f"Downloading {country}"):
        time.sleep(0.3)  # be gentle with the API
        batch_records = None
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                # Rows are collected per batch and only merged after the whole
                # batch succeeded, so a retry can never duplicate partial work.
                batch_records = _fetch_batch(query, country, start, batch_size)
                break
            except Exception as e:  # network/parse errors vary; retry them all
                last_error = e
                if attempt < max_retries:
                    time.sleep(1)

        if batch_records is None:
            # Best effort: report the gap instead of silently losing the batch.
            print(
                f"Warning: skipped records {start}-{start + batch_size - 1} for {country} "
                f"after {max_retries} failed attempts: {last_error}"
            )
            continue
        all_records.extend(batch_records)

    df = pd.DataFrame(all_records)

    if not df.empty:
        # Clean up
        df["year"] = pd.to_numeric(df["year"], errors="coerce")
        # Blank anything that does not look like a DOI ("10.<registrant>/...").
        df.loc[~df["doi"].str.match(r"^10\.\d{4,9}", na=False), "doi"] = None
        df = df.dropna(subset=["title", "year"])
        # Same window as the [PDAT] range in the query.
        df = df[(df["year"] >= 1960) & (df["year"] <= 2025)]
        df = _drop_repeats_keep_missing(df, "pmid")
        df = _drop_repeats_keep_missing(df, "doi")

    return df

# --------------------------------------------------------------------------------
# 3. Run Pipeline for Kenya, South Africa, and UK
# --------------------------------------------------------------------------------
def run_pubmed_pipeline(countries=("Kenya", "South Africa", "UK")) -> pd.DataFrame:
    """Fetch PubMed records for each country and merge them into one table.

    Args:
        countries: Country labels to query, in order. Defaults to the three
            study countries (Kenya, South Africa, UK).

    Returns:
        All fetched rows concatenated, with surrounding whitespace stripped
        from text columns and sorted by `year`, newest first. An empty
        DataFrame is returned when no country yielded any rows.
    """
    per_country = [fetch_pubmed_studies(country) for country in countries]

    # Skip countries that returned nothing. If every fetch came back empty
    # there is no "year" column to sort on, so return early instead of
    # raising KeyError.
    non_empty = [frame for frame in per_country if not frame.empty]
    if not non_empty:
        return pd.DataFrame()

    combined = pd.concat(non_empty, ignore_index=True)

    # Trim stray whitespace in every text (object-dtype) column.
    for col in combined.select_dtypes(include="object"):
        combined[col] = combined[col].str.strip()

    combined = combined.sort_values(by="year", ascending=False)
    return combined

# --------------------------------------------------------------------------------
# 4. Execute and Save
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    # Entry point: download the records for every study country, then write
    # the merged table to a CSV file (relative path, so it lands in the
    # current working directory) and report how many rows were saved.
    # `final_data` is assigned at module level, so it remains available after
    # this block runs.
    final_data = run_pubmed_pipeline()
    final_data.to_csv("pubmed_air_quality_health.csv", index=False)
    print(f"✅ PubMed data collection complete! {len(final_data)} records saved to 'pubmed_air_quality_health.csv'")



Fetching PubMed articles for: Kenya
Using query: ("air quality"[tiab] OR "air pollution"[tiab] OR "particulate matter"[tiab] OR "PM2.5"[tiab] OR "PM10"[tiab] OR "outdoor air"[tiab] OR "indoor air pollution"[tiab] OR "ambient air"[tiab] OR "household air pollution"[tiab] OR "airborne particles"[tiab] OR "atmospheric pollution"[tiab] OR "ozone pollution"[tiab] OR "NO2"[tiab] OR "SO2"[tiab] OR "carbon monoxide"[tiab] OR "volatile organic compounds"[tiab] OR "urban air"[tiab]) AND ("health effects"[tiab] OR "public health"[tiab] OR "respiratory health"[tiab] OR "respiratory disease"[tiab] OR "asthma"[tiab] OR "lung disease"[tiab] OR "cardiovascular disease"[tiab] OR "environmental health"[tiab] OR "mortality"[tiab] OR "exposure assessment"[tiab] OR "hospital admissions"[tiab]) AND ("Kenya"[tiab] OR "Nairobi"[tiab] OR "Mombasa"[tiab] OR "Kisumu"[tiab]) AND ("1960/01/01"[PDAT] : "2025/12/31"[PDAT]) AND english[lang]
Found 51 records


Downloading Kenya: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]



Fetching PubMed articles for: South Africa
Using query: ("air quality"[tiab] OR "air pollution"[tiab] OR "particulate matter"[tiab] OR "PM2.5"[tiab] OR "PM10"[tiab] OR "outdoor air"[tiab] OR "indoor air pollution"[tiab] OR "ambient air"[tiab] OR "household air pollution"[tiab] OR "airborne particles"[tiab] OR "atmospheric pollution"[tiab] OR "ozone pollution"[tiab] OR "NO2"[tiab] OR "SO2"[tiab] OR "carbon monoxide"[tiab] OR "volatile organic compounds"[tiab] OR "urban air"[tiab]) AND ("health effects"[tiab] OR "public health"[tiab] OR "respiratory health"[tiab] OR "respiratory disease"[tiab] OR "asthma"[tiab] OR "lung disease"[tiab] OR "cardiovascular disease"[tiab] OR "environmental health"[tiab] OR "mortality"[tiab] OR "exposure assessment"[tiab] OR "hospital admissions"[tiab]) AND ("South Africa"[tiab] OR "Johannesburg"[tiab] OR "Cape Town"[tiab] OR "Durban"[tiab] OR "Pretoria"[tiab]) AND ("1960/01/01"[PDAT] : "2025/12/31"[PDAT]) AND english[lang]
Found 138 records


Downloading South Africa: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]



Fetching PubMed articles for: UK
Using query: ("air quality"[tiab] OR "air pollution"[tiab] OR "particulate matter"[tiab] OR "PM2.5"[tiab] OR "PM10"[tiab] OR "outdoor air"[tiab] OR "indoor air pollution"[tiab] OR "ambient air"[tiab] OR "household air pollution"[tiab] OR "airborne particles"[tiab] OR "atmospheric pollution"[tiab] OR "ozone pollution"[tiab] OR "NO2"[tiab] OR "SO2"[tiab] OR "carbon monoxide"[tiab] OR "volatile organic compounds"[tiab] OR "urban air"[tiab]) AND ("health effects"[tiab] OR "public health"[tiab] OR "respiratory health"[tiab] OR "respiratory disease"[tiab] OR "asthma"[tiab] OR "lung disease"[tiab] OR "cardiovascular disease"[tiab] OR "environmental health"[tiab] OR "mortality"[tiab] OR "exposure assessment"[tiab] OR "hospital admissions"[tiab]) AND ("United Kingdom"[tiab] OR "UK"[tiab] OR "England"[tiab] OR "Scotland"[tiab] OR "Wales"[tiab] OR "London"[tiab] OR "Manchester"[tiab]) AND ("1960/01/01"[PDAT] : "2025/12/31"[PDAT]) AND english[lang]
Found 965 recor

Downloading UK: 100%|██████████| 5/5 [00:08<00:00,  1.61s/it]

✅ PubMed data collection complete! 1113 records saved to 'pubmed_air_quality_health.csv'





The PubMed search returned 51 records for Kenya, 138 for South Africa, and 965 for the UK (1,154 hits in total; 1,113 records remained in the saved CSV after cleaning and de-duplication). We combined these with the articles retrieved earlier using an R script to obtain the final set of articles. These were stored in a local folder for filtering and further analysis.

**Analysis**

Our interest was in articles

In [None]:
##Analysis
