In [1]:
##########################################
### Import Libraries and Dependencies ####
##########################################

from Bio import Entrez
import pandas as pd
import requests
from pprint import pprint
import xlsxwriter
import time
import logging
from tqdm import tqdm
from tqdm.notebook import tqdm
import os
import pickle
import spacy
import ast
from geopy.geocoders import Nominatim
import re
import shutil
from collections import Counter
from config import ENTREZ_EMAIL, ENTREZ_API_KEY

In [2]:
# ## Check if Spacy is working
# # Try loading the model
# try:
#     nlp = spacy.load("en_core_web_sm")
#     print("‚úì spaCy is working!")
    
#     # Test on sample text
#     doc = nlp("Study conducted in Chicago, Illinois and New York.")
#     locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
#     print(f"Found locations: {locations}")
# except Exception as e:
#     print(f"Error: {e}")

In [3]:
##########################################################################
#### Functions for Sorting Data from Entrez PubMed API call ##############
##########################################################################

def fetch_pubmed_record(pubmed_id):
    handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml")
    record = Entrez.read(handle)
    handle.close()
    return record


def get_journal_volume_issue(record):
    try:      
        journal_volume_issue = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']
        volume = journal_volume_issue.get('Volume', '')
        issue = journal_volume_issue.get('Issue', '')
        return volume, issue
    except KeyError:
        return None, None

   

def get_article_title_page(record):
    try:      
        article_title_page = record['PubmedArticle'][0]['MedlineCitation']['Article']
        article_title= article_title_page.get('ArticleTitle', '')
        page_start = article_title_page.get('Pagination', {}).get('MedlinePgn', '').split('-')[0]
        page_end = article_title_page.get('Pagination', {}).get('MedlinePgn', '').split('-')[-1]
        return article_title, page_start, page_end
    except KeyError:
        return None, None, None

def get_journal_title(record):
    try:
        journal_title= record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']
        #print("Journal Title: ", journal_title_pmid)
        title = journal_title.get('Title', '')
        return title
    except KeyError:
        return None, None, 
    
def get_authors(record):
    try:
        authors = record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
        return ', '.join(author.get('LastName', '') + ' ' + author.get('Initials', '') for author in authors)
    except KeyError:
        return None
    
def get_publication_date_year(record):
    try:
        date = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        return date.get('Year', '') if 'Year' in date else None
    except KeyError:
        return None

def get_publication_date_month_year(record):
    try:
        date = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        year = date.get('Year', '')
        month = date.get('Month', '')
        return f"{month} {year}" if month and year else None
    except KeyError:
        return None

def get_publication_date_month_day_year(record):
    try:
        date = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        year = date.get('Year', '')
        month = date.get('Month', '')
        day = date.get('Day', '')

        # Mapping of month abbreviations to numbers
        month_mapping = {
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
            'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
        }

        # Replace the month abbreviation with the corresponding number
        month_number = month_mapping.get(month, month)

        return f"{month_number}/{day}/{year}" if month and day and year else None
    except KeyError:
        return None

def get_abstract(record):
    """Extract abstract text from PubMed record"""
    try:
        abstract_texts = record['PubmedArticle'][0]['MedlineCitation']['Article'].get('Abstract', {}).get('AbstractText', [])
        if isinstance(abstract_texts, list):
            # Handle structured abstracts
            abstract = ' '.join(str(text) for text in abstract_texts)
        else:
            abstract = str(abstract_texts)
        return abstract if abstract else None
    except (KeyError, IndexError):
        return None
        
def get_pmid_pmcid_doi(record):
    try:
        pmid = next(
            (id_ for id_ in record.get('PubmedArticle', [{}])[0].get('PubmedData', {}).get('ArticleIdList', []) if id_.attributes.get('IdType') == 'pubmed'),
            None
        )
        #print("PMID: ", pmid)

        pmcid = next(
            (id_ for id_ in record.get('PubmedArticle', [{}])[0].get('PubmedData', {}).get('ArticleIdList', []) if id_.attributes.get('IdType') == 'pmc'),
            None
        )
        #print("PMCID: ", pmcid)

        doi = next(
            (id_ for id_ in record.get('PubmedArticle', [{}])[0].get('PubmedData', {}).get('ArticleIdList', []) if id_.attributes.get('IdType') == 'doi'),
            None
        )
        #print("DOI: ", doi)

        return pmid, pmcid, doi
    except (IndexError, KeyError):
        return None, None, None



def get_authors_with_affiliation_name_full(record, affiliations_to_check):
    try:
        authors = record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
        authors_with_affiliation_name_full = []
              
        for author in authors:
            author_affiliations = author.get('AffiliationInfo', [])

            if not author_affiliations:
                # If there's no specific affiliation information, check the overall affiliation field
                affiliations = [author.get('Affiliation', '').lower()]
            else:
                # If there's specific affiliation information, extract and check each affiliation
                affiliations = [affiliation.get('Affiliation', '').lower() for affiliation in author_affiliations]

                

            # Check if any phrase in affiliations_to_check is a substring of affiliation
            if any(phrase.lower() in affiliation for phrase in affiliations_to_check for affiliation in affiliations):
                full_name = author.get('LastName', '') + ' ' + author.get('ForeName', '')
                authors_with_affiliation_name_full.append(full_name)
                              
        return authors_with_affiliation_name_full if authors_with_affiliation_name_full else None
    except KeyError:
        return None
    
def get_authors_with_affiliation_name_initial(record, affiliations_to_check):
    try:
        authors = record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
        authors_with_affiliation_name_initial = []
              
        for author in authors:
            author_affiliations = author.get('AffiliationInfo', [])

            if not author_affiliations:
                # If there's no specific affiliation information, check the overall affiliation field
                affiliations = [author.get('Affiliation', '').lower()]
            else:
                # If there's specific affiliation information, extract and check each affiliation
                affiliations = [affiliation.get('Affiliation', '').lower() for affiliation in author_affiliations]

                

            # Check if any phrase in affiliations_to_check is a substring of affiliation
            if any(phrase.lower() in affiliation for phrase in affiliations_to_check for affiliation in affiliations):
                initial_name = author.get('LastName', '') + ' ' + author.get('Initials', '')
                authors_with_affiliation_name_initial.append(initial_name)
                              
        return authors_with_affiliation_name_initial if authors_with_affiliation_name_initial else None
    except KeyError:
        return None

                                                                             
def get_authors_with_affiliation_affiliation(record, affiliations_to_check):
    try:
        authors = record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
        authors_with_affiliation_affiliation = []
        for author in authors:
            author_affiliations = author.get('AffiliationInfo', [])

            if not author_affiliations:
                affiliations = [author.get('Affiliation', '').lower()]
            else:
                affiliations = [affiliation.get('Affiliation', '').lower() for affiliation in author_affiliations]

            if any(phrase.lower() in affiliation for phrase in affiliations_to_check for affiliation in affiliations):
                authors_with_affiliation_affiliation.append(affiliations)

        return authors_with_affiliation_affiliation if authors_with_affiliation_affiliation else None
    except KeyError:
        return None
                                                                             
def get_authors_with_affiliation_formatted(record, authors_with_affiliation):
    #print(authors_with_affiliation)
    if authors_with_affiliation:
        return [f"{author.split()[0]} {author.split()[1][0]}" for author in authors_with_affiliation]
    else:
        return None

        
def get_all_affiliations(record):
    try:
        authors = record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
        all_affiliations = set()

        for author in authors:
            author_affiliations = author.get('AffiliationInfo', [])
            
            if not author_affiliations:
                all_affiliations.add(author.get('Affiliation', '').lower())
            else:
                all_affiliations.update([affiliation.get('Affiliation', '').lower() for affiliation in author_affiliations])

        return list(all_affiliations)
    except KeyError:
        return None

def get_document_type(record):
    try:
        document_types = record['PubmedArticle'][0]['MedlineCitation']['Article']['PublicationTypeList']
        return ', '.join(document_type for document_type in document_types)
    except KeyError:
        return None

   
def process_pubmed_record(record, affiliations_to_check):
    # Extract relevant information from the PubMed record
    pubmed_data = {
        "Authors": get_authors(record),
        "AuthorsWithAffiliationNameFull": get_authors_with_affiliation_name_full(record, affiliations_to_check),
        "AuthorsWithAffiliationNameInitial": get_authors_with_affiliation_name_initial(record, affiliations_to_check),
        "AuthorsWithAffiliationAffiliation": get_authors_with_affiliation_affiliation(record, affiliations_to_check),
        "AllAffiliations": get_all_affiliations(record),
        "Abstract": get_abstract(record),
        "date_year": get_publication_date_year(record),
        "date_monthY": get_publication_date_month_year(record),
        "date_mdY": get_publication_date_month_day_year(record),
        "PMID": get_pmid_pmcid_doi(record)[0],
        "PMCID": get_pmid_pmcid_doi(record)[1],
        "DOI":get_pmid_pmcid_doi(record)[2],
        "JournalTitle": get_journal_title(record),
        "ArticleTitle": get_article_title_page(record)[0],
        "PageStart": get_article_title_page(record)[1],
        "PageEnd": get_article_title_page(record)[2],
        "Volume": get_journal_volume_issue(record)[0],
        "Issue": get_journal_volume_issue(record)[1],
        "DocumentType": get_document_type(record)
    }

    return pubmed_data


In [15]:
# ========================================
# Clear out checkpoints if running notebook more than once
# ========================================


# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#INPUT_FOLDER = 'hiv imp'  
INPUT_FOLDER = 'hiv not imp'  # Change to 'hiv imp' for the other dataset
# ========================================

# Delete checkpoint directory in the specified folder
checkpoint_dir = os.path.join(INPUT_FOLDER, 'pubmed_checkpoints')

if os.path.exists(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
    print(f"‚úì Checkpoint directory deleted: {checkpoint_dir}")
    print("Starting fresh - all checkpoints removed")
    
    # Show what was in there before deletion
    print(f"\nDeleted from: {INPUT_FOLDER}")
else:
    print(f"Directory not found: {checkpoint_dir}")
    print(f"\nAvailable directories in {INPUT_FOLDER}:")
    if os.path.exists(INPUT_FOLDER):
        for item in os.listdir(INPUT_FOLDER):
            if os.path.isdir(os.path.join(INPUT_FOLDER, item)):
                print(f"  üìÅ {item}")
    else:
        print(f"  ERROR: Folder '{INPUT_FOLDER}' doesn't exist!")

  if os.path.exists('hiv not implementation\pubmed_checkpoints'):
  shutil.rmtree('hiv not implementation\pubmed_checkpoints')


In [6]:
# ========================================
# Query PubMed via Entrez API
# ========================================

# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#OUTPUT_FOLDER = 'hiv imp'  # Change to 'hiv not imp' for the other dataset
OUTPUT_FOLDER = 'hiv not imp'  # Change to 'hiv not imp' for the other dataset
# ========================================

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Set up logging (save to folder)
log_file = os.path.join(OUTPUT_FOLDER, 'pubmed_errors.log')
logging.basicConfig(
    filename=log_file, 
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

Entrez.email = ENTREZ_EMAIL
Entrez.api_key = ENTREZ_API_KEY

# Checkpoint directory (inside output folder)
CHECKPOINT_DIR = os.path.join(OUTPUT_FOLDER, "pubmed_checkpoints")
CHECKPOINT_INTERVAL = 50
BATCH_SIZE = 200

os.makedirs(CHECKPOINT_DIR, exist_ok=True)

def save_checkpoint(batch_index, pubmed_data, failed_batches, total_count):
    checkpoint = {
        'batch_index': batch_index,
        'pubmed_data': pubmed_data,
        'failed_batches': failed_batches,
        'total_count': total_count,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    checkpoint_file = os.path.join(CHECKPOINT_DIR, f'hiv_imp_checkpoint_{batch_index}.pkl')
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(checkpoint, f)
    
    if pubmed_data:
        temp_df = pd.DataFrame(pubmed_data)
        csv_file = os.path.join(CHECKPOINT_DIR, f'data_hiv_imp_{batch_index}.csv')
        temp_df.to_csv(csv_file, index=False)
    
    logging.info(f"Checkpoint saved at record {batch_index}: {len(pubmed_data)} records")

def load_latest_checkpoint():
    if not os.path.exists(CHECKPOINT_DIR):
        return None
    
    checkpoint_files = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith('checkpoint_') and f.endswith('.pkl')]
    if not checkpoint_files:
        return None
    
    latest_file = max(checkpoint_files, key=lambda x: int(x.split('_')[1].split('.')[0]))
    checkpoint_path = os.path.join(CHECKPOINT_DIR, latest_file)
    
    with open(checkpoint_path, 'rb') as f:
        checkpoint = pickle.load(f)
    
    logging.info(f"Loaded checkpoint from record {checkpoint['batch_index']}: {len(checkpoint['pubmed_data'])} records")
    return checkpoint

def fetch_records_from_history(webenv, query_key, retstart, retmax, max_retries=3):
    """Fetch records using the history server"""
    retries = 0
    while retries < max_retries:
        try:
            handle = Entrez.efetch(
                db="pubmed",
                rettype="xml",
                retmode="xml",
                retstart=retstart,
                retmax=retmax,
                webenv=webenv,
                query_key=query_key
            )
            records = Entrez.read(handle)
            handle.close()
            return records
        except Exception as e:
            retries += 1
            wait_time = 2 ** retries
            logging.error(f"Error fetching records at position {retstart} (attempt {retries}/{max_retries}): {e}")
            time.sleep(wait_time)
            if retries == max_retries:
                logging.error(f"Failed after {max_retries} retries")
                return None

# ========================================
# BASE QUERY with {DATE_FILTER} placeholder
# ========================================
# HIV NOT implementation
#base_query = """(HIV[ti] OR HIV1[ti] OR HIVI[ti] OR HIVII[ti] OR HIV2[ti] OR human-immuno-deficiency-virus*[ti] OR human-immunodeficiency-virus*[ti] OR acquired-immune-deficiency-syndrome*[ti] OR acquired-immunodeficiency-syndrome*[ti] OR acquired-immuno-deficiency-syndrome*[ti] OR acute-retroviral-syndrome[ti] OR acute-seroconversion-syndrome[ti] OR Highly-Active-Antiretroviral-Therapy[ti] OR HAART[ti] OR PLWH[ti] OR WLWH[ti] OR MLWH[ti] OR ALWH[ti] OR PLHIV[ti] OR PLWHIV[ti] OR aids-related[ti] OR aids-infected[ti] OR aids-affected[ti] OR living-with-aids[ti] OR patients-with-aids[ti] OR people-with-aids[ti] OR women-with-aids[ti] OR men-with-aids[ti] OR aids-patients[ti] OR "AIDS"[Journal] OR "AIDS Behav"[Journal] OR "AIDS Care"[Journal] OR "AIDS Educ Prev"[Journal] OR "AIDS Patient Care STDS"[Journal] OR "AIDS Res Hum Retroviruses"[Journal] OR "AIDS Res Ther"[Journal] OR "AIDS Rev"[Journal] OR "Curr HIV/AIDS Rep"[Journal] OR "Curr HIV Res"[Journal] OR "Curr Opin HIV AIDS"[Journal] OR "HIV Med"[Journal] OR "HIV Res Clin Pract"[Journal] OR "Int J STD AIDS"[Journal] OR "J Acquir Immune Defic Syndr"[Journal] OR "J Assoc Nurses AIDS Care"[Journal] OR "J Int AIDS Soc"[Journal] OR "Lancet HIV"[Journal] OR "J Int Assoc Provid AIDS Care"[Journal] OR "SAHARA J"[Journal] OR (Aids[ti] NOT (decision-aid*[ti] OR decision-support-aid*[ti] OR decisionmaking-aid*[ti] OR decision-making-aid*[ti] OR hearing-aid*[ti] OR auditory-aid*[ti] OR audiovisual-aid*[ti] OR visual-aid*[ti] OR sensory-aid*[ti] OR communication-aid*[ti] OR cognitive-aid*[ti] OR mobility-aid*[ti] OR walking-aid*[ti] OR sleep-aid*[ti] OR vision-aid*[ti] OR cessation-aid*[ti] OR conversation-aid*[ti] OR memory-aid*[ti] OR diagnostic-aid*[ti] OR pharmacologic-aid*[ti] OR pharmaceutical-aid*[ti] OR medication-aid*[ti] OR job-aid*[ti] OR visualization-aid*[ti] OR pictorial-aid*[ti] OR training-aid*[ti] OR rehabilitation-aid*[ti]))) AND (english[lang] AND {DATE_FILTER} NOT ((africa[Mesh] OR "caribbean region"[Mesh] OR "central america"[Mesh] OR "latin america"[Mesh:NoExp] OR canada[Mesh] OR greenland[Mesh:NoExp] OR mexico[Mesh:NoExp] OR "south america"[Mesh] OR "antarctic regions"[Mesh:NoExp] OR "arctic regions"[Mesh:NoExp] OR asia[Mesh] OR europe[Mesh] OR oceania[Mesh] OR "afghanistan"[ti] OR "africa"[ti] OR "albania"[ti] OR "algeria"[ti] OR "american samoa"[ti] OR "andorra"[ti] OR "angola"[ti] OR "anguilla"[ti] OR "antarctica"[ti] OR "antigua and barbuda"[ti] OR "argentina"[ti] OR "armenia"[ti] OR "aruba"[ti] OR "asia"[ti] OR "australia"[ti] OR "austria"[ti] OR "azerbaijan"[ti] OR "bahamas"[ti] OR "bahrain"[ti] OR "balkan states"[ti] OR "baltic states"[ti] OR "bangladesh"[ti] OR "barbados"[ti] OR "belarus"[ti] OR "belgium"[ti] OR "belize"[ti] OR "benin"[ti] OR "bermuda"[ti] OR "bhutan"[ti] OR "bolivia"[ti] OR "bosnia-herzegovina"[ti] OR "botswana"[ti] OR "bouvet"[ti] OR "brazil"[ti] OR "brunei"[ti] OR "bulgaria"[ti] OR "burkina faso"[ti] OR "burundi"[ti] OR "cambodia"[ti] OR "cameroon"[ti] OR "canada"[ti] OR "cape verde"[ti] OR "central african republic"[ti] OR "central america"[ti] OR "chad"[ti] OR "chile"[ti] OR "china"[ti] OR "colombia"[ti] OR "commonwealth of independent states"[ti] OR "comoros"[ti] OR "cook"[ti] OR "coral sea"[ti] OR "costa rica"[ti] OR "croatia"[ti] OR "cuba"[ti] OR "curacao"[ti] OR "cyprus"[ti] OR "czech republic"[ti] OR "czechoslovakia"[ti] OR "denmark"[ti] OR "djibouti"[ti] OR "dominica"[ti] OR "dominican republic"[ti] OR "east timor"[ti] OR "eastern europe"[ti] OR "ecuador"[ti] OR "egypt"[ti] OR "el salvador"[ti] OR "england"[ti] OR "equatorial guinea"[ti] OR "eritrea"[ti] OR "estonia"[ti] OR "ethiopia"[ti] OR "europa"[ti] OR "europe"[ti] OR "falkland"[ti] OR "faroe"[ti] OR "fiji"[ti] OR "finland"[ti] OR "france"[ti] OR "french guiana"[ti] OR "french polynesia"[ti] OR "gabon"[ti] OR "gambia"[ti] OR "gaza strip"[ti] OR "georgia"[ti] OR "germany"[ti] OR "ghana"[ti] OR "gibraltar"[ti] OR "great britain"[ti] OR "greece"[ti] OR "greenland"[ti] OR "grenada"[ti] OR "guadeloupe"[ti] OR "guam"[ti] OR "guatemala"[ti] OR "guernsey"[ti] OR "guinea"[ti] OR "guyana"[ti] OR "haiti"[ti] OR "honduras"[ti] OR "hong kong"[ti] OR "hungary"[ti] OR "iceland"[ti] OR "india"[ti] OR "indonesia"[ti] OR "iran"[ti] OR "iraq"[ti] OR "ireland"[ti] OR "israel"[ti] OR "italy"[ti] OR "ivory coast"[ti] OR "jamaica"[ti] OR "jan mayen"[ti] OR "japan"[ti] OR "jersey"[ti] OR "johnston atoll"[ti] OR "jordan"[ti] OR "juan de nova"[ti] OR "kazakhstan"[ti] OR "kenya"[ti] OR "kiribati"[ti] OR "korea"[ti] OR "kosovo"[ti] OR "kuwait"[ti] OR "kyrgyzstan"[ti] OR "laos"[ti] OR "latin america"[ti] OR "latvia"[ti] OR "lebanon"[ti] OR "lesotho"[ti] OR "liberia"[ti] OR "libya"[ti] OR "liechtenstein"[ti] OR "lithuania"[ti] OR "luxembourg"[ti] OR "macao"[ti] OR "macedonia"[ti] OR "madagascar"[ti] OR "malawi"[ti] OR "malaysia"[ti] OR "maldives"[ti] OR "mali"[ti] OR "malta"[ti] OR "marshall"[ti] OR "martinique"[ti] OR "mauritania"[ti] OR "mauritius"[ti] OR "mexico"[ti] OR "micronesia"[ti] OR "middle east"[ti] OR "midway"[ti] OR "moldova"[ti] OR "monaco"[ti] OR "mongolia"[ti] OR "montserrat"[ti] OR "morocco"[ti] OR "mozambique"[ti] OR "myanmar"[ti] OR "namibia"[ti] OR "nauru"[ti] OR "navassa"[ti] OR "nepal"[ti] OR "netherlands"[ti] OR "netherlands antilles"[ti] OR "new caledonia"[ti] OR "new zealand"[ti] OR "nicaragua"[ti] OR "niger"[ti] OR "nigeria"[ti] OR "niue"[ti] OR "norfolk"[ti] OR "north macedonia"[ti] OR "northern ireland"[ti] OR "northern mariana"[ti] OR "norway"[ti] OR "oman"[ti] OR "pakistan"[ti] OR "palau"[ti] OR "palestine"[ti] OR "panama"[ti] OR "papua new guinea"[ti] OR "paraguay"[ti] OR "peru"[ti] OR "philippines"[ti] OR "pitcairn"[ti] OR "poland"[ti] OR "portugal"[ti] OR "puerto rico"[ti] OR "qatar"[ti] OR "reunion"[ti] OR "romania"[ti] OR "russia"[ti] OR "rwanda"[ti] OR "saint helena"[ti] OR "saint kitts and nevis"[ti] OR "saint lucia"[ti] OR "saint pierre and miquelon"[ti] OR "saint vincent and the grenadines"[ti] OR "samoa"[ti] OR "san marino"[ti] OR "sao tome and principe"[ti] OR "saudi arabia"[ti] OR "senegal"[ti] OR "serbia"[ti] OR "seychelles"[ti] OR "sierra leone"[ti] OR "singapore"[ti] OR "slovakia"[ti] OR "slovenia"[ti] OR "solomon"[ti] OR "somalia"[ti] OR "south africa"[ti] OR "south america"[ti] OR "south korea"[ti] OR "spain"[ti] OR "sri lanka"[ti] OR "sudan"[ti] OR "suriname"[ti] OR "svalbard"[ti] OR "swaziland"[ti] OR "sweden"[ti] OR "switzerland"[ti] OR "syria"[ti] OR "taiwan"[ti] OR "tajikistan"[ti] OR "tanzania"[ti] OR "thailand"[ti] OR "togo"[ti] OR "tokelau"[ti] OR "tonga"[ti] OR "Trinidad"[ti] OR "tobago"[ti] OR "tunisia"[ti] OR "turkey"[ti] OR "turkmenistan"[ti] OR "tuvalu"[ti] OR "uganda"[ti] OR "ukraine"[ti] OR "united arab emirates"[ti] OR "united kingdom"[ti] OR "united states"[ti] OR "uruguay"[ti] OR "uzbekistan"[ti] OR "vanuatu"[ti] OR "vatican city"[ti] OR "venezuela"[ti] OR "vietnam"[ti] OR "virgin"[ti] OR "wallis and futuna"[ti] OR "west bank"[ti] OR "western sahara"[ti] OR "yemen"[ti] OR "yugoslavia"[ti] OR "zaire"[ti] OR "zambia"[ti] OR "zimbabwe"[ti]) NOT ("north america"[Mesh:NoExp] OR "united states"[Mesh] OR "north American people"[Mesh:NoExp] OR "American indian or alaska native"[Mesh] OR "population groups, us"[Mesh:NoExp] OR north-america*[tiab] OR united-states[tiab] OR usa[tiab] OR us[tiab] OR "u s a"[tiab] OR "u s"[tiab] OR alabama[tiab] OR alaska[tiab] OR arizona[tiab] OR arkansas[tiab] OR california[tiab] OR colorado[tiab] OR connecticut[tiab] OR delaware[tiab] OR district-of-columbia[tiab] OR florida[tiab] OR georgia[tiab] OR hawaii[tiab] OR idaho[tiab] OR illinois[tiab] OR indiana[tiab] OR iowa[tiab] OR kansas[tiab] OR kentucky[tiab] OR louisiana[tiab] OR maine[tiab] OR maryland[tiab] OR massachusetts[tiab] OR michigan[tiab] OR minnesota[tiab] OR mississippi[tiab] OR missouri[tiab] OR montana[tiab] OR nebraska[tiab] OR nevada[tiab] OR new-hampshire[tiab] OR new-jersey[tiab] OR new-mexico[tiab] OR new-york[tiab] OR north-carolina[tiab] OR north-dakota[tiab] OR ohio[tiab] OR oklahoma[tiab] OR oregon[tiab] OR pennsylvania[tiab] OR rhode-island[tiab] OR south-carolina[tiab] OR south-dakota[tiab] OR tennessee[tiab] OR texas[tiab] OR utah[tiab] OR vermont[tiab] OR virginia[tiab] OR washington[tiab] OR west-virginia[tiab] OR wisconsin[tiab] OR wyoming[tiab]))) NOT ("Implementation Science"[majr] OR "Diffusion of Innovation"[majr] OR "Translational Research, Biomedical"[majr] OR implement*[ti] OR delivery-science[ti] OR dissemination-science[ti] OR dissemination-research[ti] OR knowledge-translation[ti] OR translational-research[ti] OR innovation[ti] OR real-world[ti] OR conceptual-determinant*[ti] OR contextual[ti] OR facilitator*[ti] OR barriers[ti] OR enabler*[ti] OR program-evaluation[ti] OR process-evaluation[ti] OR "Implement Sci"[Journal] OR "JBI Evid Implement"[Journal] OR "Transl Res"[Journal] OR ((intervention*[ti] OR initiative*[ti] OR program[ti] OR programs[ti]) AND (acceptability[ti] OR actual-fit[ti] OR adopt*[ti] OR Appropriateness[ti] OR compatibility[ti] OR continuation[ti] OR disseminat*[ti] OR durability[ti] OR feasib*[ti] OR fidelity[ti] OR incorporat*[ti] OR institutionalization[ti] OR integration[ti] OR integrity[ti] OR intention-to-try[ti] OR maintenance[ti] OR optimiz*[ti] OR penetrat*[ti] OR perceived-fit[ti] OR practicability[ti] OR practicable[ti] OR reach[ti] OR relevance[ti] OR retention[ti] OR routiniz*[ti] OR routine-use[ti] OR Suitab*[ti] OR Sustainab*[ti] OR sustained-use[ti] OR uptake[ti] OR usefulness[ti] OR utility[ti] OR utilization[ti]))) """

#HIV AND implementation (i.e. HIV Implementation)
base_query = """(HIV[ti] OR HIV1[ti] OR HIVI[ti] OR HIVII[ti] OR HIV2[ti] OR human-immuno-deficiency-virus*[ti] OR human-immunodeficiency-virus*[ti] OR acquired-immune-deficiency-syndrome*[ti] OR acquired-immunodeficiency-syndrome*[ti] OR acquired-immuno-deficiency-syndrome*[ti] OR acute-retroviral-syndrome[ti] OR acute-seroconversion-syndrome[ti] OR Highly-Active-Antiretroviral-Therapy[ti] OR HAART[ti] OR PLWH[ti] OR WLWH[ti] OR MLWH[ti] OR ALWH[ti] OR PLHIV[ti] OR PLWHIV[ti] OR aids-related[ti] OR aids-infected[ti] OR aids-affected[ti] OR living-with-aids[ti] OR patients-with-aids[ti] OR people-with-aids[ti] OR women-with-aids[ti] OR men-with-aids[ti] OR aids-patients[ti] OR "AIDS"[Journal] OR "AIDS Behav"[Journal] OR "AIDS Care"[Journal] OR "AIDS Educ Prev"[Journal] OR "AIDS Patient Care STDS"[Journal] OR "AIDS Res Hum Retroviruses"[Journal] OR "AIDS Res Ther"[Journal] OR "AIDS Rev"[Journal] OR "Curr HIV/AIDS Rep"[Journal] OR "Curr HIV Res"[Journal] OR "Curr Opin HIV AIDS"[Journal] OR "HIV Med"[Journal] OR "HIV Res Clin Pract"[Journal] OR "Int J STD AIDS"[Journal] OR "J Acquir Immune Defic Syndr"[Journal] OR "J Assoc Nurses AIDS Care"[Journal] OR "J Int AIDS Soc"[Journal] OR "Lancet HIV"[Journal] OR "J Int Assoc Provid AIDS Care"[Journal] OR "SAHARA J"[Journal] OR (Aids[ti] NOT (decision-aid*[ti] OR decision-support-aid*[ti] OR decisionmaking-aid*[ti] OR decision-making-aid*[ti] OR hearing-aid*[ti] OR auditory-aid*[ti] OR audiovisual-aid*[ti] OR visual-aid*[ti] OR sensory-aid*[ti] OR communication-aid*[ti] OR cognitive-aid*[ti] OR mobility-aid*[ti] OR walking-aid*[ti] OR sleep-aid*[ti] OR vision-aid*[ti] OR cessation-aid*[ti] OR conversation-aid*[ti] OR memory-aid*[ti] OR diagnostic-aid*[ti] OR pharmacologic-aid*[ti] OR pharmaceutical-aid*[ti] OR medication-aid*[ti] OR job-aid*[ti] OR visualization-aid*[ti] OR pictorial-aid*[ti] OR training-aid*[ti] OR rehabilitation-aid*[ti]))) AND (english[lang] AND {DATE_FILTER} NOT ((africa[Mesh] OR "caribbean region"[Mesh] OR "central america"[Mesh] OR "latin america"[Mesh:NoExp] OR canada[Mesh] OR greenland[Mesh:NoExp] OR mexico[Mesh:NoExp] OR "south america"[Mesh] OR "antarctic regions"[Mesh:NoExp] OR "arctic regions"[Mesh:NoExp] OR asia[Mesh] OR europe[Mesh] OR oceania[Mesh] OR "afghanistan"[ti] OR "africa"[ti] OR "albania"[ti] OR "algeria"[ti] OR "american samoa"[ti] OR "andorra"[ti] OR "angola"[ti] OR "anguilla"[ti] OR "antarctica"[ti] OR "antigua and barbuda"[ti] OR "argentina"[ti] OR "armenia"[ti] OR "aruba"[ti] OR "asia"[ti] OR "australia"[ti] OR "austria"[ti] OR "azerbaijan"[ti] OR "bahamas"[ti] OR "bahrain"[ti] OR "balkan states"[ti] OR "baltic states"[ti] OR "bangladesh"[ti] OR "barbados"[ti] OR "belarus"[ti] OR "belgium"[ti] OR "belize"[ti] OR "benin"[ti] OR "bermuda"[ti] OR "bhutan"[ti] OR "bolivia"[ti] OR "bosnia-herzegovina"[ti] OR "botswana"[ti] OR "bouvet"[ti] OR "brazil"[ti] OR "brunei"[ti] OR "bulgaria"[ti] OR "burkina faso"[ti] OR "burundi"[ti] OR "cambodia"[ti] OR "cameroon"[ti] OR "canada"[ti] OR "cape verde"[ti] OR "central african republic"[ti] OR "central america"[ti] OR "chad"[ti] OR "chile"[ti] OR "china"[ti] OR "colombia"[ti] OR "commonwealth of independent states"[ti] OR "comoros"[ti] OR "cook"[ti] OR "coral sea"[ti] OR "costa rica"[ti] OR "croatia"[ti] OR "cuba"[ti] OR "curacao"[ti] OR "cyprus"[ti] OR "czech republic"[ti] OR "czechoslovakia"[ti] OR "denmark"[ti] OR "djibouti"[ti] OR "dominica"[ti] OR "dominican republic"[ti] OR "east timor"[ti] OR "eastern europe"[ti] OR "ecuador"[ti] OR "egypt"[ti] OR "el salvador"[ti] OR "england"[ti] OR "equatorial guinea"[ti] OR "eritrea"[ti] OR "estonia"[ti] OR "ethiopia"[ti] OR "europa"[ti] OR "europe"[ti] OR "falkland"[ti] OR "faroe"[ti] OR "fiji"[ti] OR "finland"[ti] OR "france"[ti] OR "french guiana"[ti] OR "french polynesia"[ti] OR "gabon"[ti] OR "gambia"[ti] OR "gaza strip"[ti] OR "georgia"[ti] OR "germany"[ti] OR "ghana"[ti] OR "gibraltar"[ti] OR "great britain"[ti] OR "greece"[ti] OR "greenland"[ti] OR "grenada"[ti] OR "guadeloupe"[ti] OR "guam"[ti] OR "guatemala"[ti] OR "guernsey"[ti] OR "guinea"[ti] OR "guyana"[ti] OR "haiti"[ti] OR "honduras"[ti] OR "hong kong"[ti] OR "hungary"[ti] OR "iceland"[ti] OR "india"[ti] OR "indonesia"[ti] OR "iran"[ti] OR "iraq"[ti] OR "ireland"[ti] OR "israel"[ti] OR "italy"[ti] OR "ivory coast"[ti] OR "jamaica"[ti] OR "jan mayen"[ti] OR "japan"[ti] OR "jersey"[ti] OR "johnston atoll"[ti] OR "jordan"[ti] OR "juan de nova"[ti] OR "kazakhstan"[ti] OR "kenya"[ti] OR "kiribati"[ti] OR "korea"[ti] OR "kosovo"[ti] OR "kuwait"[ti] OR "kyrgyzstan"[ti] OR "laos"[ti] OR "latin america"[ti] OR "latvia"[ti] OR "lebanon"[ti] OR "lesotho"[ti] OR "liberia"[ti] OR "libya"[ti] OR "liechtenstein"[ti] OR "lithuania"[ti] OR "luxembourg"[ti] OR "macao"[ti] OR "macedonia"[ti] OR "madagascar"[ti] OR "malawi"[ti] OR "malaysia"[ti] OR "maldives"[ti] OR "mali"[ti] OR "malta"[ti] OR "marshall"[ti] OR "martinique"[ti] OR "mauritania"[ti] OR "mauritius"[ti] OR "mexico"[ti] OR "micronesia"[ti] OR "middle east"[ti] OR "midway"[ti] OR "moldova"[ti] OR "monaco"[ti] OR "mongolia"[ti] OR "montserrat"[ti] OR "morocco"[ti] OR "mozambique"[ti] OR "myanmar"[ti] OR "namibia"[ti] OR "nauru"[ti] OR "navassa"[ti] OR "nepal"[ti] OR "netherlands"[ti] OR "netherlands antilles"[ti] OR "new caledonia"[ti] OR "new zealand"[ti] OR "nicaragua"[ti] OR "niger"[ti] OR "nigeria"[ti] OR "niue"[ti] OR "norfolk"[ti] OR "north macedonia"[ti] OR "northern ireland"[ti] OR "northern mariana"[ti] OR "norway"[ti] OR "oman"[ti] OR "pakistan"[ti] OR "palau"[ti] OR "palestine"[ti] OR "panama"[ti] OR "papua new guinea"[ti] OR "paraguay"[ti] OR "peru"[ti] OR "philippines"[ti] OR "pitcairn"[ti] OR "poland"[ti] OR "portugal"[ti] OR "puerto rico"[ti] OR "qatar"[ti] OR "reunion"[ti] OR "romania"[ti] OR "russia"[ti] OR "rwanda"[ti] OR "saint helena"[ti] OR "saint kitts and nevis"[ti] OR "saint lucia"[ti] OR "saint pierre and miquelon"[ti] OR "saint vincent and the grenadines"[ti] OR "samoa"[ti] OR "san marino"[ti] OR "sao tome and principe"[ti] OR "saudi arabia"[ti] OR "senegal"[ti] OR "serbia"[ti] OR "seychelles"[ti] OR "sierra leone"[ti] OR "singapore"[ti] OR "slovakia"[ti] OR "slovenia"[ti] OR "solomon"[ti] OR "somalia"[ti] OR "south africa"[ti] OR "south america"[ti] OR "south korea"[ti] OR "spain"[ti] OR "sri lanka"[ti] OR "sudan"[ti] OR "suriname"[ti] OR "svalbard"[ti] OR "swaziland"[ti] OR "sweden"[ti] OR "switzerland"[ti] OR "syria"[ti] OR "taiwan"[ti] OR "tajikistan"[ti] OR "tanzania"[ti] OR "thailand"[ti] OR "togo"[ti] OR "tokelau"[ti] OR "tonga"[ti] OR "Trinidad"[ti] OR "tobago"[ti] OR "tunisia"[ti] OR "turkey"[ti] OR "turkmenistan"[ti] OR "tuvalu"[ti] OR "uganda"[ti] OR "ukraine"[ti] OR "united arab emirates"[ti] OR "united kingdom"[ti] OR "united states"[ti] OR "uruguay"[ti] OR "uzbekistan"[ti] OR "vanuatu"[ti] OR "vatican city"[ti] OR "venezuela"[ti] OR "vietnam"[ti] OR "virgin"[ti] OR "wallis and futuna"[ti] OR "west bank"[ti] OR "western sahara"[ti] OR "yemen"[ti] OR "yugoslavia"[ti] OR "zaire"[ti] OR "zambia"[ti] OR "zimbabwe"[ti]) NOT ("north america"[Mesh:NoExp] OR "united states"[Mesh] OR "north American people"[Mesh:NoExp] OR "American indian or alaska native"[Mesh] OR "population groups, us"[Mesh:NoExp] OR north-america*[tiab] OR united-states[tiab] OR usa[tiab] OR us[tiab] OR "u s a"[tiab] OR "u s"[tiab] OR alabama[tiab] OR alaska[tiab] OR arizona[tiab] OR arkansas[tiab] OR california[tiab] OR colorado[tiab] OR connecticut[tiab] OR delaware[tiab] OR district-of-columbia[tiab] OR florida[tiab] OR georgia[tiab] OR hawaii[tiab] OR idaho[tiab] OR illinois[tiab] OR indiana[tiab] OR iowa[tiab] OR kansas[tiab] OR kentucky[tiab] OR louisiana[tiab] OR maine[tiab] OR maryland[tiab] OR massachusetts[tiab] OR michigan[tiab] OR minnesota[tiab] OR mississippi[tiab] OR missouri[tiab] OR montana[tiab] OR nebraska[tiab] OR nevada[tiab] OR new-hampshire[tiab] OR new-jersey[tiab] OR new-mexico[tiab] OR new-york[tiab] OR north-carolina[tiab] OR north-dakota[tiab] OR ohio[tiab] OR oklahoma[tiab] OR oregon[tiab] OR pennsylvania[tiab] OR rhode-island[tiab] OR south-carolina[tiab] OR south-dakota[tiab] OR tennessee[tiab] OR texas[tiab] OR utah[tiab] OR vermont[tiab] OR virginia[tiab] OR washington[tiab] OR west-virginia[tiab] OR wisconsin[tiab] OR wyoming[tiab]))) AND ("Implementation Science"[majr] OR "Diffusion of Innovation"[majr] OR "Translational Research, Biomedical"[majr] OR implement*[ti] OR delivery-science[ti] OR dissemination-science[ti] OR dissemination-research[ti] OR knowledge-translation[ti] OR translational-research[ti] OR innovation[ti] OR real-world[ti] OR conceptual-determinant*[ti] OR contextual[ti] OR facilitator*[ti] OR barriers[ti] OR enabler*[ti] OR program-evaluation[ti] OR process-evaluation[ti] OR "Implement Sci"[Journal] OR "JBI Evid Implement"[Journal] OR "Transl Res"[Journal] OR ((intervention*[ti] OR initiative*[ti] OR program[ti] OR programs[ti]) AND (acceptability[ti] OR actual-fit[ti] OR adopt*[ti] OR Appropriateness[ti] OR compatibility[ti] OR continuation[ti] OR disseminat*[ti] OR durability[ti] OR feasib*[ti] OR fidelity[ti] OR incorporat*[ti] OR institutionalization[ti] OR integration[ti] OR integrity[ti] OR intention-to-try[ti] OR maintenance[ti] OR optimiz*[ti] OR penetrat*[ti] OR perceived-fit[ti] OR practicability[ti] OR practicable[ti] OR reach[ti] OR relevance[ti] OR retention[ti] OR routiniz*[ti] OR routine-use[ti] OR Suitab*[ti] OR Sustainab*[ti] OR sustained-use[ti] OR uptake[ti] OR usefulness[ti] OR utility[ti] OR utilization[ti]))) """

# DATE RANGES
date_ranges = [
    ("2000/01/01", "2000/06/30"), ("2000/07/01", "2000/12/31"),
    ("2001/01/01", "2001/06/30"), ("2001/07/01", "2001/12/31"),
    ("2002/01/01", "2002/06/30"), ("2002/07/01", "2002/12/31"),
    ("2003/01/01", "2003/06/30"), ("2003/07/01", "2003/12/31"),
    ("2004/01/01", "2004/06/30"), ("2004/07/01", "2004/12/31"),
    ("2005/01/01", "2005/06/30"), ("2005/07/01", "2005/12/31"),
    ("2006/01/01", "2006/06/30"), ("2006/07/01", "2006/12/31"),
    ("2007/01/01", "2007/06/30"), ("2007/07/01", "2007/12/31"),
    ("2008/01/01", "2008/06/30"), ("2008/07/01", "2008/12/31"),
    ("2009/01/01", "2009/06/30"), ("2009/07/01", "2009/12/31"),
    ("2010/01/01", "2010/06/30"), ("2010/07/01", "2010/12/31"),
    ("2011/01/01", "2011/06/30"), ("2011/07/01", "2011/12/31"),
    ("2012/01/01", "2012/06/30"), ("2012/07/01", "2012/12/31"),
    ("2013/01/01", "2013/06/30"), ("2013/07/01", "2013/12/31"),
    ("2014/01/01", "2014/06/30"), ("2014/07/01", "2014/12/31"),
    ("2015/01/01", "2015/06/30"), ("2015/07/01", "2015/12/31"),
    ("2016/01/01", "2016/06/30"), ("2016/07/01", "2016/12/31"),
    ("2017/01/01", "2017/06/30"), ("2017/07/01", "2017/12/31"),
    ("2018/01/01", "2018/06/30"), ("2018/07/01", "2018/12/31"),
    ("2019/01/01", "2019/06/30"), ("2019/07/01", "2019/12/31"),
    ("2020/01/01", "2020/06/30"), ("2020/07/01", "2020/12/31"),
    ("2021/01/01", "2021/06/30"), ("2021/07/01", "2021/12/31"),
    ("2022/01/01", "2022/06/30"), ("2022/07/01", "2022/12/31"),
    ("2023/01/01", "2023/06/30"), ("2023/07/01", "2023/12/31"),
    ("2024/01/01", "2024/06/30"), ("2024/07/01", "2024/12/31"),
    ("2025/01/01", "2025/12/03"),
]

affiliations_to_check = ["Northwestern University", "Feinberg School of Medicine"]

# Store all chunk data
all_chunk_files = []

# ========================================
#  MAIN LOOP
# ========================================

for chunk_num, (start_date, end_date) in enumerate(date_ranges, 1):
    print(f"\n{'='*70}")
    print(f"CHUNK {chunk_num}/{len(date_ranges)}: {start_date} to {end_date}")
    print(f"{'='*70}\n")
    
    # Check if chunk file already exists (save to OUTPUT_FOLDER)
    chunk_filename = os.path.join(OUTPUT_FOLDER, f'hiv_imp_chunk_{chunk_num:02d}_{start_date.replace("/", "-")}_{end_date.replace("/", "-")}.csv')
    
    if os.path.exists(chunk_filename):
        print(f"‚úì Chunk {chunk_num} already exists, skipping...")
        all_chunk_files.append(chunk_filename)
        continue
    
    # Create query for this date range
    date_filter = f"{start_date}:{end_date}[pdat]"
    search_query = base_query.replace("{DATE_FILTER}", date_filter)
    
    # Post search to history server
    print("Posting search to NCBI history server...")
    try:
        search_handle = Entrez.esearch(
            db="pubmed",
            term=search_query,
            usehistory="y",
            retmax=0
        )
        search_results = Entrez.read(search_handle)
        search_handle.close()
        
        count = int(search_results["Count"])
        webenv = search_results["WebEnv"]
        query_key = search_results["QueryKey"]
        
        print(f"Total results: {count:,}")
        print(f"WebEnv: {webenv[:20]}...")
        print(f"QueryKey: {query_key}")
        logging.info(f"Search posted to history server. Count: {count}, WebEnv: {webenv}, QueryKey: {query_key}")
        
    except Exception as e:
        logging.error(f"Failed to post search: {e}")
        print(f"Error: {e}")
        count = 0
    
    # Check for existing checkpoint
    checkpoint = load_latest_checkpoint()
    
    if checkpoint and checkpoint['total_count'] == count:
        pubmed_data = checkpoint['pubmed_data']
        failed_batches = checkpoint['failed_batches']
        start_index = checkpoint['batch_index']
        print(f"\nResuming from checkpoint: {len(pubmed_data):,} records already processed")
        print(f"Starting from record {start_index:,}")
    else:
        pubmed_data = []
        failed_batches = []
        start_index = 0
    
    if count > 0:
        print(f"\nProcessing {count:,} records in batches of {BATCH_SIZE}...")
        
        # Process records using history server
        for start in tqdm(range(start_index, count, BATCH_SIZE), desc="Processing records"):
            try:
                batch_records = fetch_records_from_history(webenv, query_key, start, BATCH_SIZE)
                
                if batch_records and 'PubmedArticle' in batch_records:
                    for article in batch_records['PubmedArticle']:
                        try:
                            # Process each article
                            processed = process_pubmed_record({'PubmedArticle': [article]}, affiliations_to_check)
                            pubmed_data.append(processed)
                        except Exception as e:
                            try:
                                pmid = article['MedlineCitation']['PMID']
                            except:
                                pmid = 'Unknown'
                            logging.error(f"Error processing PMID {pmid}: {e}")
                else:
                    failed_batches.append((start, min(start + BATCH_SIZE, count)))
                    logging.warning(f"Batch at position {start} failed")
                
                # Save checkpoint every 50 batches
                batch_number = (start // BATCH_SIZE) + 1
                if batch_number % CHECKPOINT_INTERVAL == 0:
                    save_checkpoint(start + BATCH_SIZE, pubmed_data, failed_batches, count)
                    print(f"\nCheckpoint saved at batch {batch_number} ({len(pubmed_data):,} records)")
                
                # Rate limiting
                time.sleep(0.1 if Entrez.api_key else 0.34)
                
            except Exception as e:
                logging.error(f"Error at position {start}: {e}")
                failed_batches.append((start, min(start + BATCH_SIZE, count)))
                time.sleep(2)
        
        # Save final checkpoint
        save_checkpoint(count, pubmed_data, failed_batches, count)
    
        # Retry failed batches
        if failed_batches:
            print(f"\nRetrying {len(failed_batches)} failed batches...")
            for start, end in tqdm(failed_batches, desc="Retrying failed batches"):
                try:
                    batch_records = fetch_records_from_history(webenv, query_key, start, end - start, max_retries=5)
                    
                    if batch_records and 'PubmedArticle' in batch_records:
                        for article in batch_records['PubmedArticle']:
                            try:
                                processed = process_pubmed_record({'PubmedArticle': [article]}, affiliations_to_check)
                                pubmed_data.append(processed)
                            except Exception as e:
                                try:
                                    pmid = article['MedlineCitation']['PMID']
                                except:
                                    pmid = 'Unknown'
                                logging.error(f"Error processing PMID {pmid}: {e}")
                    
                    time.sleep(0.5)
                except Exception as e:
                    logging.error(f"Failed retry at position {start}: {e}")
        
        # Save this chunk's data (to OUTPUT_FOLDER)
        if pubmed_data:
            chunk_df = pd.DataFrame(pubmed_data)
            chunk_df.to_csv(chunk_filename, index=False)
            all_chunk_files.append(chunk_filename)
            print(f"\n‚úì Chunk {chunk_num} complete: {len(chunk_df):,} records saved to {chunk_filename}")
            logging.info(f"Chunk {chunk_num} saved: {len(chunk_df)} records")
        
        # Brief pause between chunks
        time.sleep(2)


# ========================================
#  COMBINE ALL CHUNKS
# ========================================

print(f"\n{'='*70}")
print("COMBINING ALL CHUNKS")
print(f"{'='*70}\n")

if all_chunk_files:
    all_dfs = []
    for filename in all_chunk_files:
        df = pd.read_csv(filename)
        all_dfs.append(df)
        print(f"Loaded {filename}: {len(df):,} records")
    
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset='PMID', keep='first')
    
    # Save final output to OUTPUT_FOLDER
    final_output = os.path.join(OUTPUT_FOLDER, 'hiv_imp_us_2000_2025_FINAL.csv')
    combined_df.to_csv(final_output, index=False)
    
    print(f"\n{'='*70}")
    print("‚úì PROCESSING COMPLETE!")
    print(f"{'='*70}")
    print(f"Total chunks processed: {len(all_chunk_files)}")
    print(f"Total unique records: {len(combined_df):,}")
    print(f"Final output: {final_output}")
    print(f"{'='*70}\n")
    
    logging.info(f"All {len(all_chunk_files)} chunks combined: {len(combined_df)} unique records")
else:
    print("\n‚ö† No data retrieved from any chunks!")
    logging.warning("No chunks produced data")


CHUNK 1/51: 2000/01/01 to 2000/06/30

Posting search to NCBI history server...
Total results: 22
WebEnv: MCID_6930ecc6a245bf9...
QueryKey: 1

Processing 22 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 1 complete: 22 records saved to hiv_imp_chunk_01_2000-01-01_2000-06-30.csv

CHUNK 2/51: 2000/07/01 to 2000/12/31

Posting search to NCBI history server...
Total results: 12
WebEnv: MCID_6930ecca0da1ec2...
QueryKey: 1

Processing 12 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 2 complete: 12 records saved to hiv_imp_chunk_02_2000-07-01_2000-12-31.csv

CHUNK 3/51: 2001/01/01 to 2001/06/30

Posting search to NCBI history server...
Total results: 11
WebEnv: MCID_6930eccdbc26cbd...
QueryKey: 1

Processing 11 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 3 complete: 11 records saved to hiv_imp_chunk_03_2001-01-01_2001-06-30.csv

CHUNK 4/51: 2001/07/01 to 2001/12/31

Posting search to NCBI history server...
Total results: 7
WebEnv: MCID_6930ecd08ff117b...
QueryKey: 1

Processing 7 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 4 complete: 7 records saved to hiv_imp_chunk_04_2001-07-01_2001-12-31.csv

CHUNK 5/51: 2002/01/01 to 2002/06/30

Posting search to NCBI history server...
Total results: 12
WebEnv: MCID_6930ecd466b8716...
QueryKey: 1

Processing 12 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 5 complete: 12 records saved to hiv_imp_chunk_05_2002-01-01_2002-06-30.csv

CHUNK 6/51: 2002/07/01 to 2002/12/31

Posting search to NCBI history server...
Total results: 17
WebEnv: MCID_6930ecd7754c56d...
QueryKey: 1

Processing 17 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 6 complete: 17 records saved to hiv_imp_chunk_06_2002-07-01_2002-12-31.csv

CHUNK 7/51: 2003/01/01 to 2003/06/30

Posting search to NCBI history server...
Total results: 18
WebEnv: MCID_6930ecda98217a2...
QueryKey: 1

Processing 18 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 7 complete: 18 records saved to hiv_imp_chunk_07_2003-01-01_2003-06-30.csv

CHUNK 8/51: 2003/07/01 to 2003/12/31

Posting search to NCBI history server...
Total results: 9
WebEnv: MCID_6930ecdeb347068...
QueryKey: 1

Processing 9 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 8 complete: 9 records saved to hiv_imp_chunk_08_2003-07-01_2003-12-31.csv

CHUNK 9/51: 2004/01/01 to 2004/06/30

Posting search to NCBI history server...
Total results: 23
WebEnv: MCID_6930ece14985343...
QueryKey: 1

Processing 23 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 9 complete: 23 records saved to hiv_imp_chunk_09_2004-01-01_2004-06-30.csv

CHUNK 10/51: 2004/07/01 to 2004/12/31

Posting search to NCBI history server...
Total results: 13
WebEnv: MCID_6930ece54c1c56c...
QueryKey: 1

Processing 13 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 10 complete: 13 records saved to hiv_imp_chunk_10_2004-07-01_2004-12-31.csv

CHUNK 11/51: 2005/01/01 to 2005/06/30

Posting search to NCBI history server...
Total results: 18
WebEnv: MCID_6930ece80c15cc2...
QueryKey: 1

Processing 18 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 11 complete: 18 records saved to hiv_imp_chunk_11_2005-01-01_2005-06-30.csv

CHUNK 12/51: 2005/07/01 to 2005/12/31

Posting search to NCBI history server...
Total results: 21
WebEnv: MCID_6930ecebdca5a81...
QueryKey: 1

Processing 21 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 12 complete: 21 records saved to hiv_imp_chunk_12_2005-07-01_2005-12-31.csv

CHUNK 13/51: 2006/01/01 to 2006/06/30

Posting search to NCBI history server...
Total results: 16
WebEnv: MCID_6930ecefa1a0092...
QueryKey: 1

Processing 16 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 13 complete: 16 records saved to hiv_imp_chunk_13_2006-01-01_2006-06-30.csv

CHUNK 14/51: 2006/07/01 to 2006/12/31

Posting search to NCBI history server...
Total results: 41
WebEnv: MCID_6930ecf27bb1972...
QueryKey: 1

Processing 41 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 14 complete: 41 records saved to hiv_imp_chunk_14_2006-07-01_2006-12-31.csv

CHUNK 15/51: 2007/01/01 to 2007/06/30

Posting search to NCBI history server...
Total results: 28
WebEnv: MCID_6930ecf526883e9...
QueryKey: 1

Processing 28 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 15 complete: 28 records saved to hiv_imp_chunk_15_2007-01-01_2007-06-30.csv

CHUNK 16/51: 2007/07/01 to 2007/12/31

Posting search to NCBI history server...
Total results: 28
WebEnv: MCID_6930ecf9863d050...
QueryKey: 1

Processing 28 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 16 complete: 28 records saved to hiv_imp_chunk_16_2007-07-01_2007-12-31.csv

CHUNK 17/51: 2008/01/01 to 2008/06/30

Posting search to NCBI history server...
Total results: 23
WebEnv: MCID_6930ecfc85e1e33...
QueryKey: 1

Processing 23 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 17 complete: 23 records saved to hiv_imp_chunk_17_2008-01-01_2008-06-30.csv

CHUNK 18/51: 2008/07/01 to 2008/12/31

Posting search to NCBI history server...
Total results: 26
WebEnv: MCID_6930ed005538445...
QueryKey: 1

Processing 26 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 18 complete: 26 records saved to hiv_imp_chunk_18_2008-07-01_2008-12-31.csv

CHUNK 19/51: 2009/01/01 to 2009/06/30

Posting search to NCBI history server...
Total results: 31
WebEnv: MCID_6930ed03870cf0a...
QueryKey: 1

Processing 31 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 19 complete: 30 records saved to hiv_imp_chunk_19_2009-01-01_2009-06-30.csv

CHUNK 20/51: 2009/07/01 to 2009/12/31

Posting search to NCBI history server...
Total results: 37
WebEnv: MCID_6930ed0801f4c41...
QueryKey: 1

Processing 37 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 20 complete: 37 records saved to hiv_imp_chunk_20_2009-07-01_2009-12-31.csv

CHUNK 21/51: 2010/01/01 to 2010/06/30

Posting search to NCBI history server...
Total results: 46
WebEnv: MCID_6930ed0be666928...
QueryKey: 1

Processing 46 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 21 complete: 45 records saved to hiv_imp_chunk_21_2010-01-01_2010-06-30.csv

CHUNK 22/51: 2010/07/01 to 2010/12/31

Posting search to NCBI history server...
Total results: 43
WebEnv: MCID_6930ed0fe3c64ba...
QueryKey: 1

Processing 43 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 22 complete: 43 records saved to hiv_imp_chunk_22_2010-07-01_2010-12-31.csv

CHUNK 23/51: 2011/01/01 to 2011/06/30

Posting search to NCBI history server...
Total results: 50
WebEnv: MCID_6930ed139cce85a...
QueryKey: 1

Processing 50 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 23 complete: 50 records saved to hiv_imp_chunk_23_2011-01-01_2011-06-30.csv

CHUNK 24/51: 2011/07/01 to 2011/12/31

Posting search to NCBI history server...
Total results: 45
WebEnv: MCID_6930ed1601e0d3b...
QueryKey: 1

Processing 45 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 24 complete: 45 records saved to hiv_imp_chunk_24_2011-07-01_2011-12-31.csv

CHUNK 25/51: 2012/01/01 to 2012/06/30

Posting search to NCBI history server...
Total results: 53
WebEnv: MCID_6930ed1a80b828e...
QueryKey: 1

Processing 53 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 25 complete: 53 records saved to hiv_imp_chunk_25_2012-01-01_2012-06-30.csv

CHUNK 26/51: 2012/07/01 to 2012/12/31

Posting search to NCBI history server...
Total results: 41
WebEnv: MCID_6930ed1da2b481f...
QueryKey: 1

Processing 41 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 26 complete: 41 records saved to hiv_imp_chunk_26_2012-07-01_2012-12-31.csv

CHUNK 27/51: 2013/01/01 to 2013/06/30

Posting search to NCBI history server...
Total results: 62
WebEnv: MCID_6930ed21a41f3ba...
QueryKey: 1

Processing 62 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 27 complete: 62 records saved to hiv_imp_chunk_27_2013-01-01_2013-06-30.csv

CHUNK 28/51: 2013/07/01 to 2013/12/31

Posting search to NCBI history server...
Total results: 50
WebEnv: MCID_6930ed26a45350e...
QueryKey: 1

Processing 50 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 28 complete: 50 records saved to hiv_imp_chunk_28_2013-07-01_2013-12-31.csv

CHUNK 29/51: 2014/01/01 to 2014/06/30

Posting search to NCBI history server...
Total results: 80
WebEnv: MCID_6930ed2b6176e1a...
QueryKey: 1

Processing 80 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 29 complete: 80 records saved to hiv_imp_chunk_29_2014-01-01_2014-06-30.csv

CHUNK 30/51: 2014/07/01 to 2014/12/31

Posting search to NCBI history server...
Total results: 66
WebEnv: MCID_6930ed30edac90a...
QueryKey: 1

Processing 66 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 30 complete: 66 records saved to hiv_imp_chunk_30_2014-07-01_2014-12-31.csv

CHUNK 31/51: 2015/01/01 to 2015/06/30

Posting search to NCBI history server...
Total results: 70
WebEnv: MCID_6930ed34c9631e1...
QueryKey: 1

Processing 70 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 31 complete: 70 records saved to hiv_imp_chunk_31_2015-01-01_2015-06-30.csv

CHUNK 32/51: 2015/07/01 to 2015/12/31

Posting search to NCBI history server...
Total results: 69
WebEnv: MCID_6930ed3949af0e4...
QueryKey: 1

Processing 69 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 32 complete: 69 records saved to hiv_imp_chunk_32_2015-07-01_2015-12-31.csv

CHUNK 33/51: 2016/01/01 to 2016/06/30

Posting search to NCBI history server...
Total results: 111
WebEnv: MCID_6930ed3f803867d...
QueryKey: 1

Processing 111 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 33 complete: 111 records saved to hiv_imp_chunk_33_2016-01-01_2016-06-30.csv

CHUNK 34/51: 2016/07/01 to 2016/12/31

Posting search to NCBI history server...
Total results: 72
WebEnv: MCID_6930ed44778dbdd...
QueryKey: 1

Processing 72 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 34 complete: 72 records saved to hiv_imp_chunk_34_2016-07-01_2016-12-31.csv

CHUNK 35/51: 2017/01/01 to 2017/06/30

Posting search to NCBI history server...
Total results: 88
WebEnv: MCID_6930ed49b38f600...
QueryKey: 1

Processing 88 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 35 complete: 88 records saved to hiv_imp_chunk_35_2017-01-01_2017-06-30.csv

CHUNK 36/51: 2017/07/01 to 2017/12/31

Posting search to NCBI history server...
Total results: 71
WebEnv: MCID_6930ed4ed632a91...
QueryKey: 1

Processing 71 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 36 complete: 70 records saved to hiv_imp_chunk_36_2017-07-01_2017-12-31.csv

CHUNK 37/51: 2018/01/01 to 2018/06/30

Posting search to NCBI history server...
Total results: 90
WebEnv: MCID_6930ed537c7ed36...
QueryKey: 1

Processing 90 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 37 complete: 89 records saved to hiv_imp_chunk_37_2018-01-01_2018-06-30.csv

CHUNK 38/51: 2018/07/01 to 2018/12/31

Posting search to NCBI history server...
Total results: 88
WebEnv: MCID_6930ed5920efb7c...
QueryKey: 1

Processing 88 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 38 complete: 88 records saved to hiv_imp_chunk_38_2018-07-01_2018-12-31.csv

CHUNK 39/51: 2019/01/01 to 2019/06/30

Posting search to NCBI history server...
Total results: 96
WebEnv: MCID_6930ed5d0a52b6d...
QueryKey: 1

Processing 96 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 39 complete: 96 records saved to hiv_imp_chunk_39_2019-01-01_2019-06-30.csv

CHUNK 40/51: 2019/07/01 to 2019/12/31

Posting search to NCBI history server...
Total results: 113
WebEnv: MCID_6930ed62e1c5005...
QueryKey: 1

Processing 113 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 40 complete: 113 records saved to hiv_imp_chunk_40_2019-07-01_2019-12-31.csv

CHUNK 41/51: 2020/01/01 to 2020/06/30

Posting search to NCBI history server...
Total results: 123
WebEnv: MCID_6930ed675538445...
QueryKey: 1

Processing 123 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 41 complete: 123 records saved to hiv_imp_chunk_41_2020-01-01_2020-06-30.csv

CHUNK 42/51: 2020/07/01 to 2020/12/31

Posting search to NCBI history server...
Total results: 103
WebEnv: MCID_6930ed6c07fbe43...
QueryKey: 1

Processing 103 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 42 complete: 103 records saved to hiv_imp_chunk_42_2020-07-01_2020-12-31.csv

CHUNK 43/51: 2021/01/01 to 2021/06/30

Posting search to NCBI history server...
Total results: 143
WebEnv: MCID_6930ed715376b67...
QueryKey: 1

Processing 143 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 43 complete: 143 records saved to hiv_imp_chunk_43_2021-01-01_2021-06-30.csv

CHUNK 44/51: 2021/07/01 to 2021/12/31

Posting search to NCBI history server...
Total results: 106
WebEnv: MCID_6930ed750890e40...
QueryKey: 1

Processing 106 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 44 complete: 106 records saved to hiv_imp_chunk_44_2021-07-01_2021-12-31.csv

CHUNK 45/51: 2022/01/01 to 2022/06/30

Posting search to NCBI history server...
Total results: 130
WebEnv: MCID_6930ed7ab64a346...
QueryKey: 1

Processing 130 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 45 complete: 130 records saved to hiv_imp_chunk_45_2022-01-01_2022-06-30.csv

CHUNK 46/51: 2022/07/01 to 2022/12/31

Posting search to NCBI history server...
Total results: 174
WebEnv: MCID_6930ed808c03c9e...
QueryKey: 1

Processing 174 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 46 complete: 174 records saved to hiv_imp_chunk_46_2022-07-01_2022-12-31.csv

CHUNK 47/51: 2023/01/01 to 2023/06/30

Posting search to NCBI history server...
Total results: 159
WebEnv: MCID_6930ed8566b8716...
QueryKey: 1

Processing 159 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 47 complete: 159 records saved to hiv_imp_chunk_47_2023-01-01_2023-06-30.csv

CHUNK 48/51: 2023/07/01 to 2023/12/31

Posting search to NCBI history server...
Total results: 157
WebEnv: MCID_6930ed8bc248ca2...
QueryKey: 1

Processing 157 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 48 complete: 157 records saved to hiv_imp_chunk_48_2023-07-01_2023-12-31.csv

CHUNK 49/51: 2024/01/01 to 2024/06/30

Posting search to NCBI history server...
Total results: 124
WebEnv: MCID_6930ed90bd5b3bd...
QueryKey: 1

Processing 124 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 49 complete: 124 records saved to hiv_imp_chunk_49_2024-01-01_2024-06-30.csv

CHUNK 50/51: 2024/07/01 to 2024/12/31

Posting search to NCBI history server...
Total results: 127
WebEnv: MCID_6930ed96a56d289...
QueryKey: 1

Processing 127 records in batches of 200...


Processing records:   0%|          | 0/1 [00:00<?, ?it/s]


‚úì Chunk 50 complete: 127 records saved to hiv_imp_chunk_50_2024-07-01_2024-12-31.csv

CHUNK 51/51: 2025/01/01 to 2025/12/03

Posting search to NCBI history server...
Total results: 305
WebEnv: MCID_6930ed9b225bb2f...
QueryKey: 1

Processing 305 records in batches of 200...


Processing records:   0%|          | 0/2 [00:00<?, ?it/s]


‚úì Chunk 51 complete: 304 records saved to hiv_imp_chunk_51_2025-01-01_2025-12-03.csv

COMBINING ALL CHUNKS

Loaded hiv_imp_chunk_01_2000-01-01_2000-06-30.csv: 22 records
Loaded hiv_imp_chunk_02_2000-07-01_2000-12-31.csv: 12 records
Loaded hiv_imp_chunk_03_2001-01-01_2001-06-30.csv: 11 records
Loaded hiv_imp_chunk_04_2001-07-01_2001-12-31.csv: 7 records
Loaded hiv_imp_chunk_05_2002-01-01_2002-06-30.csv: 12 records
Loaded hiv_imp_chunk_06_2002-07-01_2002-12-31.csv: 17 records
Loaded hiv_imp_chunk_07_2003-01-01_2003-06-30.csv: 18 records
Loaded hiv_imp_chunk_08_2003-07-01_2003-12-31.csv: 9 records
Loaded hiv_imp_chunk_09_2004-01-01_2004-06-30.csv: 23 records
Loaded hiv_imp_chunk_10_2004-07-01_2004-12-31.csv: 13 records
Loaded hiv_imp_chunk_11_2005-01-01_2005-06-30.csv: 18 records
Loaded hiv_imp_chunk_12_2005-07-01_2005-12-31.csv: 21 records
Loaded hiv_imp_chunk_13_2006-01-01_2006-06-30.csv: 16 records
Loaded hiv_imp_chunk_14_2006-07-01_2006-12-31.csv: 41 records
Loaded hiv_imp_chunk_15

In [None]:
# ========================================
#  Use spaCy to identify geolocations
# ========================================

# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#INPUT_FOLDER = 'hiv imp'  # Change to 'hiv imp' for the other dataset
INPUT_FOLDER = 'hiv not imp'  # Change to 'hiv imp' for the other dataset
# ========================================


# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_locations_spacy(text):
    if pd.isna(text):
        return None
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    return list(set(locations)) if locations else None

# Load file from folder
input_file = os.path.join(INPUT_FOLDER, 'hiv_us_2000_2025_FINAL.csv')
print(f"Loading {input_file}...")

if not os.path.exists(input_file):
    print(f"ERROR: File not found: {input_file}")
    print(f"Available files in {INPUT_FOLDER}:")
    if os.path.exists(INPUT_FOLDER):
        for f in os.listdir(INPUT_FOLDER):
            if f.endswith('.csv'):
                print(f"  {f}")
else:
    combined_df = pd.read_csv(input_file)
    print(f"Loaded {len(combined_df):,} records")
    
    # Prepare text
    combined_df['TitleAbstract'] = combined_df['ArticleTitle'].fillna('') + ' ' + combined_df['Abstract'].fillna('')
    
    # Process in chunks with checkpoints
    print("Extracting geographic locations...")
    chunk_size = 10000
    all_locations = []
    
    for i in tqdm(range(0, len(combined_df), chunk_size)):
        chunk = combined_df.iloc[i:i+chunk_size]
        chunk_locations = chunk['TitleAbstract'].apply(extract_locations_spacy)
        all_locations.extend(chunk_locations.tolist())
        
        # Save checkpoint every 50k records
        if (i + chunk_size) % 50000 == 0:
            print(f"\nCheckpoint: processed {i + chunk_size:,} records")
    
    combined_df['GeographicLocations'] = all_locations
    
    # Save final output to same folder
    output_file = os.path.join(INPUT_FOLDER, 'hiv_us_2000_2025_with_locations.csv')
    combined_df.to_csv(output_file, index=False)
    
    print(f"\n‚úì Complete! Saved to {output_file}")
    print(f"Total records with locations: {combined_df['GeographicLocations'].notna().sum():,}")

Loading hiv_imp_us_2000_2025_FINAL.csv...
Loaded 160,602 records
Extracting geographic locations...


 29%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                                        | 5/17 [40:59<1:40:19, 501.64s/it]


Checkpoint: processed 50,000 records


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                               | 10/17 [1:28:20<1:04:50, 555.77s/it]


Checkpoint: processed 100,000 records


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã         | 15/17 [2:18:36<20:15, 607.88s/it]


Checkpoint: processed 150,000 records


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [2:30:23<00:00, 530.79s/it]



‚úì Complete! Saved to hiv not imp/hiv_us_2000_2025_with_locations.csv


In [17]:
# ========================================
#  Exract Unique Locations for Manual Review
# ========================================

# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#INPUT_FOLDER = 'hiv imp'  # Change to 'hiv not imp' for the other dataset
INPUT_FOLDER = 'hiv not imp'  
# ========================================

# Determine input filename based on folder
if INPUT_FOLDER == 'hiv imp':
    input_filename = 'hiv_imp_us_2000_2025_with_locations.csv'
    output_filename = 'hiv_imp_locations_to_review.xlsx'
else:  # 'hiv not imp'
    input_filename = 'hiv_us_2000_2025_with_locations.csv'
    output_filename = 'hiv_locations_to_review.xlsx'

# Load file from folder
input_file = os.path.join(INPUT_FOLDER, input_filename)
print(f"Loading {input_file}...")

if not os.path.exists(input_file):
    print(f"ERROR: File not found: {input_file}")
    print(f"Available files in {INPUT_FOLDER}:")
    if os.path.exists(INPUT_FOLDER):
        for f in os.listdir(INPUT_FOLDER):
            if f.endswith('.csv'):
                print(f"  {f}")
else:
    df = pd.read_csv(input_file)
    print(f"Loaded {len(df):,} records")
    
    # Extract all locations
    print("Extracting unique locations...")
    all_locations = []
    for loc_str in df['GeographicLocations'].dropna():
        try:
            if isinstance(loc_str, str):
                locs = ast.literal_eval(loc_str)
                if isinstance(locs, list):
                    all_locations.extend(locs)
        except:
            pass
    
    # Count frequencies
    location_counts = Counter(all_locations)
    
    # Create DataFrame sorted alphabetically (easier to scan)
    freq_df = pd.DataFrame([
        {'Location': loc, 'Count': count} 
        for loc, count in sorted(location_counts.items())
    ])
    
    # Add DELETE column for marking
    freq_df['DELETE?'] = ''
    
    # Save to Excel in same folder
    output_file = os.path.join(INPUT_FOLDER, output_filename)
    freq_df.to_excel(output_file, index=False)
    
    print(f"\n‚úì Exported {len(freq_df)} unique locations to {output_file}")
    print(f"Total location mentions: {sum(location_counts.values()):,}")
    
    print("\nTop 20 most common locations:")
    for loc, count in location_counts.most_common(20):
        print(f"  {loc}: {count:,}")
    
    print("\nNext steps:")
    print(f"1. Open {output_file} in Excel")
    print("2. Put 'X' in the DELETE? column for false positives")
    print("3. Save the file")
    print("4. Run Step 2 code to create exclusion list")

Loading hiv not imp\hiv_us_2000_2025_with_locations.csv...
Loaded 160,602 records
Extracting unique locations...

‚úì Exported 11173 unique locations to hiv not imp\hiv_locations_to_review.xlsx
Total location mentions: 90,154

Top 20 most common locations:
  the United States: 5,778
  US: 3,665
  U.S.: 1,893
  USA: 1,594
  RT: 1,351
  South Africa: 1,306
  New York City: 1,104
  PWH: 968
  syphilis: 937
  CCR5: 890
  PLWH: 872
  United States: 846
  DC: 822
  CD8: 801
  MD: 710
  San Francisco: 706
  India: 675
  NRTIs: 643
  China: 619
  California: 612

Next steps:
1. Open hiv not imp\hiv_locations_to_review.xlsx in Excel
2. Put 'X' in the DELETE? column for false positives
3. Save the file
4. Run Step 2 code to create exclusion list


In [23]:
# ========================================
#  Remove Bad Locations from original file
# ========================================

# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#INPUT_FOLDER = 'hiv imp'  # Change to 'hiv imp' for the other dataset
INPUT_FOLDER = 'hiv not imp'  # Change to 'hiv imp' for the other dataset

# ========================================

# Determine filenames based on folder
if INPUT_FOLDER == 'hiv imp':
    review_filename = 'hiv_imp_locations_to_review_COMPLETED.xlsx'
    input_filename = 'hiv_imp_us_2000_2025_with_locations.csv'
    output_filename = 'hiv_imp_us_2000_2025_with_locations_CLEANED.csv'
else:  # 'hiv not imp'
    review_filename = 'hiv_locations_to_review_COMPLETED.xlsx'
    input_filename = 'hiv_us_2000_2025_with_locations.csv'
    output_filename = 'hiv_us_2000_2025_with_locations_CLEANED.csv'

# ===== STEP 1: Read marked deletions from Excel =====
review_file = os.path.join(INPUT_FOLDER, review_filename)
print(f"Reading {review_file}...")

if not os.path.exists(review_file):
    print(f"ERROR: Review file not found: {review_file}")
    print(f"Make sure you've completed the review and saved it as '{review_filename}'")
else:
    review_df = pd.read_excel(review_file)
    
    # Get locations marked for deletion
    marked_for_deletion = review_df[review_df['DELETE?'].str.upper() == 'X']
    
    # print(f"Found {len(marked_for_deletion)} locations marked for deletion\n")
    # print("="*60)
    # print("Copy this list into your cleaning code:")
    # print("="*60)
    # print("\nbad_locations = [")
    # for loc in marked_for_deletion['Location']:
    #     print(f"    '{loc}',")
    # print("]")
    print(f"\n\nTotal to remove: {marked_for_deletion['Count'].sum():,} location mentions")
    
    # ===== STEP 2: Create bad_locations list from marked items =====
    bad_locations = marked_for_deletion['Location'].tolist()
   # print(f"\nCreated bad_locations list with {len(bad_locations)} items")
    
    # ===== STEP 3: Load and clean data =====
    input_file = os.path.join(INPUT_FOLDER, input_filename)
    print(f"\nLoading {input_file}...")
    df = pd.read_csv(input_file)
    print(f"Loaded {len(df):,} records")
    
    def clean_locations(loc_list_str):
        if pd.isna(loc_list_str):
            return None
        
        try:
            # Convert string to list
            if isinstance(loc_list_str, str):
                locs = ast.literal_eval(loc_list_str)
            else:
                locs = loc_list_str
            
            # Filter out bad locations
            if isinstance(locs, list):
                cleaned = [loc for loc in locs if loc not in bad_locations]
                return cleaned if cleaned else None
            else:
                return None
        except:
            return None
    
    # Apply cleaning
    print("Cleaning locations...")
    df['GeographicLocations'] = df['GeographicLocations'].apply(clean_locations)
    
    # Save cleaned version to same folder
    output_file = os.path.join(INPUT_FOLDER, output_filename)
    df.to_csv(output_file, index=False)
    
    print(f"\n‚úì Saved cleaned file to {output_file}")
    print(f"Removed {len(bad_locations)} false positive location types")
    print(f"Total location mentions removed: {marked_for_deletion['Count'].sum():,}")
    
    # Show before/after stats
    original_with_locs = pd.read_csv(input_file)['GeographicLocations'].notna().sum()
    cleaned_with_locs = df['GeographicLocations'].notna().sum()
    print(f"\nRecords with locations:")
    print(f"  Before cleaning: {original_with_locs:,}")
    print(f"  After cleaning: {cleaned_with_locs:,}")
    print(f"  Difference: {original_with_locs - cleaned_with_locs:,}")

Reading hiv not imp\hiv_locations_to_review_COMPLETED.xlsx...


Total to remove: 44,226 location mentions

Loading hiv not imp\hiv_us_2000_2025_with_locations.csv...
Loaded 160,602 records
Cleaning locations...

‚úì Saved cleaned file to hiv not imp\hiv_us_2000_2025_with_locations_CLEANED.csv
Removed 8969 false positive location types
Total location mentions removed: 44,226

Records with locations:
  Before cleaning: 55,993
  After cleaning: 28,316
  Difference: 27,677


In [20]:
# ========================================
#  Flag geolocations as US or International
# ========================================

# ========================================
# CONFIGURATION - CHANGE THIS FOR EACH RUN
# ========================================
#INPUT_FOLDER = 'hiv imp'  # Change to 'hiv imp' for the other dataset
INPUT_FOLDER = 'hiv not imp'  # Change to 'hiv imp' for the other dataset
# ========================================

# Determine filenames based on folder
if INPUT_FOLDER == 'hiv imp':
    input_filename = 'hiv_imp_us_2000_2025_with_locations_CLEANED.csv'
    output_filename = 'hiv_imp_us_2000_2025_with_location_classification.csv'
    cache_filename = 'location_hiv_imp_cache.pkl'
else:  # 'hiv not imp'
    input_filename = 'hiv_us_2000_2025_with_locations_CLEANED.csv'
    output_filename = 'hiv_us_2000_2025_with_location_classification.csv'
    cache_filename = 'location_hiv_cache.pkl'

# Load cleaned data
input_file = os.path.join(INPUT_FOLDER, input_filename)
print(f"Loading {input_file}...")

if not os.path.exists(input_file):
    print(f"ERROR: File not found: {input_file}")
    print(f"Available files in {INPUT_FOLDER}:")
    if os.path.exists(INPUT_FOLDER):
        for f in os.listdir(INPUT_FOLDER):
            if f.endswith('.csv'):
                print(f"  {f}")
    exit()

df = pd.read_csv(input_file)
print(f"Loaded {len(df):,} records")

# ===== STEP 1: Define rule-based US locations =====
us_locations = {
    # Country names
    'United States', 'USA', 'US', 'U.S.', 'U.S.A.', 'America', 'United States of America',
    
    # States (full names)
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
    
    # State abbreviations
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID',
    'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
    'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
    'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
    'WI', 'WY',
    
    # Territories
    'Puerto Rico', 'Guam', 'Virgin Islands', 'American Samoa',
    'Northern Mariana Islands', 'District of Columbia',
    
    # Major cities
    'New York City', 'NYC', 'Los Angeles', 'LA', 'Chicago', 'Houston', 
    'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 
    'San Jose', 'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 
    'San Francisco', 'Charlotte', 'Indianapolis', 'Seattle', 'Denver', 
    'Washington DC', 'DC', 'Boston', 'Nashville', 'Detroit', 'Portland', 
    'Memphis', 'Atlanta', 'Miami', 'Baltimore', 'Minneapolis', 'Cleveland', 
    'New Orleans', 'Tampa', 'Pittsburgh', 'Cincinnati', 'Newark',
    
    # Metro variations
    'Metropolitan DC', 'Metro DC', 'Washington Metropolitan Area',
    'Greater Los Angeles', 'Greater New York', 'Bay Area', 'San Francisco Bay Area',
    
    # Regions
    'North America', 'Midwest', 'Northeast', 'Southeast', 'Southwest',
    'Pacific Northwest', 'New England', 'Mid-Atlantic', 'Deep South',
}

us_locations_lower = {loc.lower() for loc in us_locations}

# Patterns indicating US locations
us_patterns = [
    r'\bCounty\b',
    r'\bParish\b',
    r'\bBorough\b',
    r'\bMetropolitan\b',
    r'\bMetro\b',
    r'\bGreater\b',
]

# ===== STEP 2: Load or create cache =====
cache_file = os.path.join(INPUT_FOLDER, cache_filename)
if os.path.exists(cache_file):
    with open(cache_file, 'rb') as f:
        location_cache = pickle.load(f)
    print(f"Loaded {len(location_cache)} cached location lookups")
else:
    location_cache = {}
    print("Starting with empty cache")

# Initialize geocoder
geolocator = Nominatim(user_agent="hiv_research_classifier_v1")

# ===== STEP 3: Hybrid detection function =====
def is_us_location_hybrid(location, use_geopy=True):
    """
    Three-stage detection:
    1. Rule-based (instant)
    2. Pattern matching (instant)
    3. Geopy lookup (slow, only for unknowns)
    """
    loc_lower = location.lower()
    
    # Stage 1: Direct match
    if loc_lower in us_locations_lower:
        return True
    
    # Stage 2: Pattern matching
    for pattern in us_patterns:
        if re.search(pattern, location, re.IGNORECASE):
            return True
    
    # Check if state name is part of the location string
    for state in us_locations:
        if state in location and len(state) > 3:  # Avoid matching short abbreviations randomly
            return True
    
    # Stage 3: Geopy (only for uncertain cases)
    if use_geopy:
        if location not in location_cache:
            try:
                time.sleep(1.1)  # Rate limit: 1 request per second
                result = geolocator.geocode(location, addressdetails=True, timeout=10)
                
                if result and hasattr(result, 'raw'):
                    country_code = result.raw.get('address', {}).get('country_code', '').upper()
                    location_cache[location] = (country_code == 'US')
                else:
                    location_cache[location] = False
                    
            except Exception as e:
                location_cache[location] = False
        
        return location_cache.get(location, False)
    
    return False

# ===== STEP 4: Extract all unique locations =====
print("\nExtracting unique locations...")
all_locations = set()
for loc_str in tqdm(df['GeographicLocations'].dropna(), desc="Scanning records"):
    try:
        if isinstance(loc_str, str):
            locs = ast.literal_eval(loc_str)
            if isinstance(locs, list):
                all_locations.update(locs)
    except:
        pass

print(f"\nFound {len(all_locations)} unique locations to classify")

# ===== STEP 5: First pass - rule-based only =====
print("\n--- PHASE 1: Rule-based classification ---")
unknown_locations = []

for loc in tqdm(all_locations, desc="Rule-based detection"):
    is_us = is_us_location_hybrid(loc, use_geopy=False)
    if is_us:
        location_cache[loc] = True
    else:
        unknown_locations.append(loc)

print(f"‚úì Classified {len(all_locations) - len(unknown_locations)} locations as US using rules")
print(f"  Remaining unknowns: {len(unknown_locations)}")

# ===== STEP 6: Second pass - geopy for unknowns =====
if unknown_locations:
    print("\n--- PHASE 2: Geopy lookup for unknowns ---")
    print(f"This will take approximately {len(unknown_locations) * 1.1 / 60:.1f} minutes")
    
    user_input = input("\nProceed with geopy lookups? (y/n): ").lower()
    
    if user_input == 'y':
        for loc in tqdm(unknown_locations, desc="Geopy lookups"):
            is_us_location_hybrid(loc, use_geopy=True)
        
        # Save cache after geopy phase
        with open(cache_file, 'wb') as f:
            pickle.dump(location_cache, f)
        print(f"\n‚úì Cached {len(location_cache)} location lookups")
    else:
        print("\nSkipping geopy lookups. Unknowns will be classified as Non-US.")

# ===== STEP 7: Classify all records =====
print("\n--- PHASE 3: Classifying all records ---")

def classify_location_detailed(loc_list_str):
    """
    Returns:
    - 'US only'
    - 'Mixed (US + Non-US)'
    - 'Non-US only'
    - 'No locations'
    """
    if pd.isna(loc_list_str):
        return 'No locations'
    
    try:
        if isinstance(loc_list_str, str):
            locs = ast.literal_eval(loc_list_str)
        else:
            locs = loc_list_str
        
        if not locs or not isinstance(locs, list):
            return 'No locations'
        
        has_us = any(location_cache.get(loc, False) for loc in locs)
        has_non_us = any(not location_cache.get(loc, False) for loc in locs)
        
        if has_us and has_non_us:
            return 'Mixed (US + Non-US)'
        elif has_us:
            return 'US only'
        elif has_non_us:
            return 'Non-US only'
        else:
            return 'No locations'
            
    except:
        return 'No locations'

# Apply classification with progress bar
tqdm.pandas(desc="Classifying records")
df['Location_Detail'] = df['GeographicLocations'].progress_apply(classify_location_detailed)

# ===== STEP 8: Results and save =====
print("\n" + "="*60)
print("CLASSIFICATION RESULTS")
print("="*60)
print(df['Location_Detail'].value_counts())
print(f"\nTotal records: {len(df):,}")

# Save to same folder
output_file = os.path.join(INPUT_FOLDER, output_filename)
df.to_csv(output_file, index=False)
print(f"\n‚úì Saved to {output_file}")

# Final cache save
with open(cache_file, 'wb') as f:
    pickle.dump(location_cache, f)
print(f"‚úì Saved location cache ({len(location_cache)} entries)")

Loading hiv not imp\hiv_us_2000_2025_with_locations_CLEANED.csv...
Loaded 160,602 records
Starting with empty cache

Extracting unique locations...


Scanning records: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28316/28316 [00:00<00:00, 38409.18it/s]



Found 2339 unique locations to classify

--- PHASE 1: Rule-based classification ---


Rule-based detection: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2339/2339 [00:00<00:00, 40637.90it/s]


‚úì Classified 484 locations as US using rules
  Remaining unknowns: 1855

--- PHASE 2: Geopy lookup for unknowns ---
This will take approximately 34.0 minutes



Proceed with geopy lookups? (y/n):  y


Geopy lookups: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1855/1855 [49:33<00:00,  1.60s/it]



‚úì Cached 2339 location lookups

--- PHASE 3: Classifying all records ---


Classifying records: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 160602/160602 [00:00<00:00, 170877.58it/s]



CLASSIFICATION RESULTS
Location_Detail
No locations           132286
US only                 16250
Non-US only              9695
Mixed (US + Non-US)      2371
Name: count, dtype: int64

Total records: 160,602

‚úì Saved to hiv not imp\hiv_us_2000_2025_with_location_classification.csv
‚úì Saved location cache (2339 entries)


In [None]:
# ========================================
#  Find additional implementation science papers using machine learning
# ========================================

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# ========================================
# CONFIGURATION
# ========================================
OUTPUT_FOLDER = 'hiv imp proj validation'  # All outputs go here
FILE_A_FOLDER = 'hiv not imp'  # Non-implementation data
FILE_B_FOLDER = 'hiv imp'  # Implementation data
# ========================================

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Define folders
folder_a = FILE_A_FOLDER
folder_b = FILE_B_FOLDER

# Load both datasets
file_a_path = os.path.join(folder_a, 'hiv_us_2000_2025_FINAL.csv')
file_b_path = os.path.join(folder_b, 'hiv_imp_us_2000_2025_FINAL.csv')

print(f"Loading File A: {file_a_path}")
file_a = pd.read_csv(file_a_path)

print(f"Loading File B: {file_b_path}")
file_b = pd.read_csv(file_b_path)

# Combine title + abstract for both
file_a['text'] = file_a['ArticleTitle'].fillna('') + ' ' + file_a['Abstract'].fillna('')
file_b['text'] = file_b['ArticleTitle'].fillna('') + ' ' + file_b['Abstract'].fillna('')

# Remove empty texts
file_a = file_a[file_a['text'].str.strip() != '']
file_b = file_b[file_b['text'].str.strip() != '']

print(f"\nFile A (non-implementation): {len(file_a):,} records")
print(f"File B (implementation): {len(file_b):,} records")

# Create TF-IDF vectorizer on implementation papers
print("\nVectorizing implementation science papers...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    min_df=5,
    stop_words='english'
)

# Fit on File B (known implementation papers)
tfidf_implementation = vectorizer.fit_transform(file_b['text'])

# Get top distinguishing terms for implementation science
feature_names = vectorizer.get_feature_names_out()
tfidf_means = tfidf_implementation.mean(axis=0).A1
top_terms_idx = tfidf_means.argsort()[-100:][::-1]

print("\nTop 50 implementation science terms/phrases:")
for idx in top_terms_idx[:50]:
    print(f"  {feature_names[idx]}")

# Save top terms to output folder
top_terms_df = pd.DataFrame({
    'Term': [feature_names[idx] for idx in top_terms_idx],
    'TF-IDF Score': [tfidf_means[idx] for idx in top_terms_idx]
})
top_terms_file = os.path.join(OUTPUT_FOLDER, 'top_implementation_terms.csv')
top_terms_df.to_csv(top_terms_file, index=False)
print(f"\n‚úì Saved top terms to {top_terms_file}")

# ===== STAGE 2: Score File A papers =====
print("\nScoring File A papers...")
tfidf_file_a = vectorizer.transform(file_a['text'])

# Calculate similarity to implementation science centroid
implementation_centroid = tfidf_implementation.mean(axis=0)
implementation_centroid = np.asarray(implementation_centroid)

similarities = cosine_similarity(tfidf_file_a, implementation_centroid).flatten()

# Add similarity scores to File A
file_a['implementation_similarity'] = similarities

# Sort by similarity
file_a_sorted = file_a.sort_values('implementation_similarity', ascending=False)

# Show distribution
print("\nSimilarity Score Distribution:")
print(file_a_sorted['implementation_similarity'].describe())

# Export top candidates for review
top_n = 1000
top_candidates = file_a_sorted.head(top_n)[['PMID', 'ArticleTitle', 'Abstract', 'implementation_similarity']]
top_candidates_file = os.path.join(OUTPUT_FOLDER, 'potential_implementation_papers.xlsx')
top_candidates.to_excel(top_candidates_file, index=False)
print(f"\n‚úì Exported top {top_n} candidates to {top_candidates_file}")

# ===== STAGE 3: Add concept flags =====
print("\nAdding concept flags...")
implementation_concepts = {
    'scale_up': ['scale up', 'scale-up', 'scaling', 'scaleup'],
    'real_world': ['real world', 'real-world', 'pragmatic', 'effectiveness'],
    'pilot': ['pilot', 'demonstration', 'proof of concept'],
    'rollout': ['rollout', 'roll out', 'deployment'],
    'workflow': ['workflow', 'work flow', 'clinical workflow'],
    'training': ['training program', 'capacity building', 'provider training'],
    'guideline': ['guideline adherence', 'protocol adherence', 'clinical guidelines'],
    'quality_improvement': ['quality improvement', 'QI', 'continuous improvement'],
    'clinic_level': ['clinic level', 'clinic-level', 'facility level'],
    'patient_centered': ['patient centered', 'patient-centered', 'patient engagement']
}

# Flag papers with these concepts
for concept, terms in implementation_concepts.items():
    pattern = '|'.join(terms)
    file_a_sorted[f'has_{concept}'] = file_a_sorted['text'].str.contains(pattern, case=False, na=False)

# Create composite flag
concept_columns = [f'has_{concept}' for concept in implementation_concepts.keys()]
file_a_sorted['num_concepts'] = file_a_sorted[concept_columns].sum(axis=1)

print("‚úì Concept flags added")

# ===== STAGE 4: Stratified sampling for validation =====
print("\nCreating validation sample...")
bins = [0, 0.1, 0.2, 0.3, 0.4, 1.0]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
file_a_sorted['similarity_bin'] = pd.cut(file_a_sorted['implementation_similarity'], bins=bins, labels=labels)

samples = []
for bin_label in labels:
    bin_data = file_a_sorted[file_a_sorted['similarity_bin'] == bin_label]
    sample = bin_data.sample(min(50, len(bin_data)), random_state=42)
    samples.append(sample)

validation_set = pd.concat(samples)
validation_set['YOUR_REVIEW'] = ''

validation_file = os.path.join(OUTPUT_FOLDER, 'validation_sample.xlsx')
validation_set[['PMID', 'ArticleTitle', 'Abstract', 'implementation_similarity', 
                'num_concepts', 'similarity_bin', 'YOUR_REVIEW']].to_excel(validation_file, index=False)

print(f"\n‚úì Exported {len(validation_set)} papers for validation to {validation_file}")
print("\nDistribution by similarity bin:")
print(validation_set['similarity_bin'].value_counts().sort_index())

# ===== Save file_a_sorted for later use =====
file_a_sorted_file = os.path.join(OUTPUT_FOLDER, 'file_a_with_scores.pkl')
file_a_sorted.to_pickle(file_a_sorted_file)
print(f"\n‚úì Saved scored dataset to {file_a_sorted_file}")

print("\n" + "="*60)
print("NEXT STEPS:")
print("="*60)
print(f"1. Open {validation_file} in Excel")
print("2. Review papers and mark 'Y' for implementation science, 'N' for not")
print("3. Save as 'validation_sample_COMPLETED.xlsx' in the same folder")
print("4. Run the threshold determination code")
print("="*60)

In [None]:
# ========================================
# Find validation threshold and apply labels
# ========================================

# ========================================
# CONFIGURATION
# ========================================
OUTPUT_FOLDER = 'hiv imp proj validation'
FILE_A_FOLDER = 'hiv not imp'
# ========================================

# Step 1: Read your reviews and find threshold
print("Reading validation results...")
validation_file = os.path.join(OUTPUT_FOLDER, 'validation_sample_COMPLETED.xlsx')
reviewed = pd.read_excel(validation_file)
implementation_papers = reviewed[reviewed['YOUR_REVIEW'].str.upper() == 'Y']

# Find optimal threshold
threshold = implementation_papers['implementation_similarity'].min()
print(f"Suggested threshold: {threshold:.3f}")
print(f"Papers marked as implementation in validation: {len(implementation_papers)}")

# Step 2: Load file_a_sorted
file_a_sorted_file = os.path.join(OUTPUT_FOLDER, 'file_a_with_scores.pkl')
print(f"\nLoading scored dataset from {file_a_sorted_file}...")
file_a_sorted = pd.read_pickle(file_a_sorted_file)

# Step 3: Load the full dataset with location classification
input_file = os.path.join(FILE_A_FOLDER, 'hiv_us_2000_2025_with_location_classification.csv')
print(f"\nLoading full dataset: {input_file}")
full_dataset = pd.read_csv(input_file)
print(f"Total records in full dataset: {len(full_dataset):,}")

# Step 4: Merge similarity scores from file_a_sorted
print("\nMerging similarity scores...")
similarity_scores = file_a_sorted[['PMID', 'implementation_similarity']].copy()

# Merge on PMID
full_dataset = full_dataset.merge(similarity_scores, on='PMID', how='left')

# Step 5: Add flag column
full_dataset['likely_implementation'] = full_dataset['implementation_similarity'] >= threshold

# Count how many are flagged
flagged_count = full_dataset['likely_implementation'].sum()
print(f"\nPapers flagged as likely implementation: {flagged_count:,}")
print(f"Percentage of dataset: {flagged_count/len(full_dataset)*100:.2f}%")

# Step 6: Show distribution by similarity score
print("\nDistribution of similarity scores:")
print(full_dataset['implementation_similarity'].describe())

print(f"\nBreakdown by flag:")
print(full_dataset['likely_implementation'].value_counts())

# Step 7: Save updated dataset to output folder
output_file = os.path.join(OUTPUT_FOLDER, 'hiv_us_2000_2025_with_location_and_impl_score.csv')
full_dataset.to_csv(output_file, index=False)
print(f"\n‚úì Saved updated dataset to: {output_file}")

# Step 8: Export just the likely implementation papers
likely_impl_papers = full_dataset[full_dataset['likely_implementation'] == True].copy()
likely_impl_output = os.path.join(OUTPUT_FOLDER, 'likely_implementation_papers.csv')
likely_impl_papers.to_csv(likely_impl_output, index=False)
print(f"‚úì Saved {len(likely_impl_papers):,} likely implementation papers to: {likely_impl_output}")

# Step 9: Summary
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Threshold used: {threshold:.3f}")
print(f"Total papers in dataset: {len(full_dataset):,}")
print(f"Papers above threshold: {flagged_count:,} ({flagged_count/len(full_dataset)*100:.2f}%)")
print(f"\nLocation breakdown of likely implementation papers:")
if 'Location_Detail' in likely_impl_papers.columns:
    print(likely_impl_papers['Location_Detail'].value_counts())

print(f"\nAll outputs saved to: {OUTPUT_FOLDER}/")
```



In [None]:
# ========================================
# Filter final dataset as needed
# ========================================

# ========================================
# CONFIGURATION
# ========================================
INPUT_FOLDER = 'hiv not imp'
INPUT_FILE = 'hiv_us_2000_2025_with_location_and_impl_score.csv'
OUTPUT_FILE = 'filtered_2015-2025_with_DOI_likely_impl_US.csv'
# ========================================

# Load the full dataset
input_path = os.path.join(INPUT_FOLDER, INPUT_FILE)
print(f"Loading {input_path}...")
df = pd.read_csv(input_path)
print(f"Total records: {len(df):,}")

# Show what columns are available
print("\nAvailable columns:")
print(df.columns.tolist())

# Apply ALL your filters
print("\nApplying filters...")

# Filter 1: date_year between 2015-2025
df_filtered = df[(df['date_year'] >= 2015) & (df['date_year'] <= 2025)].copy()
print(f"After date filter (2015-2025): {len(df_filtered):,}")

# Filter 2: Only rows with DOIs (not null/empty)
df_filtered = df_filtered[df_filtered['DOI'].notna()].copy()
df_filtered = df_filtered[df_filtered['DOI'].str.strip() != ''].copy()
print(f"After DOI filter (has DOI): {len(df_filtered):,}")

# Filter 3: likely_implementation = True
df_filtered = df_filtered[df_filtered['likely_implementation'] == True].copy()
print(f"After likely_implementation filter (True): {len(df_filtered):,}")

# Filter 4: Location_Detail NOT "Non-US only"
df_filtered = df_filtered[df_filtered['Location_Detail'] != 'Non-US only'].copy()
print(f"After location filter (NOT Non-US only): {len(df_filtered):,}")

# Show what's included in Location_Detail
print("\nLocation_Detail breakdown in filtered data:")
print(df_filtered['Location_Detail'].value_counts())

# Save filtered version
output_path = os.path.join(INPUT_FOLDER, OUTPUT_FILE)
df_filtered.to_csv(output_path, index=False)

print(f"\n{'='*60}")
print(f"‚úì SAVED: {output_path}")
print(f"{'='*60}")
print(f"Final filtered records: {len(df_filtered):,}")
print(f"Reduction: {len(df):,} ‚Üí {len(df_filtered):,} ({len(df_filtered)/len(df)*100:.1f}%)")