In [1]:
# Absolute locations of the JSON context files to scan (one list entry per file)
FILE_PATHS = ["/Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/merge/contexts.json"]

In [2]:
import json

# Collect every citing_id and every refid that occurs in the context files
citing_ids = set()
refids = set()

for file_path in FILE_PATHS:
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Only the entries themselves are needed; the top-level keys are ignored
    for entry in data.values():
        citing_ids.add(entry['citing_id'])
        refids.add(entry['refid'])

# refids that never occur as a citing_id
not_citing_ids = refids - citing_ids

# List form of the same ids (kept in set order, i.e. unsorted)
not_citing_ids_list = list(not_citing_ids)



In [3]:
# Number of refids that never occur as a citing_id
len(not_citing_ids)

149231

## Extracting every refid that is not also a citing ID (which is most of them) and adding it to the dataset

In [67]:
import requests
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def _fetch_xml(url, timeout):
    """GET ``url`` and return the parsed XML root, or None on any HTTP, network or parse failure."""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        return ET.fromstring(response.content)
    except ET.ParseError:
        return None


def get_url(pmid, timeout=30):
    """Return the ``tgz`` package link for the PMC record matching ``pmid``.

    Two requests are made: an E-utilities ``esearch`` query against ``db=pmc``
    with the term ``{pmid}[pmid]`` (the first ``<Id>`` in the answer is used),
    followed by a lookup of ``PMC<id>`` in the PMC OA service.

    Args:
        pmid: The identifier interpolated into the ``esearch`` term.
        timeout: Seconds passed to ``requests.get`` for each request, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The ``href`` of the first ``<link format="tgz">`` found, or ``None``
        when a request fails, a response is not valid XML, no id or
        ``<records>`` element is present, a record is marked
        ``retracted="yes"``, or no tgz link exists.
    """
    # Step 1: resolve the PMID to a PMC id
    esearch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term={pmid}[pmid]"
    search_root = _fetch_xml(esearch_url, timeout)
    if search_root is None:
        return None

    pmcid_elem = search_root.find('.//Id')
    if pmcid_elem is None or not pmcid_elem.text:
        return None
    pmcid = pmcid_elem.text

    # Step 2: ask the OA service for the record's download links
    url = f'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC{pmcid}'
    oa_root = _fetch_xml(url, timeout)
    if oa_root is None:
        return None

    records_element = oa_root.find('records')
    if records_element is None:
        return None

    for record in records_element:
        # .get() instead of attrib[...]: a missing attribute must not raise KeyError
        if record.get('retracted') == 'yes':
            return None

        for link in record.findall('link'):
            if link.get('format') == 'tgz':
                href = link.get('href')
                if href:
                    return href

    return None

def process_pmid(pmid, urls_set):
    """Look up the download URL for ``pmid`` and, if one is found, add it to ``urls_set``."""
    tgz_link = get_url(pmid)
    if not tgz_link:
        return
    urls_set.add(tgz_link)

# Set that the worker threads fill with tgz URLs
urls = set()
# PMIDs whose lookup raised instead of returning; kept so failures are visible
failed_pmids = []

# NOTE(review): NCBI documents a limit of 3 E-utilities requests per second
# without an API key; 10 workers may exceed it, and a non-200 answer is treated
# by get_url as "no URL". Confirm the limit and lower max_workers if needed.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its PMID so a failure can be attributed
    future_to_pmid = {
        executor.submit(process_pmid, pmid, urls): pmid for pmid in not_citing_ids
    }

    # Use tqdm to show progress bar
    for future in tqdm(as_completed(future_to_pmid), total=len(future_to_pmid), desc="Processing PMIDs"):
        try:
            # result() re-raises whatever the worker raised; without this call
            # the exception would be dropped silently
            future.result()
        except Exception:
            failed_pmids.append(future_to_pmid[future])

print(f"Found {len(urls)} unique tgz URLs.")
if failed_pmids:
    print(f"{len(failed_pmids)} PMIDs raised an error during lookup (see failed_pmids).")


Processing PMIDs:   2%|▏         | 2323/149231 [01:44<1:49:43, 22.32it/s]


In [52]:
# Display the collected tgz URLs (the whole set is shown)
urls

{'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/96/37/PMC8858624.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/d6/ff/PMC9917291.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/de/61/PMC5868307.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/1e/90/PMC9476461.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/fa/fc/PMC5580518.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/4f/60/PMC6791134.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/45/de/PMC4737187.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/62/bc/PMC10154205.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/82/PMC5084014.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a3/17/PMC6946810.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/80/69/PMC6489647.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/d3/17/PMC4931855.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/a4/0f/PMC6318962.tar.gz',
 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/

In [57]:
import json

# Persist the URLs so the PMID -> URL lookup does not have to be repeated.
# A set has no defined order, so sort before dumping to keep the file stable between runs.
with open('urls.json', 'w', encoding='utf-8') as file:
    json.dump(sorted(urls), file)

# Read the file back as a set
with open('urls.json', 'r', encoding='utf-8') as file:
    loaded_urls = set(json.load(file))

# Round-trip check: the cache must hold exactly what was collected
assert loaded_urls == set(urls)


In [58]:
import os
import tarfile
from ftplib import FTP
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def _extract_nxml(tar_path, output_dir):
    """Copy the first ``.nxml`` member of the gzip tarball into ``output_dir``; return its path or None."""
    with tarfile.open(tar_path, 'r:gz') as tar:
        for member in tar:
            if member.isfile() and member.name.endswith('.nxml'):
                # basename() drops the archive's directory part, so the file
                # lands directly in output_dir and cannot escape it
                target_path = os.path.join(output_dir, os.path.basename(member.name))
                with tar.extractfile(member) as source, open(target_path, 'wb') as target:
                    target.write(source.read())
                return target_path
    return None


def download_and_extract_xml(ftp_url, output_dir):
    """Download one ``.tar.gz`` package over FTP and keep only its ``.nxml`` file.

    The URL is split on ``/``: element 2 is the host, the rest is the remote
    path. The archive is saved into ``output_dir``, the first member whose name
    ends with ``.nxml`` is written to ``output_dir/<basename>``, and the
    archive is deleted afterwards whether or not extraction succeeded.

    Args:
        ftp_url: URL of the form ``ftp://host/dir/.../file.tar.gz``.
        output_dir: Existing directory that receives the ``.nxml`` file.

    Returns:
        Path of the written ``.nxml`` file, or ``None`` if the archive holds no
        such member or any step raised (the error is printed, not re-raised).
    """
    local_tar_path = None
    try:
        # Parse the FTP URL
        url_parts = ftp_url.split('/')
        ftp_host = url_parts[2]
        directory, filename = os.path.split('/'.join(url_parts[3:]))
        local_tar_path = os.path.join(output_dir, filename)

        # The context manager closes the connection even if the transfer fails;
        # the timeout keeps a stalled transfer from blocking a worker forever
        with FTP(ftp_host, timeout=60) as ftp:
            ftp.login()
            ftp.cwd(directory)
            with open(local_tar_path, 'wb') as file:
                ftp.retrbinary(f'RETR {filename}', file.write)

        return _extract_nxml(local_tar_path, output_dir)
    except Exception as e:
        print(f"Error processing {ftp_url}: {e}")
        return None
    finally:
        # Remove the archive in every case, including corrupt downloads
        if local_tar_path is not None and os.path.exists(local_tar_path):
            os.remove(local_tar_path)

def download_all_xml(urls, output_dir):
    """Run ``download_and_extract_xml`` for every URL on a pool of five threads.

    ``output_dir`` is created if missing. Progress is shown with tqdm; an
    exception raised by a worker is printed together with its URL and does not
    stop the remaining downloads. Nothing is returned.
    """
    os.makedirs(output_dir, exist_ok=True)

    with ThreadPoolExecutor(max_workers=5) as pool:
        # Remember which URL each future belongs to, for error reporting
        pending = {}
        for ftp_url in urls:
            pending[pool.submit(download_and_extract_xml, ftp_url, output_dir)] = ftp_url

        progress = tqdm(as_completed(pending), total=len(urls), desc="Downloading XML files")
        for finished in progress:
            source_url = pending[finished]
            try:
                # The resulting path is not used; the call only surfaces worker exceptions
                finished.result()
            except Exception as e:
                print(f'Error downloading {source_url}: {e}')

# Directory that receives the downloaded files
output_dir = '/Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/data'

# Download every package listed in `urls` into output_dir
download_all_xml(urls=urls, output_dir=output_dir)


Downloading XML files:  22%|██▏       | 825/3812 [10:38<31:43,  1.57it/s]  

Error processing ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/3f/dc/PMC7641728.tar.gz: Error -3 while decompressing data: invalid stored block lengths


Downloading XML files:  40%|███▉      | 1506/3812 [19:21<33:57,  1.13it/s]  

Error processing ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/51/62/PMC3348462.tar.gz: Error -3 while decompressing data: invalid stored block lengths


Downloading XML files:  44%|████▍     | 1679/3812 [21:32<37:00,  1.04s/it]  

Error processing ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ea/66/PMC4491249.tar.gz: Error -3 while decompressing data: invalid stored block lengths


Downloading XML files:  84%|████████▍ | 3206/3812 [43:11<17:59,  1.78s/it]  

Error processing ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/38/b1/PMC8390749.tar.gz: Error -3 while decompressing data: invalid stored block lengths


Downloading XML files:  92%|█████████▏| 3519/3812 [47:22<03:01,  1.61it/s]

Error processing ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/7d/8b/PMC4937200.tar.gz: Error -3 while decompressing data: invalid stored block lengths


Downloading XML files: 100%|██████████| 3812/3812 [50:59<00:00,  1.25it/s]
