# Prototype retraction logic


In [1]:
# from https://github.com/pangeo-data/pangeo-cmip6-cloud/blob/master/retractions.py
from tqdm.autonotebook import tqdm
import requests
import json
import time

def query_retraction(url, params, batchsize):
    print(f"Downloading Retraction Data from {url}...")
    resp = requests.get(url=url, params=params)
    header = resp.json()  # Check the JSON Response Content documentation below
    n_items = header["response"]["numFound"]
    print(f"Found {n_items} items.")

    batches = range(0, n_items+1, batchsize)  # if I offset these, can
    params["limit"] = batchsize

    batch_jsons = []

    
    for batch in tqdm(batches):
        params["offset"] = batch
        resp = requests.get(url=url, params=params)
        if resp.status_code != 200:
            print(batch)
            print(resp.status_code)
        try:
            data = resp.json()
            batch_jsons.append(data)
        except Exception as e:
            print(f"FAILED with {e}. {resp=}")

    # Convert to list of instance ids
    print("Extracting instance_ids...")
    all_retracted_instance_ids = []
    for data in batch_jsons:
        extracted = [i["instance_id"] for i in data["response"]["docs"]]
        all_retracted_instance_ids.extend(extracted)

    # # Fail out here if the total number of items is not what was promised in the header
    # # I had a few instances today, where that was the case, I think a simple retry is
    # # a good enough solution for now.
    # n_retracted = len(all_retracted_instance_ids)
    # if n_retracted == n_items:
    #     print('Successfully downloaded all retraction info')
    # else:
    #     raise RuntimeError(f'Downloaded retraction info is incomplete. Found {n_retracted} items, expected {n_items}')

    # # There is the possibility that we are getting duplicate instance_ids here because we query replicas
    # # Make sure dubplicates are not carried forward
    # retracted_instance_ids = set(all_retracted_instance_ids)
    # print(f"{len(all_retracted_instance_ids)-len(retracted_instance_ids)} replicas found")
    # return retracted_instance_ids

# def query_retraction_retry(url, params, batchsize = 10000):
#     """Retrys query if it fails"""
#     status = 0
#     while status == 0:
#         try:
#             query_result = query_retraction(url, params, batchsize)
#             status = 1
#         except RuntimeError as e:
#             print(f"{e}.\nRetrying")
    
#     return query_result


  from tqdm.autonotebook import tqdm


In [2]:
import gcsfs
import pandas as pd
import os
from functools import reduce

In [None]:
gcs = gcsfs.GCSFileSystem()
catalog_url = "https://cmip6.storage.googleapis.com/pangeo-cmip6.csv"
node_urls = [
# "https://esgf-node.llnl.gov/esg-search/search",
"https://esgf-data.dkrz.de/esg-search/search",
# "https://esgf-index1.ceda.ac.uk/esg-search/search",
# "https://esgf-node.ipsl.upmc.fr/esg-search/search",
]

params = {
    "type": "Dataset",
    "mip_era": "CMIP6",
    "replica": "none",
    "distrib": "true",
    "retracted": "true",
    "format": "application/solr+json",
    "fields": "instance_id",
}
# query every one of the nodes
retracted_ids = {}
for url in node_urls:
    retracted_ids[url.split('.')[1]] = query_retraction(url, params, batchsize=1000)

# convert to pandas dataframes
retracted_ids_df = {k:pd.Series(list(v)).to_frame(name="instance_id") for k,v in retracted_ids.items()}

# iteratively merge dataframes with 'outer' to get all possible retractions
# from https://stackoverflow.com/a/44338256
retracted_df = reduce(
    lambda  left,right: pd.merge(
        left,
        right,
        on=['instance_id'],
        how='outer'
    ), 
    retracted_ids_df.values()
)

## document missing instances for each node
print('Documenting missing instance_ids per node')
def unique_instances(df, df_full):
    """Return all the items of `df_full` not found in `df`"""
    df_merged = pd.merge(df, df_full, on=['instance_id'],how='outer', indicator=True)
    df_merged = df_merged[df_merged['_merge']=='right_only']
    df_merged = df_merged.drop(columns=['_merge'])
    return df_merged

missing_ids = {k: unique_instances(v, retracted_df) for k,v in retracted_ids_df.items()}

for k,v in missing_ids.items():
    print(f"Found {len(v)} missing instances from the {k} node.")
    filename = f"missing_instance_ids_{k}.csv"
    v['instance_id'].to_csv(filename, index=False)
    print(f"Missing instance_ids written to {filename}")


Downloading Retraction Data from https://esgf-data.dkrz.de/esg-search/search...
Found 697522 items.


  0%|          | 0/698 [00:00<?, ?it/s]

In [None]:
retracted_ids_df

## I need to async this, but too tired to debug chatgpt now

In [4]:
resp

NameError: name 'resp' is not defined

In [14]:
import aiohttp
import asyncio
import json
from tqdm.asyncio import tqdm


async def fetch_data(session, url, params):
    async with session.get(url=url, params=params) as response:
        if response.status != 200:
            print(params.get("offset", 0))
            print(response.status)
            raw_response = await response.text()
            print(raw_response)
            try:
                response = json.loads(raw_response)
                return response
            except Exception as e:
                print('HERE'+e)
                
        

async def download_retraction_data(url, params, batchsize):
    print(f"Downloading Retraction Data from {url}...")
    async with aiohttp.ClientSession() as session:
        # Fetch the header
        async with session.get(url=url, params=params) as response:
            header = await response.text()
            header = json.loads(header)
            n_items = header["response"]["numFound"]
            print(f"Found {n_items} items.")

        # Prepare batches
        batches = range(0, n_items + 1, batchsize)
        params["limit"] = batchsize

        # Fetch data for each batch asynchronously
        
        batch_jsons = await tqdm.gather(
            *[fetch_data(session, url, {**params, "offset": batch}) for batch in batches]
        )

    # Extract instance ids
    print("Extracting instance_ids...")
    all_retracted_instance_ids = [
        i["instance_id"] for data in batch_jsons for i in json.loads(data)["response"]["docs"]
    ]

    # Check if the downloaded data is complete
    n_retracted = len(all_retracted_instance_ids)
    if n_retracted == n_items:
        print('Successfully downloaded all retraction info')
    else:
        raise RuntimeError(f'Downloaded retraction info is incomplete. ')

    return all_retracted_instance_ids

url = "https://esgf-node.llnl.gov/esg-search/search"
params = {
    "type": "Dataset",
    "mip_era": "CMIP6",
    "replica": "none",
    "distrib": "true",
    "retracted": "true",
    "format": "application/solr+json",
    "fields": "instance_id",
}
batchsize = 100  # Set your desired batch size
test = await download_retraction_data(url, params, batchsize)

Downloading Retraction Data from https://esgf-node.llnl.gov/esg-search/search...
Found 696747 items.


  3%|██▋                                                                                                   | 187/6968 [00:50<07:02, 16.04it/s]

696500
500
<!doctype html><html lang="en"><head><title>HTTP Status 500 – Internal Server Error</title><style type="text/css">body {font-family:Tahoma,Arial,sans-serif;} h1, h2, h3, b {color:white;background-color:#525D76;} h1 {font-size:22px;} h2 {font-size:16px;} h3 {font-size:14px;} p {font-size:12px;} a {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 500 – Internal Server Error</h1><hr class="line" /><p><b>Type</b> Exception Report</p><p><b>Message</b> Read timed out</p><p><b>Description</b> The server encountered an unexpected condition that prevented it from fulfilling the request.</p><p><b>Exception</b></p><pre>java.net.SocketTimeoutException: Read timed out
	java.net.SocketInputStream.socketRead0(Native Method)
	java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
	java.net.SocketInputStream.read(SocketInputStream.java:171)
	java.net.SocketInputStream.read(SocketInputStream.java:141)
	java.io.BufferedInputS

TypeError: can only concatenate str (not "JSONDecodeError") to str

  3%|██▋                                                                                                   | 187/6968 [01:01<07:02, 16.04it/s]

In [17]:
import aiohttp
import asyncio
from tqdm import tqdm

async def fetch_data(session, url, params):
    headers = {"Accept": "application/json"}  # Specify the expected content type
    try:
        async with session.get(url=url, params=params, headers=headers) as response:
            response.raise_for_status()  # Raise an exception for non-200 status codes
            return await response.json()
    except aiohttp.ClientResponseError as e:
        print(f"Error fetching batch. Status code: {e.status}")
        return None  # Ignore this batch
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None  # Ignore this batch

async def download_data(url, params, batchsize):
    async with aiohttp.ClientSession() as session:
        # Fetch the header to determine the total number of items
        headers = {"Accept": "application/json"}  # Specify the expected content type
        async with session.get(url=url, params=params, headers=headers) as response:
            response.raise_for_status()  # Raise an exception for non-200 status codes
            header = await response.json()
            n_items = header["response"]["numFound"]
            print(f"Found {n_items} items.")

        # Prepare batches
        batches = range(0, n_items + 1, batchsize)
        params["limit"] = batchsize

        # Fetch data for each batch asynchronously, handle errors, and ignore problematic batches
        tasks = [fetch_data(session, url, {**params, "offset": batch}) for batch in tqdm(batches)]
        batch_jsons = await asyncio.gather(*tasks)

    # Process the fetched data, excluding None values (error batches)
    process_data([data for data in batch_jsons if data is not None])

def process_data(batch_jsons):
    # Extract and process the data as needed
    # Example: Extract instance_ids
    all_instance_ids = [
        i["instance_id"] for data in batch_jsons for i in data["response"]["docs"]
    ]
    print("Successfully downloaded all data.")
    print(f"Total items: {len(all_instance_ids)}")

In [None]:
# Example usage
batchsize = 100  # Adjust your desired batch size

# Run the asynchronous download process
a = await download_data(url, params, batchsize)