# Jupyter Notebook used for exploring and drafting script to automate downloads of Sentinel-5P CO and CH4 data

In [None]:
import requests
import json
from datetime import datetime, time
import pandas as pd
import sys 
import os
import zipfile
import io

In [None]:
# Copernicus credentials function
def get_credentials():
    """Prompt for Copernicus Data Space Ecosystem credentials.

    The password is read with ``getpass`` so it is not echoed to the
    terminal or stored in the notebook's cell output.

    Returns:
        tuple: ``(user, passwd)`` exactly as typed by the user.
    """
    from getpass import getpass  # local import keeps this cell self-contained

    print('Enter your Copernicus Data Space Ecosystem credentials')
    user = input('username: ')
    passwd = getpass('password: ')
    return user, passwd

In [None]:
# define authentication variables
client_id = 'cdse-public'
token_url = 'https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token'
grant_type = 'password'

# Keep prompting for credentials until the identity service returns a token.
while True:
    username, password = get_credentials()

    # Make a POST request to obtain an authentication token
    auth_data = {
        'client_id': client_id,
        'username': username,
        'password': password,
        'grant_type': grant_type,
    }
    # A timeout stops the cell from hanging forever if the service is unreachable.
    response = requests.post(token_url, data=auth_data, timeout=30)

    # Confirm request successful
    if response.status_code == 200:
        # Parse the JSON response and extract token
        token_data = response.json()
        access_token = token_data.get('access_token')
        # Check access token retrieved + print if so
        if access_token:
            print('Authentication Token Retrieved')
            break
        print('Token not found in the response')
    else:
        # The error body is normally JSON with an 'error_description' field,
        # but fall back to the raw text if it is not JSON or lacks that field.
        try:
            text_dict = json.loads(response.text)
        except ValueError:
            text_dict = None
        if isinstance(text_dict, dict):
            error_detail = text_dict.get('error_description', response.text)
        else:
            error_detail = response.text
        print(f'Failed to obtain Authentication Token. Error: {error_detail}')

In [None]:
# Select parameter function
def select_parameter(choice):
    """Map a menu choice to a Sentinel-5P parameter and product type.

    Args:
        choice: Menu number; 1 selects CH4 and 2 selects CO.

    Returns:
        tuple: ``(parameter, product)`` where ``parameter`` is the short
        name ('CH4' or 'CO') and ``product`` is the matching product-type
        string used in the catalogue query.

    Raises:
        ValueError: If ``choice`` is neither 1 nor 2.
    """
    if choice == 1:
        parameter = 'CH4'
        product = 'L2__CH4___'  # SENTINEL-5P CH4 nomenclature
    elif choice == 2:
        parameter = 'CO'
        product = 'L2__CO____'  # SENTINEL-5P CO
    else:
        # Fail with a clear error instead of recursing without an argument.
        raise ValueError(f'Invalid selection: {choice!r} (expected 1 for CH4 or 2 for CO)')
    return parameter, product

In [None]:
# Call function
# Re-prompt until the user enters one of the supported menu numbers, so a typo
# (non-numeric text or an unknown number) does not crash the cell.
while True:
    try:
        choice = int(input('Select Sentinel-5P parameter: \n1. CH4\n2. CO\n'))
    except ValueError:
        print('Invalid selection')
        continue
    if choice in (1, 2):
        break
    print('Invalid selection')

parameter, product = select_parameter(choice)
# product = 'L2__CH4___'  #override
# product = 'L2__CO____' 

In [None]:
# Select date of interest
while True:
    date_str = input('Enter date of interest (YYYYMMDD): ').strip()
    # strptime alone is lenient ('202411' parses as 2024-01-01), so require
    # exactly eight digits before parsing; date_str is also reused later in
    # the download directory name.
    if len(date_str) == 8 and date_str.isdigit():
        try:
            date = datetime.strptime(date_str, '%Y%m%d')
            break
        except ValueError:
            pass
    print(f"Invalid date string: {date_str}. Try again")

# date_str = '20240109'  # override date for testing purposes (must be YYYYMMDD)
# Re-parse so that the manual override above, when enabled, takes effect.
date = datetime.strptime(date_str, '%Y%m%d')
# Query window covers the whole day: 00:00:00.000Z through 23:59:59.999Z
# (microseconds are trimmed to milliseconds by the [:-3] slice).
start_date = date.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
end_date = datetime.combine(date, time.max).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
# print(start_date, end_date, sep='\t')

In [None]:
## OData requires manually building the URL, which is tedious, so I used OpenSearch for querying
# Search for Sentinel-5P data using OpenSearch
opensearch_url = 'https://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel5P/search.json'

# Define the query parameters for Sentinel-5P CH4 and CO data
query_params = {
    'startDate': start_date,
    'completionDate': end_date,  # Date range for the current day
    'productType': product,
}
# NOTE(review): no paging / maxRecords parameter is sent, so only the
# service's default first page of results is returned — confirm that is
# enough for a full day of products.

# Make a GET request to the OpenSearch Catalog
search_response = requests.get(opensearch_url, params=query_params, timeout=60)
# Stop here on an HTTP error instead of failing later on a missing 'features' key.
search_response.raise_for_status()
response = search_response.json()

In [None]:
# Tabulate the catalogue results: one row per entry in the 'features' list
response_df = pd.DataFrame(response['features'])
response_df
# response_df.head()

In [None]:
# Expand each row's 'properties' entry into its own table for inspection
properties_records = response_df['properties'].tolist()
testsub_df = pd.DataFrame(properties_records)
# testsub_df.head()

In [None]:
# The response json file contains details about each item in the query, including an 'id' and a download url.
# I could not get the OpenSearch download url to work with my authentication token, so I moved on to using the download
# feature with OData using the id

# Extract download URLs and filenames
if response_df.empty:
    # An empty result set has no 'properties' / 'id' columns to read from.
    print('No products found for the selected parameter and date')
    download_urls, titles, ids = [], [], []
else:
    product_properties = response_df['properties']
    # Tolerate entries that lack a 'services' or 'download' section.
    download_urls = [
        ((props.get('services') or {}).get('download') or {}).get('url')
        for props in product_properties
    ]
    titles = [props.get('title') for props in product_properties]
    ids = response_df['id'].tolist()  # ids needed to download data
ids

In [None]:
# Download files
# Build one OData download URL per product id
urls = [
    'https://download.dataspace.copernicus.eu/odata/v1/Products(' + product_id + ')/$value'
    for product_id in ids
]

# urls = urls[:3] #testing just 3 urls
# print(urls)

# Create a session and update headers
headers = {"Authorization": f"Bearer {access_token}"}
session = requests.Session()
session.headers.update(headers)

download_directory = 'tropomi_download_' + product + '_' + date_str
check_set = set()          # distinct HTTP status codes seen across all downloads
failed_extractions = []    # archives that downloaded but could not be unzipped

for url, title in zip(urls, titles):  # Loop over the download urls and titles of files performing GET request
    filename = title[:-3] + '.zip'
    zip_path = os.path.join(download_directory, filename)

    # The context manager releases the connection even when the download fails.
    with session.get(url, stream=True, timeout=120) as download_response:
        check_set.add(download_response.status_code)

        if download_response.status_code != 200:  # Check if the request was successful
            print(f"Failed to download: {filename} from {url} (Status code: {download_response.status_code})")
            print(download_response.text)
            continue

        # Stream the archive to disk in chunks rather than holding the whole
        # file in memory.
        os.makedirs(download_directory, exist_ok=True)
        with open(zip_path, 'wb') as zip_out:
            for chunk in download_response.iter_content(chunk_size=1024 * 1024):
                zip_out.write(chunk)

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            print(f"Extracting {filename} to {download_directory}")
            zip_file.extractall(download_directory)
    except zipfile.BadZipFile:
        # Keep the file on disk so it can be inspected.
        failed_extractions.append(filename)
        print(f"Downloaded {filename} but it is not a valid zip archive; left at {zip_path}")
    else:
        os.remove(zip_path)  # the archive is no longer needed once extracted

if check_set == {200} and not failed_extractions:
    print('Download complete')

In [None]:
###########################################################################################################

In [None]:
################# Token Refresh #################
# refresh_url = 'https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token'

# headers = {'Content-Type' : 'application/x-www-form-urlencoded'}
# auth_data = {'refresh_token': token_data.get('refresh_token'),
#             'client_id' : client_id,
#             'grant_type' : 'refresh_token'}
             
# response = requests.post(refresh_url, data=auth_data, headers=headers)
# token_data = response.json()
# access_token =token_data.get('access_token')
# token_data

In [None]:
###########################################################################################################

In [None]:
# # Explore query parameters
# import xml.etree.ElementTree as ET
# url = 'https://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel5P/describe.xml'
# response = requests.get(url)

# # import xml.dom.minidom
# # dom = xml.dom.minidom.parseString(response.content)
# # pretty_xml = dom.toprettyxml()
# # print(pretty_xml)
# root = ET.fromstring(response.content)
# ET.indent(root)
# print(ET.tostring(root, encoding='unicode'))

In [None]:
# # Explore query parameters for SENTINEL-5p
# json = requests.get("https://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel2/search.json?startDate=2021-07-01T00:00:00Z&completionDate=2021-07-31T23:59:59Z&sortParam=startDate&maxRecords=20").json()
# pd.DataFrame.from_dict(json['features']).head(3)

# import pandas as pd
# json = requests.get("https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter=PublicationDate ge 2019-05-15T00:00:00.000Z and PublicationDate le 2019-05-16T00:00:00.000Z").json()
# df = pd.DataFrame.from_dict(json['value'])
# columns_to_print = ['Id', 'Name','S3Path','GeoFootprint']  
# df[columns_to_print].head(3)

In [None]:
# # Explore query parameters for SENTINEL-5p
# import xml.etree.ElementTree as ET
# url = 'https://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel5P/describe.xml'
# response = requests.get(url)
# print('Query-able Parameters')
# root = ET.fromstring(response.content)
# # for child in root:
# #     print(child.tag, child.text,)
# # for child in root:
# #      print(child.tag, child.attrib)
# #     # if child.tag.endswith('ShortName') or child.tag.endswith('Description'):
# #         print(f"{child.tag}: {child.text}")
# for child in root:
#     print(child.tag, child.text)
#     for subchild in child:
#         print(f'\t{subchild.tag} {subchild.attrib}')
#         for subchild2 in subchild:
#             print(f'\t\t{subchild2.tag} {subchild2.attrib}')

In [None]:
###########################################################################################################

In [None]:
# # Search for Sentinel-5P data using OData
# odata_url = 'https://catalogue.dataspace.copernicus.eu/odata/v1/Products'
# query_params_CH4 = {
#     '$filter': "(Collection/Name eq 'SENTINEL-5P')",
#     'startDate': start_date, 
#     'completionDate': end_date, # Date range for the current day
#     'productType': 'L2__CH4___',  # SENTINEL-5P CH4
# }
# response_CH4 = requests.get(odata_url, params=query_params_CH4)
# response_CH4.url

In [None]:
###########################################################################################################

In [None]:
# Testing download URL from OpenSearch
# Create a session and update headers
# headers = {"Authorization": f"Bearer {access_token}", 'Content-Type': 'application/octet-stream'}
# headers = {"Authorization": access_token}
# session = requests.Session()
# session.headers.update(headers)


# # Perform the GET request for downloads
# for url, title in zip(download_urls, filenames):
#     save_path = title[:-3]
#     response = session.get(url, stream=True)
    
#     if response.status_code == 200: # Check if the request was successful
#         print('Success')
#         # with open(save_path, "wb") as file: # save to specified path
#         #     # for chunk in response.iter_content(chunk_size=8192):
#         #     #     if chunk:  # filter out keep-alive new chunks
#         #     #         file.write(chunk)
#         #     # print(f"Downloaded: {save_path}") 
#     else: 
#         print(f"Failed to download: {save_path} from {url} (Status code: {response.status_code})") 
#         print(response.text)
#         break

In [None]:
############# DUMP #################
# prop_df = pd.json_normalize(response_df['properties'])
# download_urls = prop_df['services.download.url'].values.tolist()
# titles = prop_df['title'].values.tolist()
# download_urls
# # test_df = pd.json_normalize(response_df['properties'])
# # test_df.head()
# test2 = pd.DataFrame.from_dict(test['properties'].values.tolist())
# test2.head()

# features = response_CH4['features']
# response_CH4
# response_df = pd.DataFrame.from_dict(response_CH4['features']).head()
# response_df
# # print(CH4_df['properties'][0])
# download_url = response_CH4.get('features')
# download_url

In [None]:
# zip_buffer = io.BytesIO() # Create an in-memory ZIP

# with zipfile.ZipFile(zip_buffer, 'w',zipfile.ZIP_DEFLATED) as zip_file: # Open ZIP for writing

#     for url, title in zip(urls, titles): # Loop over the download urls and titles of files performing GET request
#         filename = title[:-3] + '.zip'
#         response = session.get(url, stream=True)
        
#         if response.status_code == 200: # Check if the request was successful
#             content_buffer = io.BytesIO() # create in-memory buffer to store download
#             for chunk in response.iter_content(chunk_size=8192):
#                 if chunk:  # filter out keep-alive new chunks
#                     content_buffer.write(chunk) # write response to buffer

#             zip_file.writestr(filename, content_buffer.getvalue())
#             print(f"Downloading {filename}")
            
#         else: 
#             print(f"Failed to download: {filename} from {url} (Status code: {response.status_code})") 
#             print(response.text)

# # Write ZIP to disk
# with open('download.zip', 'wb') as f:
#     f.write(zip_buffer.getvalue())

# print('Downloads saved to download.zip')
            
# OLD: this extracts and saves each file separately
# for url, title in zip(urls, titles):
#     save_path = title[:-3] + '.zip'
#     response = session.get(url, stream=True)
    
#     if response.status_code == 200: # Check if the request was successful
#         with open(save_path, "wb") as file: # save to specified path
#             for chunk in response.iter_content(chunk_size=8192):
#                 if chunk:  # filter out keep-alive new chunks
#                     file.write(chunk)
#             print(f"Downloaded: {title}")
            
#     else: 
#         print(f"Failed to download: {save_path} from {url} (Status code: {response.status_code})") 
#         print(response.text)