# Debug notebook for the Script adding a provider layer to S3

### Imports

In [10]:
import os
import boto3
from impresso_essentials.io.s3 import (
    IMPRESSO_STORAGEOPT,
    get_s3_resource,
    get_s3_client
)
from impresso_essentials.utils import get_provider_for_alias, ALL_MEDIA, PARTNER_TO_MEDIA
from urllib.parse import urlparse
import signal
from contextlib import contextmanager

### Implement the function adding the provider given the S3 path

In [11]:
@contextmanager
def disable_interrupts():
    """Context manager to temporarily disable keyboard interrupts."""
    original_handler = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        yield
    finally:
        signal.signal(signal.SIGINT, original_handler)

In [127]:
t_s = 'fd'

if t_s:
    print('yes')
else:
    print('no')

yes


In [16]:
def get_alias_from_path(source_path, og_partition):
    if not og_partition.endswith('/') and og_partition!= '':
        og_partition = og_partition + "/"

    split = source_path.replace(og_partition, "").split('/') if og_partition else source_path.split('/')

    # make sure to know if the partner is already in this path or not
    if split[0] in ALL_MEDIA:
        #print(f"returnning split[0], None: {split[0], None}")
        return split[0], None
    elif split[0] in PARTNER_TO_MEDIA and split[1] in PARTNER_TO_MEDIA[split[0]]:
        # also return the provider if it's there
        #print(f"returnning split[1], split[0]: {split[1], split[0]}")
        return split[1], split[0]
    else:
        msg = f"The source path {source_path} does not contain the media alias at an expected place."
        raise AttributeError(msg)

In [85]:
s3_path = "s3://22-rebuilt-final/actionfem/actionfem-1928.jsonl.bz2"
s3_path = "s3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2"
partition = "s3://42-processed-data-final/langident/langident_v1-4-4"

get_alias_from_path(s3_path, partition)

'CL'

In [12]:
def construct_dest_key(src_key, provider, og_partition, current_alias=None, found_prov=None):
    if found_prov:
        print(f"construct_dest_key - found_provider is not None ({found_prov}), skipping, and returning key as-is: {src_key}")
        return src_key
    #print(f"og_partition : {og_partition}, current_alias={current_alias}")
    if og_partition != '':
        return src_key.replace(og_partition, os.path.join(og_partition, provider))
    
    # if there is no specific partition, the provider becomes the first element of the key
    return os.path.join(provider, src_key)

In [144]:
"""def construct_dest_key(source_key, curr_alias, provider, og_partition):

    #stripped_partition = '/'.join(og_partition.split('/')[3:])

    media_alias, found_prov = get_alias_from_path(source_key, og_partition)
    print(f" media_alias, found_prov: {media_alias, found_prov}")
    if found_prov != provider:
        print(f"PROBLEM: the provider is already in the path found_prov={found_prov}, and different from provider: {provider}")

    if media_alias != curr_alias or provider is None:
        print(f"PROBLEM: curr_alias={curr_alias}, media_alias: {media_alias}")
        found_prov = get_provider_for_alias(media_alias)

    full_dest_path = reconstruct_dest(source_key, og_partition, found_prov)
    
    return full_dest_path"""

'def construct_dest_key(source_key, curr_alias, provider, og_partition):\n\n    #stripped_partition = \'/\'.join(og_partition.split(\'/\')[3:])\n\n    media_alias, found_prov = get_alias_from_path(source_key, og_partition)\n    print(f" media_alias, found_prov: {media_alias, found_prov}")\n    if found_prov != provider:\n        print(f"PROBLEM: the provider is already in the path found_prov={found_prov}, and different from provider: {provider}")\n\n    if media_alias != curr_alias or provider is None:\n        print(f"PROBLEM: curr_alias={curr_alias}, media_alias: {media_alias}")\n        found_prov = get_provider_for_alias(media_alias)\n\n    full_dest_path = reconstruct_dest(source_key, og_partition, found_prov)\n    \n    return full_dest_path'

In [7]:
s3_path = "s3://22-rebuilt-final/actionfem/actionfem-1928.jsonl.bz2"
#s3_path = "s3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2"

og_partition = "s3://22-rebuilt-final"
#og_partition = "s3://42-processed-data-final/langident/langident_v1-4-4"

stripped_partition = '/'.join(og_partition.split('/')[3:])

print(stripped_partition)
parsed = urlparse(s3_path)
src_bucket = parsed.netloc
src_key = parsed.path.lstrip("/")
dest_bucket = "122-rebuilt-staging"
provider = 'BNL'
alias = "actionfem"


construct_dest_key(src_key, provider, alias, stripped_partition)




'actionfem/BNL/actionfem/BNL-1928.jsonl.bz2'

In [8]:
part = "s3://42-processed-data-final/langident/langident_v1-4-4"
part = "s3://22-rebuilt-final"
parsed = urlparse(part)
src_bucket = parsed.netloc
src_key = parsed.path.lstrip("/")
src_bucket, src_key

('22-rebuilt-final', '')

In [38]:
s3_path

's3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2'

In [95]:
s3_c = get_s3_client()

parsed = urlparse(s3_path)
bucket = parsed.netloc
key = parsed.path.lstrip("/")


print(bucket, key, s3_path)
s3_c.head_object(Bucket=bucket, Key=key)

22-rebuilt-final actionfem/actionfem-1928.jsonl.bz2 s3://22-rebuilt-final/actionfem/actionfem-1928.jsonl.bz2


{'ResponseMetadata': {'RequestId': 'tx00000a293b46b9407dd86-006863e9f1-2487f759b-default',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Tue, 01 Jul 2025 14:00:18 GMT',
   'content-type': 'binary/octet-stream',
   'content-length': '768962',
   'connection': 'keep-alive',
   'accept-ranges': 'bytes',
   'last-modified': 'Thu, 15 May 2025 22:34:15 GMT',
   'x-rgw-object-type': 'Normal',
   'etag': '"27912107d7250c8ffc3e2ede5c6caab6"',
   'x-amz-meta-impresso-last-ts': '2024-03-04T18:34:32Z',
   'x-amz-meta-mtime': '1709577450.524',
   'x-amz-request-id': 'tx00000a293b46b9407dd86-006863e9f1-2487f759b-default',
   'strict-transport-security': 'max-age=31536000; includeSubdomains'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2025, 5, 15, 22, 34, 15, tzinfo=tzutc()),
 'ContentLength': 768962,
 'ETag': '"27912107d7250c8ffc3e2ede5c6caab6"',
 'ContentType': 'binary/octet-stream',
 'Metadata': {'impresso-last-t

In [None]:
def add_provider_to_s3_partition(src_bucket, dest_bucket, exact_partition, perform_copy=False, metadata_directive="COPY"):

    s3 = get_s3_client()

    paginator = s3.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=src_bucket, Prefix=exact_partition)

    current_alias = None
    provider = None
    for page in page_iterator:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            #print(f"        key: {key}, key.split('/')[-2]: {key.split('/')[-2]}")
            if key[:-1]==exact_partition:
                print(f"partition: {key}")
            elif not key.endswith(".jsonl.bz2") and key.endswith("/"):
                current_alias = key.split('/')[-2]
                provider = get_provider_for_alias(current_alias)
                print(f"Now processing alias {current_alias} - provider = {provider}")
            elif key.endswith(".jsonl.bz2"):

                # check if we have now changed Alias or provider
                new_alias, found_prov = get_alias_from_path(key, exact_partition)

                if current_alias!=new_alias:
                    current_alias=new_alias
                    provider = get_provider_for_alias(current_alias)
                    print(f"Found new alias in key - Now processing alias {current_alias} - provider = {provider}")

                #print(f"    file key: {key} - current_alias: {current_alias}, provider={provider}, exact_partition: {exact_partition}.")
                dest_key = construct_dest_key(key, provider, exact_partition, current_alias, found_prov)
                if perform_copy:
                    if dest_key!=key:

                        try: 
                            existing_dest = s3.head_object(Bucket=dest_bucket, Key=dest_key)
                            print(f"    The destination key {dest_key} already exists in bucket {dest_bucket}, skipping.")
                        except Exception as e:
                            # the destination does not exist yet, perform the copy
                            #existing_metadata = s3_c.head_object(Bucket=src_bucket, Key="SOC_VS/SOC_VS-1944.jsonl.bz2").get("Metadata", {})
                            if False:
                                print(f"    File key {key} will be copied to here: {dest_key} in bucket {dest_bucket}")
                                with disable_interrupts():
                                    s3.copy_object(
                                        Bucket=dest_bucket,
                                        Key=dest_key,
                                        CopySource={"Bucket": src_bucket, "Key": key},
                                        MetadataDirective=metadata_directive,
                                    )
                    else:
                        print(f"    will NOT copy {key} to {dest_key} - same key or no copy")
            else:
                print(f"    -> another file: {key}")

## TODO add function to delete after, check that the alias is in aliases but also that the provider is in providers (eg. NZZ!!)

In [14]:
#og_partition = "s3://42-processed-data-final/langident/langident_v1-4-4"
og_partition = "s3://120-rebuilt-sandbox/"

parsed = urlparse(og_partition)
src_bucket = parsed.netloc
exact_partition = parsed.path.lstrip("/")

dest_bucket = '120-rebuilt-sandbox'

src_bucket, exact_partition

('120-rebuilt-sandbox', '')

In [17]:
add_provider_to_s3_partition(src_bucket, dest_bucket, exact_partition, perform_copy=True)

Found new alias in key - Now processing alias ACI - provider = BCUL
    The destination key BCUL/ACI/ACI-1832.jsonl.bz2 already exists in bucket 120-rebuilt-sandbox, skipping - {'ResponseMetadata': {'RequestId': 'tx00000649f3fed13c070f6-00686535f6-2456ef177-default', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'nginx', 'date': 'Wed, 02 Jul 2025 13:36:54 GMT', 'content-type': 'binary/octet-stream', 'content-length': '271664', 'connection': 'keep-alive', 'accept-ranges': 'bytes', 'last-modified': 'Tue, 01 Jul 2025 15:21:49 GMT', 'x-rgw-object-type': 'Normal', 'etag': '"3a187e2b4ce700bc508e856c79c59887"', 'x-amz-meta-impresso-last-ts': '2024-03-26T16:28:05Z', 'x-amz-meta-mtime': '1711470487.908', 'x-amz-request-id': 'tx00000649f3fed13c070f6-00686535f6-2456ef177-default', 'strict-transport-security': 'max-age=31536000; includeSubdomains'}, 'RetryAttempts': 0}, 'AcceptRanges': 'bytes', 'LastModified': datetime.datetime(2025, 7, 1, 15, 21, 49, tzinfo=tzutc()), 'ContentLeng

In [22]:

add_provider_to_s3_partition(src_bucket, dest_bucket, exact_partition, perform_copy=True)

Found new alias in key - Now processing alias ACI - provider = BCUL
Found new alias in key - Now processing alias AV - provider = BCUL
Found new alias in key - Now processing alias ACI - provider = BCUL
construct_dest_key - found_provider is not None (BCUL), skipping, and returning key as-is: BCUL/ACI/ACI-1832.jsonl.bz2
    will NOT copy BCUL/ACI/ACI-1832.jsonl.bz2 to BCUL/ACI/ACI-1832.jsonl.bz2 - same key or no copy
Found new alias in key - Now processing alias AV - provider = BCUL
construct_dest_key - found_provider is not None (BCUL), skipping, and returning key as-is: BCUL/AV/AV-1880.jsonl.bz2
    will NOT copy BCUL/AV/AV-1880.jsonl.bz2 to BCUL/AV/AV-1880.jsonl.bz2 - same key or no copy
construct_dest_key - found_provider is not None (BCUL), skipping, and returning key as-is: BCUL/AV/AV-1881.jsonl.bz2
    will NOT copy BCUL/AV/AV-1881.jsonl.bz2 to BCUL/AV/AV-1881.jsonl.bz2 - same key or no copy
construct_dest_key - found_provider is not None (BCUL), skipping, and returning key as-i

In [21]:
s3_c = get_s3_client()
s3_c.delete_object(
    Bucket=dest_bucket,
    Key="CFCE/CFCE-1996.jsonl.bz2",
)

{'ResponseMetadata': {'RequestId': 'tx000006485d4680a624d6a-0068653cdd-24675da1e-default',
  'HostId': '',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Wed, 02 Jul 2025 14:06:21 GMT',
   'connection': 'keep-alive',
   'x-amz-request-id': 'tx000006485d4680a624d6a-0068653cdd-24675da1e-default',
   'strict-transport-security': 'max-age=31536000; includeSubdomains'},
  'RetryAttempts': 0}}

In [None]:
s3_c.copy_object(
    Bucket=dest_bucket,
    Key="INA/CFCE/CFCE-1996.jsonl.bz2",
    CopySource={"Bucket": src_bucket, "Key": "CFCE/CFCE-1996.jsonl.bz2"},
    MetadataDirective='COPY',
)

In [178]:
existing_metadata = s3_c.head_object(Bucket=src_bucket, Key="CFCE/CFCE-1996.jsonl.bz2").get("Metadata", {})
existing_metadata = s3_c.head_object(Bucket=src_bucket, Key="SOC_VS/SOC_VS-1944.jsonl.bz2").get("Metadata", {})
existing_metadata

{}

In [11]:
src_bucket

'120-rebuilt-sandbox'

In [24]:
s3_c = get_s3_client()
try:
    existing_object = s3_c.head_object(Bucket=src_bucket, Key="BCUL/ACI/ACI-1832.jsonl.bz2")
    print(f"key found!: {existing_object}")
except Exception as e:
    print("key not found")

key found!: {'ResponseMetadata': {'RequestId': 'tx000001ec57d7d744ddffd-006863ffed-2487f759b-default', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'nginx', 'date': 'Tue, 01 Jul 2025 15:34:05 GMT', 'content-type': 'binary/octet-stream', 'content-length': '271664', 'connection': 'keep-alive', 'accept-ranges': 'bytes', 'last-modified': 'Tue, 01 Jul 2025 15:21:49 GMT', 'x-rgw-object-type': 'Normal', 'etag': '"3a187e2b4ce700bc508e856c79c59887"', 'x-amz-meta-impresso-last-ts': '2024-03-26T16:28:05Z', 'x-amz-meta-mtime': '1711470487.908', 'x-amz-request-id': 'tx000001ec57d7d744ddffd-006863ffed-2487f759b-default', 'strict-transport-security': 'max-age=31536000; includeSubdomains'}, 'RetryAttempts': 0}, 'AcceptRanges': 'bytes', 'LastModified': datetime.datetime(2025, 7, 1, 15, 21, 49, tzinfo=tzutc()), 'ContentLength': 271664, 'ETag': '"3a187e2b4ce700bc508e856c79c59887"', 'ContentType': 'binary/octet-stream', 'Metadata': {'impresso-last-ts': '2024-03-26T16:28:05Z', 'mtime': '1

In [179]:
s3_c.copy_object(
    Bucket=dest_bucket,
    Key="SWISSINFO/SOC_VS/SOC_VS-1944.jsonl.bz2",
    CopySource={"Bucket": src_bucket, "Key": "SOC_VS/SOC_VS-1944.jsonl.bz2"},
    MetadataDirective='COPY',
)

{'ResponseMetadata': {'RequestId': 'tx00000aaf97c1716a06036-006863f9dc-2487f774e-default',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Tue, 01 Jul 2025 15:08:12 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'connection': 'keep-alive',
   'x-amz-request-id': 'tx00000aaf97c1716a06036-006863f9dc-2487f774e-default',
   'strict-transport-security': 'max-age=31536000; includeSubdomains'},
  'RetryAttempts': 0},
 'CopyObjectResult': {'ETag': 'f6ed914ee97668222feff2b52f7b0077',
  'LastModified': datetime.datetime(2025, 7, 1, 15, 8, 12, 896000, tzinfo=tzutc())}}

In [159]:
paginator = s3_c.get_paginator("list_objects_v2")

page_iterator = paginator.paginate(Bucket=src_bucket, Prefix=exact_partition)

current_alias = None
provider = None
for page in page_iterator:
    for obj in page.get("Contents", []):
        key = obj["Key"]
        #print(f"        key: {key}, key.split('/')[-2]: {key.split('/')[-2]}")
        if key[:-1]==exact_partition:
            print(f"partition: {key}")
        elif not key.endswith(".jsonl.bz2") and key.endswith("/"):
            print(f"media_alias: {key}")
            current_alias = key.split('/')[-2]
            provider = get_provider_for_alias(current_alias)
            print(f"Now processing alias {current_alias} - provider = {provider}")
        elif key.endswith(".jsonl.bz2"):

            # check if we have now changed Alias or provider
            new_alias, found_prov = get_alias_from_path(key, exact_partition)
            if current_alias!=new_alias:
                current_alias=new_alias
                provider = get_provider_for_alias(current_alias)
                print(f"Found new alias in key - Now processing alias {current_alias} - provider = {provider}")


            #print(f"    file key: {key} - current_alias: {current_alias}, provider={provider}, exact_partition: {exact_partition}.")
            dest_key = construct_dest_key(key, provider, exact_partition, current_alias)
            print(f"    file key: will copy {key} here: {dest_key}")
            if False:
                with disable_interrupts():
                    s3_c.copy_object(
                        Bucket=dest_bucket,
                        Key=dest_key,
                        CopySource={"Bucket": bucket, "Key": key},
                        Metadata=updated_metadata,
                        MetadataDirective="REPLACE",
                        ContentType=head.get("ContentType", "application/octet-stream"),
                    )
        else:
            print(f"    -> another file: {key}")

partition: langident/langident_v1-4-4/
media_alias: langident/langident_v1-4-4/ACI/
Now processing alias ACI - provider = BCUL
og_partition : langident/langident_v1-4-4, current_alias=ACI
    file key: will copy langident/langident_v1-4-4/ACI/ACI-1832.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/ACI/ACI-1832.jsonl.bz2
media_alias: langident/langident_v1-4-4/AV/
Now processing alias AV - provider = BCUL
og_partition : langident/langident_v1-4-4, current_alias=AV
    file key: will copy langident/langident_v1-4-4/AV/AV-1880.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1880.jsonl.bz2
og_partition : langident/langident_v1-4-4, current_alias=AV
    file key: will copy langident/langident_v1-4-4/AV/AV-1881.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1881.jsonl.bz2
og_partition : langident/langident_v1-4-4, current_alias=AV
    file key: will copy langident/langident_v1-4-4/AV/AV-1886.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1886.jsonl.bz2
media_alias: langi

In [None]:
def copy_with_provider(s3, source_key, partition, dest_bucket):

    
    dest_key = construct_dest_key(source_key, partition, src_bucket, dest_bucket)

    if 