# Debug notebook for the script that adds a provider layer to S3 keys

### Imports

In [33]:
import os
import boto3
from impresso_essentials.io.s3 import (
    IMPRESSO_STORAGEOPT,
    get_s3_resource,
    get_s3_client
)
from impresso_essentials.utils import get_provider_for_alias
from urllib.parse import urlparse

### Implement the functions that add the provider to a given S3 path

In [54]:
def get_alias_from_path(source_path, og_partition):
    """Return the media alias: the first path segment found after the partition.

    Args:
        source_path: S3 path or key of a file, e.g.
            ``"<partition>/CL/CL-1883.jsonl.bz2"``.
        og_partition: Prefix under which the alias directories live, with or
            without a trailing slash. An empty string means there is no prefix.

    Returns:
        The first ``/``-separated segment of ``source_path`` once the leading
        partition has been removed. When ``source_path`` does not start with
        the partition, its own first segment is returned.
    """
    if og_partition != '' and not og_partition.endswith('/'):
        og_partition = og_partition + "/"

    # Strip the partition only where it is a leading prefix: str.replace would
    # also delete later occurrences and could mangle the alias itself
    # (e.g. partition "data" with alias "metadata").
    if og_partition and source_path.startswith(og_partition):
        source_path = source_path[len(og_partition):]

    return source_path.split('/')[0]

In [11]:
# Sanity check of the alias extraction on a sample file.
# Both values are full s3:// URIs; the alias is the segment right after the partition.
s3_path = "s3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2"
partition = "s3://42-processed-data-final/langident/langident_v1-4-4"

get_alias_from_path(s3_path, partition)

'CL'

In [79]:
def reconstruct_dest(src_key, og_partition, provider):
    """Insert ``provider`` as a new path level right after the partition prefix.

    Args:
        src_key: Key of the source object, e.g. ``"<partition>/CL/CL-1883.jsonl.bz2"``.
        og_partition: Key prefix under which the alias directories live, with
            or without a trailing slash. An empty string means the alias is
            the first element of the key.
        provider: Name of the provider level to insert.

    Returns:
        ``"<partition>/<provider>/<rest of key>"`` when ``src_key`` lies under
        the partition, ``"<provider>/<src_key>"`` when there is no partition,
        and ``src_key`` unchanged when it does not start with the partition.
    """
    partition = og_partition.rstrip('/')

    # if there is no specific partition, the provider becomes the first element of the key
    if partition == '':
        return os.path.join(provider, src_key)

    # Only rewrite the leading prefix, at a path-segment boundary: str.replace
    # would also rewrite later occurrences of the partition inside the key
    # (e.g. partition "langident" inside "langident_v1-4-4").
    prefix = partition + '/'
    if src_key.startswith(prefix):
        return os.path.join(partition, provider, src_key[len(prefix):])

    return src_key

In [80]:
def construct_dest_key(source_key, curr_alias, provider, og_partition):
    """Build the destination key of a file by adding its provider after the partition.

    Args:
        source_key: Key of the source object.
        curr_alias: Media alias the caller believes it is currently processing.
        provider: Provider associated with ``curr_alias``.
        og_partition: Partition (key prefix) under which the alias directories live.

    Returns:
        The destination key, as produced by ``reconstruct_dest``.
    """
    media_alias = get_alias_from_path(source_key, og_partition)

    # Guard against a stale alias/provider pair: if the alias read from the key
    # differs from the one the caller passed, report it and look up the
    # provider for the alias actually found in the key.
    if media_alias != curr_alias:
        print(f"PROBLEM: curr_alias={curr_alias}, media_alias: {media_alias}")
        provider = get_provider_for_alias(media_alias)

    return reconstruct_dest(source_key, og_partition, provider)

In [61]:
s3_path = "s3://22-rebuilt-final/actionfem/actionfem-1928.jsonl.bz2"
#s3_path = "s3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2"

og_partition = "s3://22-rebuilt-final"
#og_partition = "s3://42-processed-data-final/langident/langident_v1-4-4"

# Drop the "s3://<bucket>" part so the partition is relative to the bucket, like the key.
stripped_partition = '/'.join(og_partition.split('/')[3:])

print(stripped_partition)
parsed = urlparse(s3_path)
src_bucket = parsed.netloc
src_key = parsed.path.lstrip("/")
dest_bucket = "122-rebuilt-staging"

# construct_dest_key takes the current alias and its provider in addition to
# the key and the (bucket-relative) partition.
sample_alias = get_alias_from_path(src_key, stripped_partition)
sample_provider = get_provider_for_alias(sample_alias)

construct_dest_key(src_key, sample_alias, sample_provider, stripped_partition)


media_alias: actionfem


'BNL/actionfem/actionfem-1928.jsonl.bz2'

In [38]:
# Display the current value of s3_path.
s3_path

's3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2'

In [41]:
# Issue a HEAD request for the sample object and display the response.
s3_c = get_s3_client()

# Split the s3:// URI into the bucket (netloc) and the key (path without its leading slash).
parsed = urlparse(s3_path)
bucket, key = parsed.netloc, parsed.path.lstrip("/")

print(bucket, key, s3_path)
s3_c.head_object(Bucket=bucket, Key=key)

42-processed-data-final langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2 s3://42-processed-data-final/langident/langident_v1-4-4/CL/CL-1883.jsonl.bz2


{'ResponseMetadata': {'RequestId': 'tx00000b598947e121a2de1-00685c0c1d-24675da1e-default',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Wed, 25 Jun 2025 14:47:57 GMT',
   'content-type': 'application/octet-stream',
   'content-length': '7467',
   'connection': 'keep-alive',
   'accept-ranges': 'bytes',
   'last-modified': 'Wed, 25 Jun 2025 12:30:22 GMT',
   'x-rgw-object-type': 'Normal',
   'etag': '"b38bbcb92b63dfcb0ab3bc15a0a1e3cc"',
   'x-amz-meta-impresso-last-ts': '2025-06-25T12:30:22Z',
   'x-amz-meta-mtime': '1712560509.656617582',
   'x-amz-request-id': 'tx00000b598947e121a2de1-00685c0c1d-24675da1e-default',
   'strict-transport-security': 'max-age=31536000; includeSubdomains'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2025, 6, 25, 12, 30, 22, tzinfo=tzutc()),
 'ContentLength': 7467,
 'ETag': '"b38bbcb92b63dfcb0ab3bc15a0a1e3cc"',
 'ContentType': 'application/octet-stream',
 'Metadata': {'imp

In [65]:
# Partition to process, given as a full s3:// URI.
og_partition = "s3://42-processed-data-final/langident/langident_v1-4-4"

# The netloc is the bucket; the path without its leading slash is the key prefix.
parsed = urlparse(og_partition)
bucket, exact_partition = parsed.netloc, parsed.path.lstrip("/")

bucket, exact_partition

('42-processed-data-final', 'langident/langident_v1-4-4')

In [82]:
# Dry run: list every object under the partition and print where it would be copied.
paginator = s3_c.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=bucket, Prefix=exact_partition)

current_alias = None
provider = None
for page in page_iterator:
    # .get tolerates pages that carry no "Contents" entry.
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if key[:-1] == exact_partition:
            # the partition's own "directory" placeholder key
            print(f"partition: {key}")
        elif key.endswith("/"):
            # "directory" placeholder key of a media alias
            current_alias = key.split('/')[-2]
            provider = get_provider_for_alias(current_alias)
            print(f"Now processing alias {current_alias} - provider = {provider}")
        elif key.endswith(".jsonl.bz2"):
            # Do not rely on a placeholder key having been listed before the
            # files: derive the alias from the file key itself and refresh the
            # provider whenever it changes.
            file_alias = get_alias_from_path(key, exact_partition)
            if file_alias != current_alias:
                current_alias = file_alias
                provider = get_provider_for_alias(current_alias)
                print(f"Now processing alias {current_alias} - provider = {provider}")
            dest_key = construct_dest_key(key, current_alias, provider, exact_partition)
            print(f"    file key: will copy {key} here: {dest_key}")
        else:
            print(f"    -> another file: {key}")

partition: langident/langident_v1-4-4/
Now processing alias ACI - provider = BCUL
    file key: will copy langident/langident_v1-4-4/ACI/ACI-1832.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/ACI/ACI-1832.jsonl.bz2
Now processing alias AV - provider = BCUL
    file key: will copy langident/langident_v1-4-4/AV/AV-1880.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1880.jsonl.bz2
    file key: will copy langident/langident_v1-4-4/AV/AV-1881.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1881.jsonl.bz2
    file key: will copy langident/langident_v1-4-4/AV/AV-1886.jsonl.bz2 here: langident/langident_v1-4-4/BCUL/AV/AV-1886.jsonl.bz2
Now processing alias BDC - provider = SNL
    file key: will copy langident/langident_v1-4-4/BDC/BDC-1839.jsonl.bz2 here: langident/langident_v1-4-4/SNL/BDC/BDC-1839.jsonl.bz2
Now processing alias BLB - provider = SNL
    file key: will copy langident/langident_v1-4-4/BLB/BLB-1845.jsonl.bz2 here: langident/langident_v1-4-4/SNL/BLB/BLB-1845.jsonl.

In [None]:
def copy_with_provider(s3, source_key, partition, dest_bucket):

    
    dest_key = construct_dest_key(source_key, partition, src_bucket, dest_bucket)

    if 