# In [None]:
import gzip
import os
import tempfile

import boto3
from botocore.exceptions import ClientError
from tqdm import tqdm

class ProgressBar(tqdm):
    """tqdm progress bar usable as a boto3 S3 transfer ``Callback``.

    boto3 invokes a transfer callback periodically with the number of bytes
    moved since the previous invocation (an increment, not a running total),
    so every reported amount is added to the bar.
    """

    def update_to(self, bytes_transferred):
        """Advance the bar by one callback's worth of bytes.

        Args:
            bytes_transferred (int): Bytes transferred since the previous
                callback invocation, as reported by boto3.

        Returns:
            None
        """
        # The amount is already a delta. Subtracting self.n (as one would for
        # a cumulative count) makes the bar stall after the first chunk.
        self.update(bytes_transferred)


def download_file_with_progress(s3_client, bucket_name, key, local_file):
    """
    Fetch one S3 object to disk while rendering a byte-level progress bar.

    Args:
        s3_client: boto3 S3 client used for the size lookup and the transfer.
        bucket_name (str): Bucket that holds the object.
        key (str): Object key within the bucket.
        local_file (str): Destination path on the local filesystem.

    Returns:
        None
    """
    # A HEAD request supplies the object size so the bar has a known total.
    object_info = s3_client.head_object(Bucket=bucket_name, Key=key)
    total_bytes = object_info['ContentLength']

    bar_options = dict(total=total_bytes, unit="B", unit_scale=True, desc="Downloading")
    with ProgressBar(**bar_options) as bar:
        # The client reports transfer progress through the callback.
        s3_client.download_file(
            Bucket=bucket_name,
            Key=key,
            Filename=local_file,
            Callback=bar.update_to,
        )


def upload_file_with_progress(s3_client, local_file, bucket_name, key):
    """
    Send one local file to S3 while rendering a byte-level progress bar.

    Args:
        s3_client: boto3 S3 client that performs the transfer.
        local_file (str): Path of the file on the local filesystem.
        bucket_name (str): Destination bucket.
        key (str): Object key to store the file under.

    Returns:
        None
    """
    # The on-disk size is the bar's total.
    total_bytes = os.path.getsize(local_file)

    bar_options = dict(total=total_bytes, unit="B", unit_scale=True, desc="Uploading")
    with ProgressBar(**bar_options) as bar:
        # The client reports transfer progress through the callback.
        s3_client.upload_file(
            Filename=local_file,
            Bucket=bucket_name,
            Key=key,
            Callback=bar.update_to,
        )

        
def download_extract_upload_s3(bucket_name, gz_key, extracted_key, aws_region="us-east-1"):
    """
    Downloads a .gz file from S3, extracts its contents, and uploads the extracted file back to S3.

    The archive and its extracted copy are kept in a private temporary
    directory that is removed when the function returns, whether or not the
    work succeeded, so nothing in the current working directory is touched.
    Data is decompressed as raw bytes, so the payload does not need to be text.

    S3 errors (``ClientError`` and boto3's ``S3UploadFailedError``) are printed
    and swallowed. Errors raised while decompressing or writing locally
    (e.g. ``OSError``) are not caught and propagate to the caller.

    Args:
        bucket_name (str): The name of the S3 bucket.
        gz_key (str): The key of the .gz file in the S3 bucket.
        extracted_key (str): The key for the extracted file to be uploaded back to S3.
        aws_region (str): The AWS region of the S3 bucket.

    Returns:
        None
    """
    s3_client = boto3.client("s3", region_name=aws_region)

    # A key ending in "/" has an empty last component; fall back to a fixed
    # name so a usable local path can still be built.
    base_name = extracted_key.split('/')[-1] or "extracted"
    chunk_size = 64 * 1024

    with tempfile.TemporaryDirectory() as work_dir:
        local_extracted_file = os.path.join(work_dir, base_name)
        local_gz_file = f'{local_extracted_file}.gz'

        try:
            # Step 1: Download the .gz file from S3
            print(f"Downloading {gz_key} from S3 bucket {bucket_name}...")
            download_file_with_progress(s3_client, bucket_name, gz_key, local_gz_file)
            print(f"Downloaded {gz_key} to {local_gz_file}")

            # Step 2: Decompress to a sibling file. The raw file is opened
            # separately so its read position (compressed bytes consumed) can
            # drive a progress bar whose total is the compressed size.
            gz_file_size = os.path.getsize(local_gz_file)
            with open(local_gz_file, 'rb') as raw_file, \
                    gzip.GzipFile(fileobj=raw_file, mode='rb') as gz_file, \
                    open(local_extracted_file, 'wb') as extracted_file, \
                    tqdm(total=gz_file_size, unit='B', unit_scale=True,
                         desc=f'Extracting {os.path.basename(local_gz_file)}') as progress_bar:
                while chunk := gz_file.read(chunk_size):
                    extracted_file.write(chunk)
                    progress_bar.update(raw_file.tell() - progress_bar.n)
            print(f'Extracted content saved to {local_extracted_file}')

            # Step 3: Upload the extracted file back to S3
            print(f"Uploading extracted file to S3 bucket {bucket_name} with key {extracted_key}...")
            upload_file_with_progress(s3_client, local_extracted_file, bucket_name, extracted_key)
            print(f"Uploaded extracted file to s3://{bucket_name}/{extracted_key}")

        except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
            # upload_file reports failures as S3UploadFailedError rather than
            # ClientError, so both are needed to cover download and upload.
            print(f"Error: {e}")


def process_all_files(bucket_name, prefix, aws_region="us-east-1", min_index=0, max_index=100000):
    """
    Processes all .gz files in an S3 bucket with a given prefix.

    Objects under the prefix are listed page by page, so buckets holding more
    objects than a single listing response returns are fully covered. Every
    listed object, whether or not its key ends in ".gz", is assigned a
    zero-based index in listing order. Only objects whose index falls in the
    half-open range [min_index, max_index) are considered, and of those only
    keys ending in ".gz" are downloaded, extracted and re-uploaded under the
    same key with that suffix removed.

    S3 ``ClientError``s are printed rather than raised.

    Args:
        bucket_name (str): The name of the S3 bucket.
        prefix (str): The prefix of the .gz files in the S3 bucket.
        aws_region (str): The AWS region of the S3 bucket.
        min_index (int): Index of the first listed object to consider (inclusive).
        max_index (int): Index at which to stop (exclusive).

    Returns:
        None
    """
    s3_client = boto3.client("s3", region_name=aws_region)
    gz_suffix = ".gz"

    try:
        # List all .gz files in the bucket with the given prefix
        print(f"Listing .gz files in bucket {bucket_name} with prefix {prefix}...")
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
        # Pages are fetched lazily; a page with no matches has no "Contents".
        listed_objects = (obj for page in pages for obj in page.get("Contents", []))

        found_any = False
        for index, obj in enumerate(listed_objects):
            found_any = True
            if index >= max_index:
                # Nothing past this point can be selected; stop listing.
                break
            if index < min_index:
                continue

            gz_key = obj["Key"]
            if not gz_key.endswith(gz_suffix):
                continue

            # Strip only the trailing suffix; ".gz" elsewhere in the key stays.
            extracted_key = gz_key[:-len(gz_suffix)]
            print(f"Processing file: {gz_key}")
            download_extract_upload_s3(bucket_name, gz_key, extracted_key, aws_region)

        if not found_any:
            print("No files found.")

    except ClientError as e:
        print(f"Error listing files in S3: {e}")


# Run configuration: which bucket and key prefix to scan, and the bucket's region.
bucket_name = "steve-sagemaker-data-bucket"
aws_region = "eu-west-2"  # Region hosting the bucket
prefix = "papers/"  # Only keys under this prefix are listed

# Restrict this run to listing index 0 (half-open range [0, 1)).
process_all_files(bucket_name, prefix, aws_region, min_index=0, max_index=1)