In [40]:
import os
from io import BytesIO

import boto3
from botocore.exceptions import ClientError

from common.file.mimetype import get_mimetype
from common.process.args import delimit, join, parse_order_arg
from common.process.ascii import to_ascii
from common.process.day import Day
from common.process.when import HOUR

S3_BASE_URL = "https://s3.amazonaws.com"


class S3:  # pragma: no cover
    def __init__(self, key, secret, bucket, dir="", region="us-east-1"):
        """
        The optional dir let's you set a base directory for all of the methods. For example:

          s3 = S3(key, secret, 'mybucket', dir='mydir')
          path = s3.download('/tmp/myfile.txt')

        will download the s3 object from s3://mybucket/mydir/myfile.txt to /tmp/myfile.txt.
        """
        self.key = to_ascii(key)
        self.secret = to_ascii(secret)
        self.bucket = bucket
        self.dir = (os.path.normpath(dir) + os.path.sep) if dir else ""
        self.region = region
        self.client = None

    def connect(self):
        if not self.client:
            self.client = boto3.client(
                "s3",
                aws_access_key_id=self.key,
                aws_secret_access_key=self.secret,
                region_name=self.region,
                use_ssl=True,
            )

    def list(
        self,
        remote=None,
        include_dirs=True,
        include_files=True,
        delimiter="/",
        prefix=None,
        order_by=None,
        since=None,
        start_after=None,
        expanded=False,
    ):
        """Ordering works only within each (truncated) batch.
        Filters files older than the provided 'since' timestamp in UTC.
        Returns file objects instead of filenames when 'expanded' is True."""
        if order_by:
            order_by, reverse = parse_order_arg(order_by)

        def yielder(items):
            if order_by:
                items = sorted(items, key=lambda x: x.get(order_by, ""), reverse=reverse)
            for item in items:
                is_dir = item.get("Prefix")
                if is_dir:
                    if include_dirs:
                        yield item if expanded else os.path.basename(item["Prefix"][:-1])
                else:
                    if include_files:
                        if item["Key"].endswith(delimiter):
                            continue
                        if since and item["LastModified"] < Day(since).datetime:
                            continue
                        yield item if expanded else os.path.basename(item["Key"])

        self.connect()
        remote = self.get_remote(remote).rstrip("/") + "/"
        prefix = os.path.join(remote, prefix or "")

        params = {"Bucket": self.bucket, "Prefix": prefix, "Delimiter": delimiter}
        if start_after:
            params["StartAfter"] = start_after
        truncated = True
        while truncated:
            response = self.client.list_objects_v2(**params)
            if response.get("KeyCount"):
                if response.get("Contents"):
                    yield from yielder(response["Contents"])
                if response.get("CommonPrefixes"):
                    yield from yielder(response["CommonPrefixes"])
            truncated = response["IsTruncated"]
            if truncated:
                params["ContinuationToken"] = response["NextContinuationToken"]

    def upload(
        self,
        src,
        remote=None,
        public=False,
        content_type=None,
        download_filename=None,
        encoding="utf-8",
        content_disposition="attachment",
    ):
        """src can be a file object, a file path, or content in string/bytes format."""
        self.connect()
        needs_closing = True

        # prepare
        remote = self.get_remote(remote or os.path.basename(src))
        content_type = content_type or get_mimetype(src)
        if hasattr(src, "seek"):
            needs_closing = False
        elif isinstance(src, bytes):
            src = BytesIO(src)
        elif isinstance(src, str):
            if os.path.exists(src):
                src = open(src, "rb")  # convert path to file object
            else:
                src = BytesIO(src.encode(encoding))
        else:
            raise Exception(f"Unexpected input type: {type(src)}")

        # upload
        params = {
            "ACL": "public-read" if public else "private",
            "ContentType": content_type,
        }
        if download_filename:
            params["ContentDisposition"] = f'{content_disposition}; filename="{download_filename}"'
        try:
            self.client.upload_fileobj(src, self.bucket, remote, ExtraArgs=params)
        finally:
            if needs_closing:
                src.close()

    def download(self, local, remote=None):
        self.connect()
        remote = self.get_remote(remote or os.path.basename(local))
        self.client.download_file(Bucket=self.bucket, Key=remote, Filename=local)

    def download_object(self, remote=None):
        self.connect()
        remote = self.get_remote(remote)
        temp = BytesIO()
        self.client.download_fileobj(Bucket=self.bucket, Key=remote, Fileobj=temp)
        return temp.getvalue()

    def last_modified(self, remote):
        self.connect()
        remote = self.get_remote(remote)
        metadata = self.client.head_object(Bucket=self.bucket, Key=remote)
        return metadata["LastModified"]

    def file_size(self, remote):
        self.connect()
        remote = self.get_remote(remote)
        metadata = self.client.head_object(Bucket=self.bucket, Key=remote)
        return metadata["ContentLength"]

    def exists(self, remote):
        try:
            self.last_modified(remote)
            return True
        except ClientError:
            return False

    def delete(self, remote):
        self.connect()
        remote = self.get_remote(remote)
        self.client.delete_object(Bucket=self.bucket, Key=remote)

    def get_remote(self, path):
        if not path:
            path = os.path.dirname(self.dir) or ""
        elif self.dir:
            path = os.path.join(self.dir, path)
        return path

    def get_url(self, path):
        parts = [S3_BASE_URL, self.bucket, self.get_remote(path)]
        return delimit([part.strip("/") for part in parts], sep="/")

    def get_signed_url(self, path, bucket=None, dir=None, expiry=HOUR, params=None):
        self.connect()
        params = params or {}
        params.update({"Bucket": bucket or self.bucket, "Key": join(dir or self.dir, path)})
        return self.client.generate_presigned_url("get_object", Params=params, ExpiresIn=expiry)

In [41]:
from nozama.utils.aws import s3_connect
s3 = s3_connect()

In [None]:
bucket_name = "mpd-biblio"
remote_path = "prd/covers/original/"

s3 = s3_connect(bucket=bucket_name)

local_directory = "/data/d/Desktop/"

os.makedirs(local_directory, exist_ok=True)

print("S3 Bucket:", bucket_name)
print("S3 Remote Path:", remote_path)

# List all objects in the S3 bucket
all_objects = list(s3.list(remote_path))

print("All Objects List:", all_objects)

for obj in objects[:10]:
    print(obj)
    file_name = os.path.basename(obj)
    local_path = os.path.join(local_directory, file_name)

    # Download the file
    s3.download(local_path, remote=os.path.join(remote_path, file_name))


S3 Bucket: mpd-biblio
S3 Remote Path: prd/covers/original/


In [44]:
list(s3.list(remote_path))

['9780174436089.jpg',
 '9780174436256.jpg',
 '9780174436270.jpg',
 '9780230000124.jpg',
 '9780230000285.jpg',
 '9780230000292.jpg',
 '9780230000308.jpg',
 '9780230000315.jpg',
 '9780230000346.jpg',
 '9780230000353.jpg',
 '9780230000377.jpg',
 '9780230000384.jpg',
 '9780230001435.jpg',
 '9780230001442.jpg',
 '9780230001480.jpg',
 '9780230001541.jpg',
 '9780230001558.jpg',
 '9780230001619.jpg',
 '9780230001626.jpg',
 '9780230001756.jpg',
 '9780230001763.jpg',
 '9780230001770.jpg',
 '9780230001787.jpg',
 '9780230001794.jpg',
 '9780230001800.jpg',
 '9780230001886.jpg',
 '9780230001893.jpg',
 '9780230002289.jpg',
 '9780230002296.jpg',
 '9780230002456.jpg',
 '9780230003347.jpg',
 '9780230003507.jpg',
 '9780230003514.jpg',
 '9780230004603.jpg',
 '9780230004627.jpg',
 '9780230004634.jpg',
 '9780230004641.jpg',
 '9780230005174.jpg',
 '9780230005181.jpg',
 '9780230005402.jpg',
 '9780230005419.jpg',
 '9780230006324.jpg',
 '9780230006720.jpg',
 '9780230006737.jpg',
 '9780230006744.jpg',
 '97802300

In [45]:
remote_path

'prd/covers/original/'

In [47]:
s3.bucket

'mpd-biblio'

In [61]:
import boto3
import os 

# set aws credentials 
s3r = boto3.resource('s3', aws_access_key_id='',
    aws_secret_access_key='')
bucket = s3r.Bucket('mpd-biblio')

# downloading folder 
prefix = 'dirname'
for object in bucket.objects.filter(Prefix = 'dirname'):
    print(object)
    if object.key == prefix:
        os.makedirs(os.path.dirname(object.key), exist_ok=True)
        continue;
    bucket.download_file(object.key, object.key)

In [63]:
from nozama.utils import aws

s3 = aws.s3_connect(bucket='mpd-biblio')
image_list = s3.list('prd/covers/original')
list(image_list)


['9780174436089.jpg',
 '9780174436256.jpg',
 '9780174436270.jpg',
 '9780230000124.jpg',
 '9780230000285.jpg',
 '9780230000292.jpg',
 '9780230000308.jpg',
 '9780230000315.jpg',
 '9780230000346.jpg',
 '9780230000353.jpg',
 '9780230000377.jpg',
 '9780230000384.jpg',
 '9780230001435.jpg',
 '9780230001442.jpg',
 '9780230001480.jpg',
 '9780230001541.jpg',
 '9780230001558.jpg',
 '9780230001619.jpg',
 '9780230001626.jpg',
 '9780230001756.jpg',
 '9780230001763.jpg',
 '9780230001770.jpg',
 '9780230001787.jpg',
 '9780230001794.jpg',
 '9780230001800.jpg',
 '9780230001886.jpg',
 '9780230001893.jpg',
 '9780230002289.jpg',
 '9780230002296.jpg',
 '9780230002456.jpg',
 '9780230003347.jpg',
 '9780230003507.jpg',
 '9780230003514.jpg',
 '9780230004603.jpg',
 '9780230004627.jpg',
 '9780230004634.jpg',
 '9780230004641.jpg',
 '9780230005174.jpg',
 '9780230005181.jpg',
 '9780230005402.jpg',
 '9780230005419.jpg',
 '9780230006324.jpg',
 '9780230006720.jpg',
 '9780230006737.jpg',
 '9780230006744.jpg',
 '97802300

In [67]:
import boto3
import os

s3 = boto3.client('s3')
bucket_name = 'mpd-biblio'
prefix = 'prd/covers/original'

# List objects in the specified bucket and prefix
response = s3.list_objects(Bucket=bucket_name, Prefix=prefix)

# Download each object (image) to a local directory within the Docker container
container_local_directory = '/data/d/Desktop/downloadimg/'

if not os.path.exists(container_local_directory):
    os.makedirs(container_local_directory)

for obj in response.get('Contents', []):
    key = obj['Key']
    local_filename = os.path.basename(key)  # Get the filename from the key
    local_path = os.path.join(container_local_directory, local_filename)

    s3.download_file(bucket_name, key, local_path)
    print(f"Downloaded {key} to {local_path}")



NotADirectoryError: [Errno 20] Not a directory: '/data/d/Desktop/downloadimg/.d6f750F5' -> '/data/d/Desktop/downloadimg/'

In [71]:
from nozama.data import title
from datetime import date, timedelta
import pandas as pd

# get title metadata
df_title = title.get_title_data()

# filter to frontlist
df_filtered = df_title[pd.to_datetime(df_title['pub_date']).dt.date > date.today() - timedelta(days=365)]

# filter to hardcover (only one isbn per frontlist work)
df_filtered = df_filtered[df_filtered['medium'] == 'TC']

# get frontlist hardcover images
isbns = df_filtered['isbn'].tolist()

for isbn in isbns:
    key = f'prd/covers/original/{isbn}.jpg'
    local_path = f'/data/d/Desktop/downloadimg/{isbn}.jpg'

    try:
        s3.download_file(bucket_name, key, local_path)
        print(f"Downloaded {key} to {local_path}")
    except FileNotFoundError:
        print(f"Image not found for {key}.")
    except Exception as e:
        print(f"Error downloading {key}: {e}")

print("Download complete.")


Getting title data


OperationalError: 250001: Could not connect to Snowflake backend after 0 attempt(s).Aborting