In [None]:
import io
import os
import time
from datetime import datetime, timedelta, timezone

import boto3
import pandas as pd
from botocore.credentials import RefreshableCredentials
from botocore.session import get_session

### Connect to S3

In [None]:
def aws_session(aws_profile='sg_stage'):
    """Collect the current AWS credentials as botocore refresh metadata.

    A default boto3 session is created first. If its credentials were
    resolved from the shared credentials file (i.e. the code runs on a local
    machine), the session is re-created with ``aws_profile`` so that the
    DataScience role profile is used instead.

    Params:
        aws_profile (string): credentials profile name used on local machines
    Returns:
        (dict): 'access_key', 'secret_key', 'token' and 'expiry_time'
            (ISO 8601 string) -- the keys read by
            RefreshableCredentials.create_from_metadata.
    Raises:
        RuntimeError: if boto3 cannot find any credentials.
    """
    session = boto3.Session()
    creds = session.get_credentials()
    # If the session is run on a local machine, with AWS credentials fetched
    # from a shared file, use the DataScience role profile.
    if creds is not None and creds.method == 'shared-credentials-file':
        session = boto3.Session(profile_name=aws_profile)
        creds = session.get_credentials()
    if creds is None:
        raise RuntimeError(
            'No AWS credentials found; configure the environment or the '
            f'{aws_profile!r} profile.'
        )

    # Take one frozen snapshot instead of reading the three attributes
    # separately, so key, secret and token always belong together. This must
    # happen before the expiry lookup: fetching refreshable credentials is
    # what populates their expiry time.
    frozen = creds.get_frozen_credentials()

    # `_expiry_time` is private and only present on refreshable credentials.
    # Static credentials carry no expiry, so give them a rolling one-hour
    # lifetime; the refresh callback then simply re-reads them.
    expiry = getattr(creds, '_expiry_time', None)
    if expiry is None:
        expiry = datetime.now(timezone.utc) + timedelta(hours=1)

    return {
        'access_key': frozen.access_key,
        'secret_key': frozen.secret_key,
        'token': frozen.token,
        'expiry_time': expiry.isoformat(),
    }

# Low-level botocore session pinned to the Singapore region.
SESSION = get_session()
SESSION.set_config_variable("region", 'ap-southeast-1')

# Credentials seeded from one aws_session() call; aws_session is also handed
# over as the callback botocore uses to refresh them.
CREDS = RefreshableCredentials.create_from_metadata(
    method="sts-assume-role",
    refresh_using=aws_session,
    metadata=aws_session(),
)
SESSION._credentials = CREDS

# boto3 session on top of the botocore session; clients are created from it.
AUTO_SESSION = boto3.Session(botocore_session=SESSION)

In [None]:
def s3_client():
    """Return the shared S3 client, creating it on first use.

    The client is built from AUTO_SESSION and cached on the function object,
    so repeated calls reuse one client instead of constructing a new one each
    time. Re-running this cell defines a fresh function and therefore starts
    with an empty cache.

    Returns:
        (boto3 client object): S3 client created from AUTO_SESSION
    """
    client = getattr(s3_client, '_client', None)
    if client is None:
        client = AUTO_SESSION.client('s3')
        s3_client._client = client
    return client

### Generate size of SCD Bucket
The `get_size` function can also be used to obtain the sizes of individual folders in the bucket.

In [None]:
# S3 bucket name used by the size and sample cells of this notebook.
bucket = 'yara-sh-dads-scd-stage'


def generate_all_keys(src, prefix=''):
    '''
    Lists the object keys stored in the src bucket under prefix.

    Every page of list_objects_v2 results is walked; keys that end with
    '/' are left out of the returned list.
    '''
    paginator = s3_client().get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=src, Prefix=prefix)

    return [
        entry['Key']
        for page in pages
        # A page without any objects has no 'Contents' entry.
        for entry in page.get('Contents', ())
        if entry['Key'][-1] != '/'
    ]

# keys = generate_all_keys(bucket)

In [None]:
# Wall-clock start; the elapsed seconds are printed at the end of this cell.
start_time = time.time()


def get_size(src, prefix=''):
    '''
    Generates and returns size of bucket or folder.
    Input parameters are source bucket (src) and prefix, if any.

    The size of every key returned by generate_all_keys(src, prefix) is
    looked up with head_object and summed. A one-line summary in GB
    (rounded to two decimals) is printed; the return value is the
    un-rounded total in bytes.
    '''
    s3 = s3_client()
    total_size = 0

    for key in generate_all_keys(src, prefix):
        # Query the bucket passed in as `src`, not the notebook-level
        # `bucket` variable, so the function works for any bucket.
        response = s3.head_object(Bucket=src, Key=key)
        total_size += float(response['ContentLength'])

    size_gb = round(total_size / (1024*1024*1024),2)
    if prefix == '':
        print(f'size of {src} bucket: {size_gb} GB')
    else:
        folder_name = prefix.strip('/')
        print(f'size of {folder_name} folder: {size_gb} GB')

    return total_size


# Key prefixes ("folders") of the bucket to measure.
folders = ['clean_sample/', 'soil_health_cards/', 'state_soil_tests/']
# One get_size call per folder; each call also prints that folder's size in GB.
sizes = [get_size(bucket, folder) for folder in folders]
# The collected values are the un-rounded totals returned by get_size.
print(sizes)

# Elapsed wall-clock seconds since start_time was taken.
print("took", time.time() - start_time, "to run")

### Generate and Upload small sample (using Bihar)

In [None]:
# Bucket and key of the source file the sample is built from.
bucket = 'yara-sh-dads-scd-stage'
key = 'state_soil_tests/Bihar.csv'
# This client is reused by the upload and download cells further down.
s3 = s3_client()
# Fetch the object; its 'Body' is read in the next cell.
obj = s3.get_object(Bucket=bucket, Key=key)

In [None]:
# Parse the downloaded bytes as tab-separated UTF-8 text.
# NOTE(review): obj['Body'] is presumably a one-shot stream, so re-running this
# cell without re-running the get_object cell would likely read no data -- confirm.
df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8',sep = '\t')
# Preview the first five rows.
df.head(5)

In [None]:
# Keep the first 100 rows as a small test sample and write them to a local
# tab-separated file without the index column.
sample = df.iloc[:100]
sample.to_csv('testsample_shc.csv', sep='\t', index=False)

In [None]:
# Local file name of the sample and the S3 key it is uploaded to.
sample_file = 'testsample_shc.csv'
sample_key = f'state_soil_tests/{sample_file}'

# Push the local sample file to the bucket; a missing local file is reported
# instead of raising.
try:
    s3.upload_file(sample_file, bucket, sample_key)
except FileNotFoundError:
    print("The file was not found")
else:
    print("Upload Successful")

### Download sample from S3

In [None]:
# Fetch the uploaded sample back from the bucket into the local file; any
# error is printed rather than raised.
try:
    s3.download_file(bucket, sample_key, sample_file)
except Exception as err:
    print(err)
else:
    print("Download Successful")