In [None]:
#| default_exp datalake

## Datalake interface

In [None]:
#| hide

import sys
from pathlib import Path

# Add the parent of the current working directory to the import path
# (presumably so the local `lakeinterface` package resolves from the notebook — confirm).
sys.path.append(str(Path('.').absolute().parent))

In [None]:
#| exporti

import boto3
import pandas as pd
import json
from io import BytesIO

from lakeinterface.config import ConfigManager


In [None]:
#| exporti

class S3ObjectNotFound(Exception):
    """Signals that no object exists in the S3 bucket for the requested key."""


class Datalake(object):
    """
    A class to wrap interface to an AWS S3 datalake
    Implemented as a singleton to reduce number of live sessions

    Note: ``__new__`` always hands back the same instance, but Python still
    calls ``__init__`` on every ``Datalake(...)`` call, so the session, client
    and bucket are replaced by the most recent arguments.

    Attributes
    ----------
    session: a boto3 session
    s3 : a boto3 S3 client
    bucket : S3 bucket location of lake

    Methods
    -------
    __init__(config, profile_name='default'):
        Initializes the AWS S3 client using AWS profile_name and dict of parameters from ConfigManager

    get_object(key):
        Core method for loading objects using boto3 S3 client

    load_csv(key, delimiter=',', skiprows=None, line_terminator=None):
        Loads csv object with S3 key = key

    load_json(key):
        Loads json object with S3 key = key

    list_objects(prefix):
        Lists the keys of all objects with S3 prefix = prefix (empty list if none)

    save_json(path, data, timestamp=None):
        Saves json object to specified path with an optional timestamp that will be inserted into path

    put_object(key, data, metadata=None):
        Core method for saving objects using boto3 S3 client

    most_recent(prefix):
        Returns the key of the single object matching the prefix, or None when
        zero or several objects match.
        NOTE(review): the name suggests picking the newest ``timestamp=`` key,
        but no timestamp comparison is implemented - confirm intended behaviour.

    put(path, df, timestamp=None):
        Saves a dataframe as parquet to specified path with an optional timestamp that will be inserted into path

    get(path):
        Loads parquet object from specified path as a dataframe

    """
    _instance = None

    def __new__(cls, config, profile_name='default'):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is None:
            print('Creating the object')
            cls._instance = super(Datalake, cls).__new__(cls)
            # Put any initialization here.
        return cls._instance

    def __init__(self, config, profile_name='default'):
        """
        Create the boto3 session and S3 client and remember the bucket name.

        Parameters
        ----------
        config : mapping-like object exposing ``get``; the 'bucket' entry is read
        profile_name : AWS profile name handed to ``boto3.session.Session``
        """
        print('init-ing', self.__dict__)
        self.session = boto3.session.Session(profile_name=profile_name)

        self.bucket = config.get('bucket')
        self.s3 = self.session.client('s3')

    def get_object(self, key):
        """
        Fetch the raw S3 response for ``key`` from the configured bucket.

        Raises
        ------
        S3ObjectNotFound : when S3 reports the error code 'NoSuchKey'
        Any other exception raised by the client is propagated unchanged.
        """
        try:
            return self.s3.get_object(Bucket=self.bucket, Key=key)
        except Exception as e:
            # Not every exception carries a ``response`` attribute; look it up
            # defensively so unrelated failures are re-raised as themselves
            # instead of being masked by an AttributeError.
            response = getattr(e, 'response', None)
            error = response.get('Error', {}) if isinstance(response, dict) else {}
            if error.get('Code') == 'NoSuchKey':
                raise S3ObjectNotFound('No S3 object with key = %s' % key) from e
            raise

    def load_csv(self, key, delimiter=',', skiprows=None, line_terminator=None):
        """
        Load the object stored at ``key`` as a dataframe via ``pd.read_csv``.

        ``line_terminator`` is only forwarded (as ``lineterminator``) when truthy.
        """
        obj = self.get_object(key)
        read_kwargs = {'delimiter': delimiter, 'skiprows': skiprows}
        if line_terminator:
            read_kwargs['lineterminator'] = line_terminator
        return pd.read_csv(obj['Body'], **read_kwargs)

    def load_json(self, key):
        """Load the object stored at ``key`` and parse its body as JSON."""
        obj = self.get_object(key)
        return json.loads(obj['Body'].read())

    def list_objects(self, prefix):
        """
        Return the keys of every object in the bucket whose key starts with ``prefix``.

        Returns an empty list when nothing matches.
        """
        paginator = self.s3.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=self.bucket, Prefix=prefix)

        # A page has no 'Contents' entry when no object matches, hence .get().
        return [obj['Key'] for page in pages for obj in page.get('Contents', [])]

    def save_json(self, path, data, timestamp=None):
        """
        Serialise ``data`` with ``json.dumps`` and store it under ``path``.

        The key is ``{path}/timestamp={timestamp}/data.json`` when a truthy
        timestamp is given, otherwise ``{path}/data.json``.
        Returns the result of ``put_object``.
        """
        if timestamp:
            key = f'{path}/timestamp={timestamp}/data.json'
        else:
            key = f'{path}/data.json'

        return self.put_object(key, json.dumps(data))

    def put_object(self, key, data, metadata=None):
        """
        Store ``data`` at ``key`` in the configured bucket.

        Parameters
        ----------
        key : S3 key to write
        data : body passed to the S3 client
        metadata : optional dict sent as the object's ``Metadata``; omitted
            from the request when empty or None

        Returns
        -------
        True when the client reports HTTP status 200.

        Raises
        ------
        Exception : for any client failure or non-200 status; the original
            error is attached as ``__cause__``.
        """
        request = {'Bucket': self.bucket, 'Key': key, 'Body': data}
        if metadata:
            # Copy so the caller's dict is never shared with the request.
            request['Metadata'] = dict(metadata)

        try:
            resp = self.s3.put_object(**request)
            status_code = resp['ResponseMetadata']['HTTPStatusCode']
            if status_code == 200:
                return True
            else:
                raise Exception(f'Unknown error. Status code: {status_code}')
        except Exception as e:
            raise Exception(f'Unknown error in put object for {key}. {str(e)}') from e

    def put(self, path, df, timestamp=None):
        """
        Write ``df`` as gzip-compressed parquet (pyarrow engine) under ``path``.

        The key is ``{path}/timestamp={timestamp}/data.parquet`` when a truthy
        timestamp is given, otherwise ``{path}/data.parquet``.
        Returns a human-readable status string.
        """
        if timestamp:
            key = f'{path}/timestamp={timestamp}/data.parquet'
        else:
            key = f'{path}/data.parquet'

        out_buffer = BytesIO()
        df.to_parquet(
            out_buffer,
            index=True,
            engine='pyarrow',
            compression='gzip',
            allow_truncated_timestamps=True
        )
        # put_object raises on failure, so the else branch is only a safeguard.
        if self.put_object(key, out_buffer.getvalue()):
            return f'Saved to {key}'
        else:
            return f'Unknown error in save_parquet: {key}'

    def most_recent(self, prefix):
        """
        Return the key of the single object matching ``prefix``.

        Prints a message and returns None when zero or more than one object
        matches.
        """
        matched_objects = self.list_objects(prefix=prefix)

        if len(matched_objects) > 1:
            print(f'Multiple objects found for prefix {prefix}')
            return None
        elif len(matched_objects) == 0:
            print(f'No objects found for prefix {prefix}')
            return None
        else:
            return matched_objects[0]

    def get(self, path):
        """
        Load the parquet object found under ``path`` as a dataframe.

        Returns None when the lookup fails or does not resolve to exactly one
        object (``most_recent`` prints the reason).
        """
        try:
            key = self.most_recent(path)
        except Exception as e:
            # Report the requested path: ``key`` is not bound if the lookup raised.
            print(f'No objects found with path: {path}. {e}')
            return None

        if key is None:
            return None

        resp = self.get_object(key)
        return pd.read_parquet(BytesIO(resp['Body'].read()))
    

In [None]:
# Fetch the 'bankdata' configuration through a ConfigManager built with the 'personal' profile.
cfgmgr = ConfigManager(profile='personal')
cfg = cfgmgr.fetch_config('bankdata')

In [None]:
# Instantiate the datalake wrapper; the bucket name is read from cfg.
lake = Datalake(cfg, profile_name='personal')

Creating the object
init-ing {}


In [None]:
# Build a small two-column DataFrame to use as sample data.
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [None]:
# Save the sample frame as parquet; the Datalake instance is named `lake` (no `s3` is defined).
lake.put('test/example2', df)

'Saved to test/example2/data.parquet'

In [None]:
# List every key under the 'test' prefix; the Datalake instance is named `lake` (no `s3` is defined).
lake.list_objects(prefix='test')

['test/example1/data.parquet',
 'test/example2/data.parquet',
 'test/put_example/data.parquet']

In [None]:
# Read the parquet object back as a DataFrame; the Datalake instance is named `lake` (no `s3` is defined).
lake.get('test/example1')

Unnamed: 0,col1,col2
0,1,3
1,2,4
