In [1]:
import sys
import polars as pl
sys.path.append('/Users/ianmcallister/tooling/lakeinterface')

from lakeinterface.datalake import Datalake

In [2]:
# Instantiate the Datalake wrapper with the AWS profile named 'personal'.
lake = Datalake(aws_profile='personal')

#### Fixing `get` when the path is a full object key

In [2]:
# Instantiate the Datalake wrapper with the AWS profile named 'machinesp'.
# NOTE(review): this cell and the 'personal' one above both carry execution count In [2],
# so they were presumably run in different kernel sessions — confirm which profile the
# cells below expect.
lake = Datalake(aws_profile='machinesp')

In [17]:
# Two candidate inputs: the commented-out line is the containing prefix, the active
# line is the full key of a single object.
#pth = 'msp-cmo/pipeline/prosup_parsing/output/AGENCY=FHL/5065/paydown/Group 1'
pth = 'msp-cmo/pipeline/prosup_parsing/output/AGENCY=FHL/5065/paydown/Group 1/20240729/data.json'
# List what the lake reports for that exact key.
lake.list_objects(pth)

['msp-cmo/pipeline/prosup_parsing/output/AGENCY=FHL/5065/paydown/Group 1/20240729/data.json']

In [6]:
def parse_path(path):
    """Split a ``bucket/key`` style path into ``(bucket, key)``.

    One trailing ``/`` is dropped before splitting. Everything before the first
    remaining ``/`` is the bucket; everything after it is the key (or prefix),
    which is ``''`` when the path holds a bucket name only.

    Args:
        path: Path string such as ``'bucket/some/prefix/'``.

    Returns:
        tuple[str, str]: ``(bucket, key)``. An empty ``path`` yields ``('', '')``.
    """
    # endswith() is safe on '', where indexing path[-1] raised IndexError.
    if path.endswith('/'):
        path = path[:-1]

    # partition() splits on the first '/' only, so the key keeps its inner slashes
    # and no separate two-part special case is needed.
    bucket, _, key = path.partition('/')
    return bucket, key


In [19]:
# Split the test path into the bucket name and the key/prefix inside it.
bucket, prefix = parse_path(pth)

In [20]:
# Build a 'list_objects_v2' paginator from lake.s3.s3 (presumably the underlying boto3
# S3 client — confirm) and request the pages for this bucket/prefix.
paginator = lake.s3.s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket, Prefix=prefix)

In [21]:
# Flatten every page into one list of 'bucket/key' strings, skipping zero-byte objects.
# A single nested comprehension replaces sum(list_of_lists, []), which re-copies the
# accumulated list once per page; elements and their order are unchanged.
all_objects = [
    f"{bucket}/{obj['Key']}"
    for page in pages
    for obj in page.get('Contents', [])
    if obj['Size'] > 0
]

In [22]:
# Display the flattened listing.
all_objects

['msp-cmo/pipeline/prosup_parsing/output/AGENCY=FHL/5065/paydown/Group 1/20240729/data.json']

In [23]:
# Keep the object whose full name equals bucket/prefix, plus anything stored beneath
# bucket/prefix/. startswith() anchors the comparison at the start of the name; the
# earlier substring test (`in`) would also accept the same text appearing later in a key.
[o for o in all_objects if o == f'{bucket}/{prefix}' or o.startswith(f'{bucket}/{prefix}/')]

['msp-cmo/pipeline/prosup_parsing/output/AGENCY=FHL/5065/paydown/Group 1/20240729/data.json']

#### Checking `get`, `list_objects` and `most_recent`

In [8]:
# Prefix under which the call-report test objects live.
path = 'machinesp-datasets/test/banks/call_reports/raw'

# Ask the lake.s3 helper for every object under the prefix and for the one it reports
# as most recent.
matched_objects = lake.s3.list_objects(path)
most_recent = lake.s3.most_recent(path)

print(f'Matched objects: {matched_objects}')
print(f'Most recent: {most_recent}')

Matched objects: ['machinesp-datasets/test/banks/call_reports/raw/20200331/data.parquet', 'machinesp-datasets/test/banks/call_reports/raw/20210331/data.parquet', 'machinesp-datasets/test/banks/call_reports/raw/20230630/data.parquet']
Most recent: machinesp-datasets/test/banks/call_reports/raw/20230630/data.parquet


In [9]:
# Load whatever lake.get() resolves for the prefix and preview the first rows.
# The recorded rows carry filing_date 2023-06-30, which suggests get() reads the
# most recent object — confirm against the Datalake implementation.
lake.get(path).head()

concept,idrssd,filing_date,context,namespace,unit,decimals,value
str,str,date,str,str,str,i64,str
"""RCONA573""","""1000052""",2023-06-30,"""CI_1000052_202…","""www.ffiec.gov/…","""USD""",0,"""145046000"""
"""RIAD4356""","""1000052""",2023-06-30,"""CD_1000052_202…","""www.ffiec.gov/…","""USD""",0,"""0"""
"""RCONA571""","""1000052""",2023-06-30,"""CI_1000052_202…","""www.ffiec.gov/…","""USD""",0,"""51634000"""
"""RCONA570""","""1000052""",2023-06-30,"""CI_1000052_202…","""www.ffiec.gov/…","""USD""",0,"""76364000"""
"""RCONS498""","""1000052""",2023-06-30,"""CI_1000052_202…","""www.ffiec.gov/…","""USD""",0,"""0"""


In [13]:
# Same prefix, listed through the top-level Datalake API instead of lake.s3.
keys = lake.list_objects('machinesp-datasets/test/banks/call_reports/raw')

In [19]:
# Display the listing.
# NOTE(review): the recorded output omits the bucket name, unlike the lake.s3.list_objects
# output earlier in this notebook — confirm which form callers should expect.
keys

['test/banks/call_reports/raw/20200331/data.parquet',
 'test/banks/call_reports/raw/20210331/data.parquet',
 'test/banks/call_reports/raw/20230630/data.parquet']

In [6]:
# Most recent object under the prefix, via the top-level Datalake API.
lake.most_recent('machinesp-datasets/test/banks/call_reports/raw')

'machinesp-datasets/test/banks/call_reports/raw/20230630/data.parquet'

#### Testing put

In [12]:
# Round-trip fixture: a small two-column polars frame, written under pytest/example1.
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pl.DataFrame(data=d)
lake.put('machinesp-datasets/pytest/example1', df)

In [13]:
# Check what the put created; the recorded output lists example1/data.parquet.
lake.list_objects('machinesp-datasets/pytest')

['machinesp-datasets/pytest/example1/data.parquet']

In [14]:
# Read the frame back from the same path.
lake.get('machinesp-datasets/pytest/example1')

col1,col2
i64,i64
1,3
2,4


In [16]:
# Put the plain dict `d` (not a DataFrame) at a second path.
lake.put('machinesp-datasets/pytest/json_example', d)

In [17]:
# Read it back; the recorded output is the same dict that was stored.
lake.get('machinesp-datasets/pytest/json_example')

{'col1': [1, 2], 'col2': [3, 4]}

In [6]:
# Set a 'saved' metadata entry on the stored parquet object, then read its metadata back.
# NOTE(review): the key is passed without the 'machinesp-datasets/' bucket prefix used by
# the put/get cells above — confirm that lake.s3 expects this form.
lake.s3.update_metadata('pytest/example1/data.parquet', {'saved': '20231101'})
lake.s3.fetch_metadata('pytest/example1/data.parquet')

{'saved': '20231101', 'foo': 'bar'}

In [1]:
from lakeinterface.logger import add_cloudwatch_handler, add_stream_handler
import logging
import json
import watchtower

from pythonjsonlogger import jsonlogger
import boto3

In [2]:
# Named logger used by the handler experiments in the following cells.
logger = logging.getLogger('machinesp-test')

In [3]:
def clear_all_handlers(logger_name):
    """Detach every handler currently attached to the logger named ``logger_name``."""
    target = logging.getLogger(logger_name)
    # Walk a snapshot, because removeHandler() mutates target.handlers as it goes.
    for handler in list(target.handlers):
        target.removeHandler(handler)


In [4]:
# Remove whatever handlers are currently attached to 'machinesp-test'.
clear_all_handlers('machinesp-test')

In [5]:
import logging
import watchtower

from pythonjsonlogger import jsonlogger
import boto3

def _boto_filter(record):
    # Filter log messages from botocore and its dependency, urllib3, in watchtower handler for CloudWatch.
    # This is required to avoid an infinite loop when shutting down.
    if record.name.startswith("botocore"):
        return False
    if record.name.startswith("urllib3"):
        return False
    return True


# CloudWatch handler for the JSON logging experiment: log group 'machinesp/test',
# stream 'json_lake_tester', send_interval=5, credentials from the 'personal' profile.
# The log group is expected to exist already (create_log_group=False).
wtower_handler = watchtower.CloudWatchLogHandler(
    log_group_name='machinesp/test',
    log_stream_name='json_lake_tester',
    send_interval=5,
    create_log_group=False,
    boto3_profile_name='personal'
)

# Attach the filter to the handler rather than to the logger. A logger-level filter is
# consulted only for records logged through that very logger ('machinesp-test'), so a
# check on botocore/urllib3 logger names could never reject anything there. On the
# handler it screens every record the handler is asked to emit.
wtower_handler.addFilter(_boto_filter)

# Emit each record as a JSON object so its fields can be matched by CloudWatch
# filter patterns such as {$.tag="test2"}.
formatter = jsonlogger.JsonFormatter()

wtower_handler.setFormatter(formatter)
logger.addHandler(wtower_handler)

# Let DEBUG and above through at the logger level.
logger.setLevel(logging.DEBUG)

In [13]:
# Emit one INFO record; the recorded CloudWatch event further down shows the `extra`
# keys ('foo', 'tag') as top-level JSON fields next to "message".
logger.info('test message', extra={'foo': 'bar', 'tag': 'test2'})

In [17]:
# Two ways of attaching fields: via `extra`, and by logging a dict as the message itself.
# For the dict form, the recorded event shows the keys as JSON fields with an empty "message".
logger.info('test message', extra={'foo': 'bar', 'tag': 'test2'})
logger.info({'a': 'foo', 'tag': 'test2', 'b': 'bar'})

In [14]:
# CloudWatch Logs client from boto3's default session (no profile is passed here,
# unlike the handler, which names the 'personal' profile).
client = boto3.client('logs')

In [18]:
# CloudWatch Logs client from boto3's default session.
client = boto3.client('logs')

# Fetch events from the 'json_lake_tester' stream of 'machinesp/test' whose JSON body
# has tag == "test2".
response = client.filter_log_events(
    logGroupName='machinesp/test',
    logStreamNames=['json_lake_tester'],
    filterPattern='{$.tag="test2"}'
)

In [20]:
# Inspect the matched events.
response['events']

[{'logStreamName': 'json_lake_tester',
  'timestamp': 1709390447232,
  'message': '{"message": "test message", "foo": "bar", "tag": "test2"}',
  'ingestionTime': 1709390452141,
  'eventId': '38120680808523138523223916546780937155534035796132233216'},
 {'logStreamName': 'json_lake_tester',
  'timestamp': 1709398127212,
  'message': '{"message": "", "a": "foo", "tag": "test2", "b": "bar"}',
  'ingestionTime': 1709398132285,
  'eventId': '38120852077800248334439040362990969515684461698837446656'}]

In [7]:
# DEBUG-level record carrying one extra field.
logger.debug('test message 2', extra={'foo':'bar'})

In [2]:
# Settings for the console (stream) handler: INFO and above, prefixed with the logger name.
CONSOLE_LOG_HANDLER = {
    'handler_type': 'stream',
    'level': logging.INFO,
    'format': '%(name)s - %(levelname)s - %(message)s',
}

# Settings for the CloudWatch handler: DEBUG and above, sent to stream 'lake_tester'
# in log group 'machinesp/test' using the 'personal' AWS profile.
CLOUDWATCH_LOG_HANDLER = {
    'handler_type': 'cloudwatch',
    'log_group_name': 'machinesp/test',
    'log_stream_name': 'lake_tester',
    'level': logging.DEBUG,
    'format': '%(levelname)s - %(message)s',
    'aws_profile_name': 'personal',
}

# Hand both settings dicts to the lakeinterface helpers for the 'machinesp-test'
# logger: stream first, then CloudWatch.
add_stream_handler('machinesp-test', CONSOLE_LOG_HANDLER)
add_cloudwatch_handler('machinesp-test', CLOUDWATCH_LOG_HANDLER)

In [3]:
# Fetch the same named logger and let DEBUG and above through at the logger level.
logger = logging.getLogger('machinesp-test')
logger.setLevel(logging.DEBUG)

In [13]:
# 0 is logging.NOTSET: a non-root logger at NOTSET defers to its ancestors' level
# instead of accepting every record itself.
# NOTE(review): if the aim was to let DEBUG through, logging.DEBUG states that
# explicitly — confirm intent.
logger.setLevel(0)

In [14]:
# Emit a DEBUG record to see what the configured handlers let through.
logger.debug('this is debug')

In [2]:
import logging, watchtower
# Root logging configured with a format that references `clientip` and `user`, so every
# record reaching the root handler must carry those two attributes.
FORMAT = '%(asctime)s %(clientip)-15s %(user)-8s %(message)s'
logging.basicConfig(format=FORMAT)
# The two required attributes, supplied per call through `extra`.
d = {'clientip': '192.168.0.1', 'user': 'fbloggs'}
logger = logging.getLogger('tcpserver')
logger.warning('Protocol problem: %s', 'connection reset', extra=d)


2024-03-01 21:46:50,017 192.168.0.1     fbloggs  Protocol problem: connection reset


In [8]:
# Same extra fields as the basicConfig example; not referenced again in this cell.
d = {'clientip': '192.168.0.1', 'user': 'fbloggs'}

# Plain-text formatter whose pattern comes from the CloudWatch settings dict.
w_format = logging.Formatter(CLOUDWATCH_LOG_HANDLER.get('format'))

# CloudWatch handler built from the same settings dict, with send_interval=10.
# The log group is expected to exist already (create_log_group=False).
wtower_handler = watchtower.CloudWatchLogHandler(
    boto3_profile_name=CLOUDWATCH_LOG_HANDLER.get('aws_profile_name'),
    create_log_group=False,
    log_group_name=CLOUDWATCH_LOG_HANDLER.get('log_group_name'),
    log_stream_name=CLOUDWATCH_LOG_HANDLER.get('log_stream_name'),
    send_interval=10,
)
wtower_handler.setFormatter(w_format)
wtower_handler.setLevel(CLOUDWATCH_LOG_HANDLER.get('level'))

# No _boto_filter is attached in this variant (that call was left commented out).
logger.addHandler(wtower_handler)

In [9]:
# ERROR record carrying the `clientip` and `user` fields that the root format references.
logger.error('test message', extra = {'clientip': '192.168.0.1', 'user': 'fbloggs'})

2024-03-01 21:49:12,701 192.168.0.1     fbloggs  test message


In [20]:
# Log a JSON-encoded list as the message text.
# The root format set up through basicConfig references `clientip` and `user`; a record
# without them cannot be formatted by the root handler, so they are supplied via `extra`
# exactly as in the other logging calls of this experiment.
logger.warning(
    json.dumps([{'bar': 'foo'}, {'bar': 'celona'}]),
    extra={'clientip': '192.168.0.1', 'user': 'fbloggs'},
)

