In [1]:
import logging

# Use the root logger and emit records at INFO level and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Attach a stream handler so log records show up in the notebook output.
# Skipped when the root logger already has a handler, so re-running this
# cell does not stack a second one.
if len(logger.handlers) == 0:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s\n%(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)

In [2]:
import pandas as pd
import json
import yaml
import os

import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError, ClientError
from boto3.dynamodb.conditions import Key
import time
from datetime import datetime
import pytz
import zlib
import base64
import pickle

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [3]:
import papermill as pm

In [4]:
import sys
# Make the project-local helper module importable.
# NOTE(review): this is a hard-coded absolute path (presumably the directory that
# contains run_pm_utils.py); the cell only works on a host with this exact layout.
sys.path.append('/home/ec2-user/SageMaker/gs-automl/fargate_task/run_papermill_tabular312_langchain/docker')
import run_pm_utils as utils

2024-11-07 06:43:44,079 - INFO
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [5]:
# Presumably the AWS region for this run; no later cell visible in this notebook reads it.
region_name = 'us-east-1'

In [6]:
# Run start time in epoch seconds; the final cell subtracts it to compute 'elapsed'.
start_ts = int(time.time())

# Get associated items from DynamoDB

In [7]:
# Identifiers of the experiment record to run, and the table it is stored in.
project_hashkey, experiment_hashkey = '2ee07a49', 'fc973d19'
table_name = 'automl-regression-experiment'

In [8]:
# Fetch the experiment record identified by the project/experiment hashkeys.
# The helper lives in run_pm_utils; presumably a DynamoDB read — confirm there.
item = utils.get_experiment_item(table_name, project_hashkey, experiment_hashkey)

In [9]:
# Pretty-print the experiment record for inspection.
# NOTE(review): the printed record embeds the full config_yaml (including secret names);
# consider clearing this output before sharing the notebook.
pp.pprint(item)

{   'bucket_name': 'gs-automl-regression',
    'config_yaml': 'WANDB:\n'
                   '  project: gs-automl-forecasting\n'
                   '  secret_name: aws-credential\n'
                   '  secret_key: gs-wandb-yoloseankim1\n'
                   '  region_name: us-east-1\n'
                   '  infra: AWS\n'
                   '  job_type: train-model\n'
                   '  algorithm_name: lightgbm\n'
                   '  wandb_name: wine_quality_analysis\n'
                   '  model_suffix: base\n'
                   '  tags:\n'
                   '  - lightgbm-base\n'
                   '  - wine_quality\n'
                   '  - base\n'
                   'SEED: 2024\n'
                   'SPLIT:\n'
                   '  type: random\n'
                   '  # type: manual\n'
                   '  # manual_split_colname: data_type \n'
                   '  # type: chronological\n'
                   '  # chronological_split_colname: datetime\n'
                 

In [10]:
# Table holding dataset records, and the key of the dataset used by this experiment.
dataset_table_name = 'automl-dataset'
file_hashkey = item['file_hashkey']

In [11]:
# Look up the dataset record for this experiment by its file hashkey.
dataset_item = utils.get_dataset_item(file_hashkey, dataset_table_name)

In [12]:
# Pretty-print the dataset record for inspection.
pp.pprint(dataset_item)

{   'bucket_name': 'gs-automl-data-profile',
    'connector_name': 'upload_csv',
    'created_dt': '2024-11-01 16:10:07',
    'created_ts': Decimal('1730445007'),
    'dataset_name': 'wine quality dataset',
    'dataset_type': 'tabular',
    'deleted': False,
    'file_hashkey': 'a1114a97',
    'file_name': 'Wine_Data_2.csv',
    'profiling': 'done',
    'project_hashkey': '2ee07a49',
    'project_name': 'wine quality 에 영향을 미치는 인자 연구 - 화학성분 중심으로',
    'published': True,
    's3_key_df_file': 'upload/sean@gs.co.kr/a1114a97/df.pickle',
    's3_key_upload_file': 'upload/sean@gs.co.kr/a1114a97/Wine_Data_2.csv',
    'username': 'sean@gs.co.kr'}


In [13]:
# Key of the model used by this experiment, and the table holding model records.
model_hashkey = item['model_hashkey']
model_repo_table_name = 'automl-model-repo'

In [14]:
# Look up the model-repository record for this experiment by its model hashkey.
model_repo_item = utils.get_model_repo_item(model_hashkey, model_repo_table_name)

In [15]:
# Pretty-print the model-repository record for inspection.
pp.pprint(model_repo_item)

{   'bucket_name': 'gs-automl-model-repo',
    'created_dt': '2024-11-07 11:18:38',
    'created_ts': Decimal('1730945918'),
    'deleted': False,
    'ipynb_file_name': 'regression_lgbm.ipynb',
    'kernel_name': 'conda_tabular312_langchain',
    'model_hashkey': 'a43a5c39',
    'model_name': 'regression_lgbm',
    'model_type': 'regression',
    's3_zip_key_path': 'upload/sean@gs.co.kr/a43a5c39/zip_file',
    'uploaded_files': [   'config.yml',
                          'metric.py',
                          'modeling.py',
                          'prep.py',
                          'regression_lgbm.ipynb',
                          'utils.py'],
    'username': 'sean@gs.co.kr',
    'yml_file_name': 'config.yml',
    'zip_file_name': 'regression_lgbm.zip'}


In [16]:
# Bundle the three fetched records into one item that the next cell stores in
# the 'automl-temp' table.
# The timestamp is taken once and reused, so 'pk' and 'created_dt' always
# describe the same instant (two separate time.time() calls can straddle a
# second boundary).
ts = int(time.time())
tmp_item = {
    'pk': str(ts),
    'created_dt': utils.conv_ts_to_dt_str(ts),
    'experiment': item,
    'dataset': dataset_item,
    'model': model_repo_item,
}

In [17]:
# Store the combined snapshot in the 'automl-temp' table.
utils.put_item_to_ddb('automl-temp', tmp_item)

Success : automl-temp table에 item이 저장되었습니다.


# Create environment - copy files from S3

## model files

In [18]:
# Toggle: True uses the modeling code already present in the working directory;
# False fetches the packaged modeling code from S3 first.
testing = True

if not testing:
    logger.info('download modeling codes from S3')
    bucket = model_repo_item['bucket_name']
    key = model_repo_item['s3_zip_key_path']
    utils.download_s3_files_to_directory(bucket, key)
    # config.yml is moved into ./input after the download.
    utils.move_file_to_directory('./config.yml', './input')
else:
    logger.info('use current dev')

2024-11-07 06:43:44,759 - INFO
use current dev


## dataset file

In [19]:
# S3 location of the dataset pickle, taken from the dataset record.
bucket, key = dataset_item['bucket_name'], dataset_item['s3_key_df_file']

In [20]:
# Download the dataset file at (bucket, key) into ./input.
utils.download_s3_file_to_directory(bucket, key, './input')

df.pickle 파일이 './input/df.pickle'에 다운로드되었습니다.


# papermill

In [21]:
# Execute the model notebook with papermill and write the executed copy to
# artifacts/model/<name>_output.ipynb.
try:
    input_nb = model_repo_item['ipynb_file_name']
    output_nb = input_nb.replace('.ipynb','_output.ipynb')

    logger.info(f'input_nb: {input_nb}')
    logger.info(f'output_nb: {output_nb}')
    logger.info('pm started...')

    folder_path = 'artifacts/model'
    os.makedirs(folder_path, exist_ok=True)
    # NOTE(review): the kernel name is hard-coded although model_repo_item also
    # carries a 'kernel_name' field — confirm which one should win.
    pm.execute_notebook(
       input_nb,
       f'{folder_path}/{output_nb}',
       parameters=dict(),
       kernel_name="conda_tabular312_langchain"
    )
    logger.info('pm ended...')
except Exception:
    # Log the full traceback and re-raise. Swallowing the error here would let
    # the following cells upload artifacts and mark the experiment as
    # completed even though the notebook execution failed.
    logger.exception('pm failed...')
    raise

2024-11-07 06:43:45,115 - INFO
input_nb: regression_lgbm.ipynb
2024-11-07 06:43:45,123 - INFO
output_nb: regression_lgbm_output.ipynb
2024-11-07 06:43:45,124 - INFO
pm started...
2024-11-07 06:43:45,124 - INFO
Input Notebook:  regression_lgbm.ipynb
2024-11-07 06:43:45,125 - INFO
Output Notebook: artifacts/model/regression_lgbm_output.ipynb
  from .autonotebook import tqdm as notebook_tqdm
Executing:   0%|          | 0/77 [00:00<?, ?cell/s]2024-11-07 06:43:46,010 - INFO
Executing notebook with kernel: conda_tabular312_langchain
Executing: 100%|██████████| 77/77 [02:16<00:00,  1.77s/cell]
2024-11-07 06:46:01,799 - INFO
pm ended...


# upload artifacts to S3

In [22]:
# Pretty-print the experiment record again before using its S3 fields below.
# NOTE(review): same output as the earlier pprint of this record; consider removing
# or clearing it before sharing the notebook.
pp.pprint(item)

{   'bucket_name': 'gs-automl-regression',
    'config_yaml': 'WANDB:\n'
                   '  project: gs-automl-forecasting\n'
                   '  secret_name: aws-credential\n'
                   '  secret_key: gs-wandb-yoloseankim1\n'
                   '  region_name: us-east-1\n'
                   '  infra: AWS\n'
                   '  job_type: train-model\n'
                   '  algorithm_name: lightgbm\n'
                   '  wandb_name: wine_quality_analysis\n'
                   '  model_suffix: base\n'
                   '  tags:\n'
                   '  - lightgbm-base\n'
                   '  - wine_quality\n'
                   '  - base\n'
                   'SEED: 2024\n'
                   'SPLIT:\n'
                   '  type: random\n'
                   '  # type: manual\n'
                   '  # manual_split_colname: data_type \n'
                   '  # type: chronological\n'
                   '  # chronological_split_colname: datetime\n'
                 

In [23]:
# Upload parameters: local source directory and S3 destination
# (<experiment s3_key_prefix>/artifacts in the experiment's bucket).
local_directory = "artifacts"
bucket = item['bucket_name']
s3_prefix = f"{item['s3_key_prefix']}/artifacts"

In [24]:
# Upload the local artifacts directory to S3 under s3_prefix; the returned value
# is stored in the experiment-result item in the final cell.
# NOTE(review): the recorded output shows every "Uploaded ..." line twice — possibly
# a second log handler inside run_pm_utils propagating to the root logger; confirm.
artifacts = utils.upload_directory_to_s3(local_directory, bucket, s3_prefix)

2024-11-07 06:46:02,026 - INFO
Uploaded artifacts/model/feature_importance.csv to s3://gs-automl-regression/upload/sean@gs.co.kr/fc973d19/artifacts/model/feature_importance.csv
2024-11-07 06:46:02,026 - INFO
Uploaded artifacts/model/feature_importance.csv to s3://gs-automl-regression/upload/sean@gs.co.kr/fc973d19/artifacts/model/feature_importance.csv
2024-11-07 06:46:02,171 - INFO
Uploaded artifacts/model/regression_lgbm_output.ipynb to s3://gs-automl-regression/upload/sean@gs.co.kr/fc973d19/artifacts/model/regression_lgbm_output.ipynb
2024-11-07 06:46:02,171 - INFO
Uploaded artifacts/model/regression_lgbm_output.ipynb to s3://gs-automl-regression/upload/sean@gs.co.kr/fc973d19/artifacts/model/regression_lgbm_output.ipynb
2024-11-07 06:46:02,258 - INFO
Uploaded artifacts/model/config.yml to s3://gs-automl-regression/upload/sean@gs.co.kr/fc973d19/artifacts/model/config.yml
2024-11-07 06:46:02,258 - INFO
Uploaded artifacts/model/config.yml to s3://gs-automl-regression/upload/sean@gs.co.k

In [25]:
# Pretty-print what the upload helper returned (per the recorded output, a mapping
# of S3 prefixes to lists of file names).
pp.pprint(artifacts)

{   'upload/sean@gs.co.kr/fc973d19/artifacts/.': [],
    'upload/sean@gs.co.kr/fc973d19/artifacts/ebm_images': [   'fig_density.png',
                                                              'fig_volatile_acidity.png',
                                                              'fig_citric_acid.png',
                                                              'fig_free_sulfur_dioxide.png',
                                                              'fig_total_sulfur_dioxide.png',
                                                              'fig_fixed_acidity.png',
                                                              'fig_chlorides.png',
                                                              'fig_sulphates.png',
                                                              'fig_pH.png',
                                                              'fig_alcohol.png',
                                                              'fig_residual_sugar.png'],
    '

# Save items to DynamoDB

In [26]:
# Completion time in epoch seconds; reused by the two cells below for the
# updated/created timestamps and the elapsed-time calculation.
ts = int(time.time())

In [27]:
# Mark the experiment record as finished and stamp the update time.
# '실험 완료' is the status value stored for a completed experiment.
item['status'] = '실험 완료'
item['updated_dt'] = utils.conv_ts_to_dt_str(ts)
item['updated_ts'] = ts

# Write back to the same table the record was read from (table_name, set near
# the top of the notebook) instead of repeating the table name as a literal,
# so the read and the write cannot drift apart.
utils.put_item_to_ddb(table_name, item)

Success : automl-regression-experiment table에 item이 저장되었습니다.


In [28]:
# Build the experiment-result record: the uploaded artifact listing plus the
# identifying fields copied from the experiment record, and the run duration.
# Each name field is taken from its own source field; 'dataset_name' appears
# only once (a repeated key in a dict literal silently keeps the last value).
exp_res_item = {
    'artifacts': artifacts,
    'bucket_name': item['bucket_name'],
    's3_key_prefix': item['s3_key_prefix'],
    'created_ts': ts,
    'created_dt': utils.conv_ts_to_dt_str(ts),
    'dataset_name': item['dataset_name'],
    'experiment_hashkey': item['experiment_hashkey'],
    'experiment_name': item['experiment_name'],
    'file_hashkey': item['file_hashkey'],
    'model_hashkey': item['model_hashkey'],
    'model_name': item['model_name'],
    'project_hashkey': item['project_hashkey'],
    'project_name': item['project_name'],
    'username': item['username'],
    # Seconds between the start of the notebook run and completion.
    'elapsed': ts - start_ts,
}

utils.put_item_to_ddb('automl-regression-experiment-result', exp_res_item)

Success : automl-regression-experiment-result table에 item이 저장되었습니다.
