In [1]:
import pandas as pd
import json
import yaml
import os
import shutil

import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError, ClientError
from boto3.dynamodb.conditions import Key
import time
from datetime import datetime
import pytz
import zlib
import base64
import pickle
import traceback

In [2]:
import json
import boto3
import argparse
import warnings
import papermill as pm
import time
import os
import pickle
import shutil
import traceback
import pprint
pp = pprint.PrettyPrinter(width=41, compact=True, indent=4)

# user
import run_pm_utils as utils

import logging

# 로거 생성 및 설정
logger = logging.getLogger()  # 기본 로거 가져오기
logger.setLevel(logging.INFO)  # 로그 수준 설정

# Jupyter Notebook에 출력될 수 있도록 스트림 핸들러 추가
if not logger.handlers:
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s\n%(message)s')
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)


warnings.filterwarnings('ignore')

sfn_client = boto3.client('stepfunctions')

In [3]:
region_name = 'us-east-1'

In [4]:
def get_experiment_item(table_name, project_hashkey, experiment_hashkey, job_type):
    try:
        # DynamoDB 리소스 생성
        dynamodb = boto3.resource('dynamodb', region_name=region_name)

        # 테이블 객체 생성
        table = dynamodb.Table(table_name)

        # 키 조건 설정
        if job_type == 'training':
            key = {
                'project_hashkey': project_hashkey,
                'experiment_hashkey': experiment_hashkey,
            }
        elif job_type == 'inference':
            key = {
                'project_hashkey': project_hashkey,
                'inference_hashkey': experiment_hashkey,
            }
        # 항목 조회
        response = table.get_item(Key=key)

        # 항목 반환
        return response.get('Item', None)
    except Exception as e:
        print(f"오류 발생: {e}")
        raise e 

In [5]:
c_payload = {
  "cluster": "automl-fargate-cluster",
  "task_definition": "automl-fargate-papermill-tabular312-langchain",
  "table_name": "automl-classification-experiment",
  "project_hashkey": "2ee07a49",
  "experiment_hashkey": "1cbd8309",
  "dataset_table_name": "automl-dataset",
  "dataset_profile_table_name": "automl-dataset-profile-experiment-result",
  "model_repo_table_name": "automl-model-repo",
  "model_experiment_table_name": "automl-classification-experiment",
  "username": "sean@gs.co.kr",
  "job_type": "training",
  "cpu": 2048,
  "memory": 4096,
  "account_id": "703671896240",
  "ecs_role_name": "RoleStack-automlcdkprodEcsTaskExecutionRoleD78B1370-B8HLhToU7Ysq",
  "subnet": "subnet-0c930109e965894d3",
  "securityGroup": "sg-031827c50dde9638f"
}

In [12]:
i_payload = {
  "cluster": "automl-fargate-cluster",
  "task_definition": "automl-fargate-papermill-tabular312-langchain",
  "table_name": "automl-inference-experiment",
  "project_hashkey": "2ee07a49",
  "experiment_hashkey": "5ae46df0",
  "dataset_table_name": "automl-dataset",
  "dataset_profile_table_name": "automl-dataset-profile-experiment-result",
  "model_repo_table_name": "automl-model-repo",
  "model_experiment_result_table_name": "automl-classification-experiment-result",
  "username": "sean@gs.co.kr",
  "job_type": "inference",
  "cpu": 2048,
  "memory": 4096,
  "account_id": "703671896240",
  "ecs_role_name": "RoleStack-automlcdkprodEcsTaskExecutionRoleD78B1370-B8HLhToU7Ysq",
  "subnet": "subnet-0c930109e965894d3",
  "securityGroup": "sg-031827c50dde9638f"
}

In [13]:
payload = i_payload

In [14]:
table_name = payload['table_name']
project_hashkey = payload['project_hashkey']
experiment_hashkey = payload['experiment_hashkey']
dataset_table_name = payload['dataset_table_name']
dataset_profile_table_name = payload['dataset_profile_table_name']
model_repo_table_name = payload['model_repo_table_name']
model_experiment_result_table_name = payload['model_experiment_result_table_name']
username = payload['username']
job_type = payload['job_type']

In [15]:
task_token = 'dummy'

In [21]:
try:
    # init --------------------------------------------------------------------------------------------------------
    logger.info('# initialized ------------------')
    logger.info('# get items from ddb ------------------')

    root = './work'
    artifacts_dir = f'{root}/{job_type}/artifacts'
    input_dir = f'{root}/{job_type}/input'

    item = utils.get_experiment_item(table_name, project_hashkey, experiment_hashkey, job_type)
        
    file_hashkey = item['dataset_info']['file_hashkey']
    model_repo_hashkey = item['model_info']['model_hashkey']
    saved_model_hashkey = item['model_info']['experiment_hashkey']


    dataset_item = utils.get_dataset_item(project_hashkey, file_hashkey, dataset_table_name)
    dataset_profile_item = utils.get_dataset_profile_item(project_hashkey, file_hashkey, dataset_profile_table_name)
    model_repo_item = utils.get_model_repo_item(model_repo_hashkey, model_repo_table_name)
    saved_model_item = utils.get_saved_model_item(project_hashkey, saved_model_hashkey, model_experiment_result_table_name)
    
    ts = int(time.time())
    log_item = {
        'pk': str(ts),
        'created_dt': utils.conv_ts_to_dt_str(ts),
        'experiment': item,
        'dataset': dataset_item,
        'profile': dataset_profile_item,
        'model_repo':  model_repo_item,
        'saved_model':  saved_model_item,
        'username': username,
    }
    # dir make
    os.makedirs(artifacts_dir, exist_ok=True)
    os.makedirs(input_dir, exist_ok=True)
    # 피클 파일로 저장
    with open(f'{input_dir}/log_item.pkl', 'wb') as file:
        pickle.dump(log_item, file)
    shutil.copy(f'{input_dir}/log_item.pkl', f'{artifacts_dir}/log_item.pkl')

    logger.info('# download dataset from s3 ------------------')
    bucket = dataset_item['bucket_name']
    s3_key_df_file = dataset_item['s3_key_df_file']
    logger.info('%s - %s', bucket, key)
    utils.download_s3_file_to_directory(bucket, s3_key_df_file, input_dir)   

    logger.info('# download model files from s3 ------------------')
    bucket = model_repo_item['bucket_name']
    key = model_repo_item['s3_zip_key_path']
    utils.download_s3_files_to_directory(bucket, key, root)
    
    pp.pprint(saved_model_item)

    logger.info('# download saved model from s3 ------------------')
    bucket = saved_model_item['bucket_name']
    key = f'{saved_model_item['s3_key_prefix']}/artifacts/model' 
    utils.download_s3_files_to_directory(bucket, key, input_dir)

    logger.info('# download config.yml from s3 ------------------')
    bucket = saved_model_item['bucket_name']
    key = f'{saved_model_item['s3_key_prefix']}/artifacts/input' 
    key = f'{key}/config.yml'
    utils.download_s3_file_to_directory(bucket, key, input_dir)      
except Exception as error:
    logger.error(error)
    error_message = str(traceback.format_exc())
    sfn_client.send_task_failure(
        taskToken=task_token,
        error='failed in setting before running papermill',
        cause=error_message,
    )

2024-12-17 09:23:45,260 - INFO
# initialized ------------------
2024-12-17 09:23:45,262 - INFO
# get items from ddb ------------------
2024-12-17 09:23:45,907 - INFO
# download dataset from s3 ------------------
2024-12-17 09:23:45,908 - INFO
gs-automl-data-profile - upload/sean@gs.co.kr/102df50e/artifacts/input/config.yml
2024-12-17 09:23:46,012 - INFO
# download model files from s3 ------------------


df.pickle 파일이 './work/inference/input/df.pickle'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/classification_lgbm.zip 파일이 './work/classification_lgbm.zip'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/common/llm.py 파일이 './work/common/llm.py'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/common/metric.py 파일이 './work/common/metric.py'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/common/modeling.py 파일이 './work/common/modeling.py'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/common/prep.py 파일이 './work/common/prep.py'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/common/utils.py 파일이 './work/common/utils.py'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/inference/inference_classification_lgbm.ipynb 파일이 './work/inference/inference_classification_lgbm.ipynb'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/training/config.sample.yml 파일이 './work/training/config.sample.yml'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/training/feature_dependency_prom

2024-12-17 09:23:46,793 - INFO
# download saved model from s3 ------------------


upload/sean@gs.co.kr/7576947d/zip_file/training/summary_prompt.md 파일이 './work/training/summary_prompt.md'에 다운로드되었습니다.
upload/sean@gs.co.kr/7576947d/zip_file/training/training_classification_lgbm.ipynb 파일이 './work/training/training_classification_lgbm.ipynb'에 다운로드되었습니다.
{   'artifacts': {   'files': [   'log_item.pkl',
                                  'feature_importance.csv',
                                  'factorized_unique_values.pkl',
                                  'metric.csv',
                                  'analysis_llm.pkl',
                                  'important_feature_cols.pkl',
                                  'model.pkl',
                                  'cutoff_metrics.csv',
                                  'log_item.pkl',
                                  'profile_df.pickle',
                                  'config.yml',
                                  'analysis_llm.pkl',
                                  'df.pickle',
                               

2024-12-17 09:23:47,664 - INFO
# download config.yml from s3 ------------------


upload/sean@gs.co.kr/102df50e/artifacts/model/important_feature_cols.pkl 파일이 './work/inference/input/important_feature_cols.pkl'에 다운로드되었습니다.
upload/sean@gs.co.kr/102df50e/artifacts/model/metric.csv 파일이 './work/inference/input/metric.csv'에 다운로드되었습니다.
upload/sean@gs.co.kr/102df50e/artifacts/model/model.pkl 파일이 './work/inference/input/model.pkl'에 다운로드되었습니다.
config.yml 파일이 './work/inference/input/config.yml'에 다운로드되었습니다.
