In [1]:
import argparse
import os
import subprocess
import sys
import yaml
import pandas as pd
from pathlib import Path

In [2]:
# Use the notebook's current working directory, resolved to an absolute path,
# as the project root.
PROJECT_ROOT = Path().resolve()

# Keep exactly one sys.path entry for the project root, at the front.
# A bare sys.path.insert(0, ...) would add another duplicate entry every time
# this cell is re-run in the same kernel.
_project_root_entry = str(PROJECT_ROOT)
sys.path[:] = [_project_root_entry] + [
    entry for entry in sys.path if entry != _project_root_entry
]
print(f"PROJECT_ROOT: {PROJECT_ROOT}")

PROJECT_ROOT: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker


## 경로 설정 (Path configuration)

In [3]:
# S3 bucket and key prefixes used for the input data and the model artifact.
S3_BUCKET = "retail-mlops-edu-2026-hjsong"
S3_DATA_PREFIX = "edu-202602-staff/titanic/data"
S3_MODEL_PREFIX = "edu-202602-staff/titanic/model"

# Working directories for each pipeline stage. Each entry comes from the
# matching SM_* environment variable when it is set, otherwise from a folder
# under PROJECT_ROOT.
# NOTE(review): the SM_* names look like the ones a SageMaker training
# container exports — confirm.
# Every value is wrapped in Path so the type is the same in both cases;
# os.environ.get alone returns str when the variable is set but the Path
# default when it is not.
sm_dir = {
    "train_path": Path(
        os.environ.get("SM_CHANNEL_TRAIN", PROJECT_ROOT / "data/train")
    ),
    "val_path": Path(
        os.environ.get("SM_CHANNEL_VAL", PROJECT_ROOT / "data/val")
    ),
    "model_path": Path(
        os.environ.get("SM_MODEL_DIR", PROJECT_ROOT / "model")
    ),
    "output_path": Path(
        os.environ.get("SM_OUTPUT_DATA_DIR", PROJECT_ROOT / "output")
    ),
}

## 모델관련 파라미터 설정 (Model parameter configuration)

In [4]:
# Train/validation split settings: validation fraction and the split seed.
train_val_split = dict(val_ratio=0.2, random_state=42)

# Model identity metadata.
model_name = "titanic_model"
model_version = "1.0.0"
model_description = "Titanic Model"
model_algo = "lightgbm"

# Hyperparameters passed to train_model in the training cell.
hyperparameters = dict(
    objective="binary",
    metric="binary_logloss",
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,
    random_state=42,
    verbose=0,
)

In [5]:
# Data preprocessing: load the raw data, preprocess it, split it into train and
# validation sets, and save both splits.
# NOTE(review): these imports live in this cell rather than the top import
# cell; model_pipeline is presumably resolved through the PROJECT_ROOT entry
# added to sys.path earlier — confirm before hoisting them.
from model_pipeline.preprocess import load_data, preprocess_data, save_preprocessed
from sklearn.model_selection import train_test_split

print("üöÄ  Loading data...")
df = load_data(S3_BUCKET, S3_DATA_PREFIX)

print("üöÄ  Preprocessing data...")
df_preprocessed = preprocess_data(df)

# Fail fast: the stratified split below needs the "target" column.
if "target" not in df_preprocessed.columns:
    raise ValueError("Column 'target' not found after preprocessing")

val_ratio, random_state = (
    train_val_split[key] for key in ("val_ratio", "random_state")
)

# Stratify on the label so both splits keep the same class proportions.
train_df_preprocessed, val_df_preprocessed = train_test_split(
    df_preprocessed,
    stratify=df_preprocessed["target"],
    test_size=val_ratio,
    random_state=random_state,
)

print("üíæ Saving preprocessed data...")
for split_df, dir_key, file_name in (
    (train_df_preprocessed, "train_path", "train.csv"),
    (val_df_preprocessed, "val_path", "validation.csv"),
):
    save_preprocessed(split_df, sm_dir[dir_key], file_name)

üöÄ  Loading data...
üîç Data shape: (891, 12)
üîç Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
üöÄ  Preprocessing data...
üîç Features shape: (891, 12)
üîç Features: ['passenger_id', 'target', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
üíæ Saving preprocessed data...
üíæ Saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/data/train/train.csv
üíæ Saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/data/val/validation.csv


In [6]:
# Model training: read the saved training split, fit the model, save it, and
# upload the saved artifact to S3.
from model_pipeline.train import train_model, save_model, upload_to_s3

print("üöÄ Train step started.")

# The configured train path may be a directory; if so, read train.csv inside it.
train_path = sm_dir["train_path"]
if os.path.isdir(train_path):
    train_path = os.path.join(train_path, "train.csv")
train_df = pd.read_csv(train_path)

# Label column: use "survived" when present, otherwise fall back to "target".
if "survived" in train_df.columns:
    target_col = "survived"
else:
    target_col = "target"
y_train = train_df[target_col]
X_train = train_df.drop(columns=[target_col])

model = train_model(X_train, y_train, hyperparameters)

print("üöÄ save model/artifact in model path.")
model_dir = sm_dir["model_path"]
artifact_path = save_model(model, model_dir, model_name=f"{model_name}.joblib")

# Kept as the last expression so its return value is shown as the cell output.
upload_to_s3(artifact_path, S3_BUCKET, S3_MODEL_PREFIX)

üöÄ Train step started.
üéØ Training LightGBM tree model...
‚úÖ Model training completed!
üöÄ save model/artifact in model path.
‚úÖ Model saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/model/titanic_model.joblib
‚úÖ Model artifact created: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/model/model.tar.gz
‚úÖ Model artifact uploaded: s3://retail-mlops-edu-2026-hjsong/edu-202602-staff/titanic/model/model.tar.gz


's3://retail-mlops-edu-2026-hjsong/edu-202602-staff/titanic/model/model.tar.gz'

In [7]:
# Model evaluation: load the validation split and the saved model, compute the
# metrics, and save them as evaluation.json in the output directory.
from model_pipeline.evaluate import load_validation_df, load_model, evaluate_model, save_metrics

print("Evaluate step started.")

val_path, model_path, output_path = (
    sm_dir[key] for key in ("val_path", "model_path", "output_path")
)

val_df = load_validation_df(val_path)

# Label column: use "survived" when present, otherwise fall back to "target".
if "survived" in val_df.columns:
    target_col = "survived"
else:
    target_col = "target"
y_val = val_df[target_col]
X_val = val_df.drop(target_col, axis=1)

model = load_model(model_path, model_name=f"{model_name}.joblib")
metrics = evaluate_model(model, X_val, y_val)

# Record which model location was evaluated alongside the metrics.
evaluation_report = {
    "model_path": str(model_path),
    "metrics": metrics,
}

# Kept as the last expression so its return value is shown as the cell output.
save_metrics(evaluation_report, output_path, filename="evaluation.json")

Evaluate step started.
‚úÖ Validation Accuracy: 0.7765
‚úÖ Metrics saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/output/evaluation.json


'/home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/output/evaluation.json'