In [1]:
import argparse
import os
import subprocess
import sys
import yaml
import pandas as pd
from pathlib import Path

# Use the resolved current working directory as the project root.
PROJECT_ROOT = Path().resolve()
# Put the project root at the front of sys.path so it is searched first on import.
sys.path.insert(0, str(PROJECT_ROOT))
print(f"PROJECT_ROOT: {PROJECT_ROOT}")

PROJECT_ROOT: /opt/ml/code


## 경로 설정

In [2]:
# S3 bucket and the key prefixes used for input data and model artifacts.
S3_BUCKET = "retail-mlops-edu-2026"
S3_DATA_PREFIX = "edu-2w/hjsong/data"
S3_MODEL_PREFIX = "edu-2w/hjsong/model"

# Working directories. Each entry reads its SM_* environment variable and
# falls back to a folder under PROJECT_ROOT when the variable is not set.
# Every value is wrapped in Path so the type is the same in both cases
# (os.environ.get returns a str, while the fallback is already a Path).
sm_dir = {
    "train_path": Path(
        os.environ.get("SM_CHANNEL_TRAIN", PROJECT_ROOT / "data/train")
    ),
    "val_path": Path(
        os.environ.get("SM_CHANNEL_VAL", PROJECT_ROOT / "data/val")
    ),
    "model_path": Path(
        os.environ.get("SM_MODEL_DIR", PROJECT_ROOT / "model")
    ),
    "output_path": Path(
        os.environ.get("SM_OUTPUT_DATA_DIR", PROJECT_ROOT / "output")
    ),
}

## 모델관련 파라미터 설정

In [3]:
# Hold-out split settings: share of rows kept for validation and the split seed.
train_val_split = dict(val_ratio=0.2, random_state=42)

# Model identity metadata.
model_name = "titanic_model"
model_version = "1.0.0"
model_description = "Titanic Model"
model_algo = "lightgbm"

# Keyword arguments for the classifier built in train_model().
hyperparameters = dict(
    objective="binary",
    metric="binary_logloss",
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,
    random_state=42,
    verbose=0,
)

## 모델 실행 함수

### 전처리

In [4]:
import argparse
import os
import pandas as pd
import yaml
import boto3
from io import BytesIO



def load_data(bucket, data_prefix, filename="train.csv"):
    """
    Download a CSV object from S3 and parse it into a DataFrame.

    Args:
        bucket: Name of the S3 bucket to read from.
        data_prefix: Key prefix that holds the file. Leading and trailing
            slashes are ignored; an empty prefix means the bucket root.
        filename: Object name under the prefix. Defaults to "train.csv".

    Returns:
        df: DataFrame parsed from the downloaded CSV.
    """
    # Normalise the prefix so "a/b" and "a/b/" give the same key and an
    # empty prefix does not produce a key that starts with "/".
    prefix = (data_prefix or "").strip("/")
    key = f"{prefix}/{filename}" if prefix else filename

    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket=bucket, Key=key)
    # The whole object body is read into memory before parsing.
    df = pd.read_csv(BytesIO(obj["Body"].read()))

    print(f"üîç Data shape: {df.shape}")
    print(f"üîç Columns: {list(df.columns)}")
    return df

def preprocess_data(df):
    """
    Preprocess the raw data.

    Works on a copy of the input: renames the known raw columns to
    snake_case (with 'Survived' becoming 'target'), fills missing values,
    and integer-encodes every non-numeric column.

    Args:
        df: Raw DataFrame.

    Returns:
        df: Preprocessed DataFrame.
    """
    column_aliases = {
        'PassengerId': 'passenger_id',
        'Survived': 'target',
        'Pclass': 'pclass',
        'Name': 'name',
        'Sex': 'sex',
        'Age': 'age',
        'SibSp': 'sibsp',
        'Parch': 'parch',
        'Ticket': 'ticket',
        'Fare': 'fare',
        'Cabin': 'cabin',
        'Embarked': 'embarked',
    }
    frame = df.copy().rename(columns=column_aliases)

    # Basic missing-value handling plus simple dtype-based preprocessing.
    number_columns = frame.select_dtypes(include="number").columns
    other_columns = frame.select_dtypes(exclude="number").columns

    # Numeric columns: missing values become 0.
    for name in number_columns:
        values = frame[name]
        if values.isnull().any():
            values = values.fillna(0)
        frame[name] = pd.to_numeric(values, errors="coerce").fillna(0)

    # Non-numeric columns: fill gaps with the most frequent value (or "" when
    # the column has no values at all), then encode as integer codes.
    for name in other_columns:
        values = frame[name]
        if values.isnull().any():
            replacement = "" if values.dropna().empty else values.mode()[0]
            values = values.fillna(replacement)
        # Categorical columns are simply encoded as integers.
        frame[name] = pd.factorize(values.astype(str))[0]

    print(f"üîç Features shape: {frame.shape}")
    print(f"üîç Features: {list(frame.columns)}")

    return frame


def save_preprocessed(df, output_dir, filename):
    """
    Write a DataFrame to ``output_dir/filename`` as CSV without the index.

    Args:
        df: DataFrame to write.
        output_dir: Destination directory; created if it does not exist.
        filename: Name of the CSV file inside ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)
    df.to_csv(destination, index=False)
    print(f"üíæ Saved: {destination}")

### 학습

In [5]:
import argparse
import json
import os
import tarfile
import yaml
import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score


def train_model(X_train, y_train, hyperparameters):
    """
    Train the model.

    Args:
        X_train: Training feature data.
        y_train: Training target data.
        hyperparameters: Mapping of keyword arguments forwarded to
            ``lgb.LGBMClassifier``.

    Returns:
        model: The classifier after ``fit`` has been called on it.
    """
    print("üéØ Training LightGBM tree model...")

    classifier = lgb.LGBMClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)

    print("‚úÖ Model training completed!")

    return classifier



def save_model(model, model_dir, model_name="model.joblib"):
    """
    Serialize the model with joblib and pack the model directory into a tarball.

    Args:
        model: Object to serialize.
        model_dir: Directory for the serialized model and the archive;
            created if it does not exist.
        model_name: File name of the serialized model inside ``model_dir``.

    Returns:
        Path of the created ``model.tar.gz`` inside ``model_dir``.
    """
    os.makedirs(model_dir, exist_ok=True)

    serialized_path = os.path.join(model_dir, model_name)
    joblib.dump(model, serialized_path)
    print(f"‚úÖ Model saved: {serialized_path}")

    # The whole directory is archived, so any other file already present in
    # model_dir is packed as well.
    # NOTE(review): the archive is written inside the directory it archives;
    # confirm it never ends up containing itself.
    archive_path = os.path.join(model_dir, "model.tar.gz")
    with tarfile.open(archive_path, "w:gz") as archive:
        archive.add(model_dir, arcname=".")
    print(f"‚úÖ Model artifact created: {archive_path}")

    return archive_path


def upload_to_s3(artifact_path, bucket, model_prefix):
    """
    Upload a local artifact file to S3 under the given prefix.

    The object key is ``<model_prefix>/<basename of artifact_path>``.

    Args:
        artifact_path: Local path of the file to upload.
        bucket: Name of the destination S3 bucket.
        model_prefix: Key prefix for the uploaded object. Leading and trailing
            slashes are ignored; an empty prefix uploads to the bucket root.

    Returns:
        The ``s3://bucket/key`` URI of the uploaded object.

    Raises:
        RuntimeError: If boto3 cannot be imported.
    """
    try:
        import boto3  # type: ignore
    except ImportError as exc:
        raise RuntimeError("boto3 is required for S3 upload") from exc

    # Normalise the prefix so "a/b/" does not yield "a/b//file" and an empty
    # prefix does not yield a key that starts with "/".
    prefix = (model_prefix or "").strip("/")
    filename = os.path.basename(artifact_path)
    key = f"{prefix}/{filename}" if prefix else filename

    s3 = boto3.client("s3")
    s3.upload_file(artifact_path, bucket, key)
    uploaded_uri = f"s3://{bucket}/{key}"
    print(f"‚úÖ Model artifact uploaded: {uploaded_uri}")
    return uploaded_uri

### 평가

In [6]:
import argparse
import json
import os
import yaml
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score


def load_validation_df(val_path):
    """
    Read the validation set into a DataFrame.

    Args:
        val_path: Path of a CSV file, or of a directory that contains
            ``validation.csv``.

    Returns:
        DataFrame parsed from the CSV.
    """
    csv_path = (
        os.path.join(val_path, "validation.csv")
        if os.path.isdir(val_path)
        else val_path
    )
    return pd.read_csv(csv_path)


def load_model(model_path, model_name):
    """
    Load a serialized model with joblib.

    Args:
        model_path: Path of the model file, or of a directory that contains it.
        model_name: File name to look for when ``model_path`` is a directory;
            not used otherwise.

    Returns:
        The object returned by ``joblib.load``.
    """
    # NOTE: joblib.load unpickles the file, so only load trusted artifacts.
    target = (
        os.path.join(model_path, model_name)
        if os.path.isdir(model_path)
        else model_path
    )
    return joblib.load(target)


def evaluate_model(model, X_val, y_val):
    """
    Score the model's predictions on the validation set.

    Args:
        model: Object exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: Validation labels compared against the predictions.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    print(f"‚úÖ Validation Accuracy: {accuracy:.4f}")
    return dict(accuracy=accuracy)


def save_metrics(metrics, output_dir, filename="evaluation.json"):
    """
    Write the metrics as indented JSON to ``output_dir/filename``.

    Args:
        metrics: JSON-serializable object to write.
        output_dir: Destination directory; created if it does not exist.
        filename: Name of the JSON file inside ``output_dir``.

    Returns:
        Path of the written file.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)
    with open(destination, "w") as handle:
        json.dump(metrics, handle, indent=2)
    print(f"‚úÖ Metrics saved: {destination}")
    return destination


## 실제 모델 순차적으로 실행

In [7]:
# Data preprocessing step: load the raw CSV from S3, preprocess it, split it
# into train/validation sets and write both sets to disk as CSV.
from sklearn.model_selection import train_test_split

print("üöÄ  Loading data...")
df = load_data(S3_BUCKET, S3_DATA_PREFIX)

print("üöÄ  Preprocessing data...")
df_preprocessed = preprocess_data(df)

# The split below stratifies on 'target', so fail early if it is missing.
if "target" not in df_preprocessed.columns:
    raise ValueError("Column 'target' not found after preprocessing")

val_ratio = train_val_split["val_ratio"]
random_state = train_val_split["random_state"]

# Stratify on the target so both splits keep the same class proportions.
train_df_preprocessed, val_df_preprocessed = train_test_split(
    df_preprocessed,
    test_size=val_ratio,
    random_state=random_state,
    stratify=df_preprocessed["target"],
)

print("üíæ Saving preprocessed data...")
save_preprocessed(train_df_preprocessed, sm_dir['train_path'], "train.csv")
save_preprocessed(val_df_preprocessed, sm_dir['val_path'], "validation.csv")

üöÄ  Loading data...
üîç Data shape: (891, 12)
üîç Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
üöÄ  Preprocessing data...


üîç Features shape: (891, 12)
üîç Features: ['passenger_id', 'target', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
üíæ Saving preprocessed data...
üíæ Saved: /opt/ml/code/data/train/train.csv
üíæ Saved: /opt/ml/code/data/val/validation.csv


In [8]:
# Training step: read the training CSV, fit the model, save it together with
# the tar.gz artifact, and upload the artifact to S3.

print("üöÄ Train step started.")
train_path = sm_dir['train_path']
# A directory path is resolved to the train.csv file inside it.
if os.path.isdir(train_path):
    train_path = os.path.join(train_path, "train.csv")

train_df = pd.read_csv(train_path)
# Use the 'survived' column as the label when present, otherwise 'target'.
target_col = "survived" if "survived" in train_df.columns else "target"
# Every column other than the label is used as a feature.
X_train = train_df.drop(target_col, axis=1)
y_train = train_df[target_col]

model = train_model(X_train, y_train, hyperparameters)

print("üöÄ save model/artifact in model path.")
model_dir = sm_dir['model_path']
artifact_path = save_model(model, model_dir, model_name=f"{model_name}.joblib")

upload_to_s3(artifact_path, S3_BUCKET , S3_MODEL_PREFIX)

üöÄ Train step started.
üéØ Training LightGBM tree model...
‚úÖ Model training completed!
üöÄ save model/artifact in model path.
‚úÖ Model saved: /opt/ml/model/titanic_model.joblib
‚úÖ Model artifact created: /opt/ml/model/model.tar.gz


‚úÖ Model artifact uploaded: s3://retail-mlops-edu-2026/edu-2w/hjsong/model/model.tar.gz


's3://retail-mlops-edu-2026/edu-2w/hjsong/model/model.tar.gz'

In [9]:
# Evaluation step: reload the saved model, score it on the validation set and
# write the metrics to a JSON file.

print("Evaluate step started.")


val_path = sm_dir['val_path']
model_path = sm_dir['model_path']
output_path = sm_dir['output_path']

val_df = load_validation_df(val_path)
# Use the 'survived' column as the label when present, otherwise 'target'.
target_col = "survived" if "survived" in val_df.columns else "target"
X_val = val_df.drop(target_col, axis=1)
y_val = val_df[target_col]

# Load the model from disk rather than reusing the in-memory one.
model = load_model(model_path, model_name=f"{model_name}.joblib")
metrics = evaluate_model(model, X_val, y_val)

# The JSON records the model location alongside the metrics.
save_metrics(
    {
        "model_path": str(model_path),
        "metrics": metrics,
    },
    output_path,
    filename="evaluation.json",
)

Evaluate step started.


‚úÖ Validation Accuracy: 0.7765
‚úÖ Metrics saved: /opt/ml/output/data/evaluation.json


'/opt/ml/output/data/evaluation.json'