In [7]:
import argparse
import os
import subprocess
import sys
import yaml
import pandas as pd
from pathlib import Path
import boto3
from io import BytesIO
import json
import tarfile
import joblib
import lightgbm as lgb

PROJECT_ROOT = Path().resolve()
sys.path.insert(0, str(PROJECT_ROOT))
print(f"PROJECT_ROOT: {PROJECT_ROOT}")

PROJECT_ROOT: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker


## Í≤ΩÎ°ú ÏÑ§Ï†ï

In [8]:
S3_BUCKET = "retail-mlops-edu-2026"
S3_DATA_PREFIX = "edu-2w/hjsong/input"
S3_MODEL_PREFIX = "edu-2w/hjsong/model"
S3_TRAIN_PREFIX = "edu-2w/hjsong/data/train"
S3_VAL_PREFIX = "edu-2w/hjsong/data/val"

sm_dir = {
    "train_path": os.environ.get(
        'SM_CHANNEL_TRAIN', PROJECT_ROOT / "data/train"
    ),
    "val_path": os.environ.get(
        'SM_CHANNEL_VAL', PROJECT_ROOT / "data/val"
    ),
    "model_path": os.environ.get(
        'SM_MODEL_DIR', PROJECT_ROOT / "model"
    ),
    "output_path": os.environ.get(
        'SM_OUTPUT_DATA_DIR', PROJECT_ROOT / "output"
    ),
}

## Î™®Îç∏Í¥ÄÎ†® ÌååÎùºÎØ∏ÌÑ∞ ÏÑ§Ï†ï

In [9]:
train_val_split={
  "val_ratio": 0.2,
  "random_state": 42
}

model_name ="titanic_model"
model_version="1.0.0"
model_description="Titanic Model"
model_algo="lightgbm"
hyperparameters={
    "objective": "binary",
    "metric": "binary_logloss",
    "num_leaves": 31,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,
    "verbose": 0,
}

## Î™®Îç∏ Ïã§Ìñâ

### Ï†ÑÏ≤òÎ¶¨

In [11]:
# Îç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨
from sklearn.model_selection import train_test_split

print("üöÄ  Loading data...")
key = f"{S3_DATA_PREFIX}/train.csv"
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=S3_BUCKET, Key=key)
df = pd.read_csv(BytesIO(obj["Body"].read()))
print(f"üîç Data shape: {df.shape}")
print(f"üîç Columns: {list(df.columns)}")

print("üöÄ  Preprocessing data...")
df_preprocessed = df.copy()
df_preprocessed = df_preprocessed.rename(columns={
    'PassengerId': 'passenger_id',
    'Survived': 'target',
    'Pclass': 'pclass',
    'Name': 'name',
    'Sex': 'sex',
    'Age': 'age',
})


# Í∏∞Î≥∏ Í≤∞Ï∏°Ïπò Ï≤òÎ¶¨ + ÌÉÄÏûÖ Í∏∞Ï§Ä Îã®Ïàú Ï†ÑÏ≤òÎ¶¨
numeric_cols = df_preprocessed.select_dtypes(include="number").columns
object_cols = df_preprocessed.select_dtypes(exclude="number").columns

for col in numeric_cols:
    if df_preprocessed[col].isnull().any():
        df_preprocessed[col] = df_preprocessed[col].fillna(0)
    df_preprocessed[col] = pd.to_numeric(df_preprocessed[col], errors="coerce").fillna(0)

for col in object_cols:
    if df_preprocessed[col].isnull().any():
        if df_preprocessed[col].dropna().empty:
            df_preprocessed[col] = df_preprocessed[col].fillna("")
        else:
            df_preprocessed[col] = df_preprocessed[col].fillna(df[col].mode()[0])
    # Î≤îÏ£ºÌòï Ïª¨ÎüºÏùÄ Í∞ÑÎã®Ìûà Ïà´Ïûê Ïù∏ÏΩîÎî©
    df_preprocessed[col] = df_preprocessed[col].astype(str)
    df_preprocessed[col] = pd.factorize(df_preprocessed[col])[0]

print(f"üîç Features shape: {df_preprocessed.shape}")
print(f"üîç Features: {list(df_preprocessed.columns)}")

if "target" not in df_preprocessed.columns:
    raise ValueError("Column 'target' not found after preprocessing")

val_ratio = train_val_split["val_ratio"]
random_state = train_val_split["random_state"]

train_df_preprocessed, val_df_preprocessed = train_test_split(
    df_preprocessed,
    test_size=val_ratio,
    random_state=random_state,
    stratify=df_preprocessed["target"],
)

print("üíæ Saving preprocessed data...")
def save_upload_to_s3(output_dir, filename, s3_prefix,df=None):
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    if df is not None:
        df.to_csv(output_path, index=False)
        print(f"üíæ Saved: {output_path}")
    
    # S3 Ï†ÄÏû•
    s3_key = f"{s3_prefix}/{filename}"
    s3 = boto3.client("s3")
    s3.upload_file(output_path, S3_BUCKET, s3_key)
    print(f"üíæ Saved S3: s3://{S3_BUCKET}/{s3_key}")
save_upload_to_s3(sm_dir['train_path'], "train.csv",S3_TRAIN_PREFIX, train_df_preprocessed)
save_upload_to_s3(sm_dir['val_path'], "validation.csv",S3_VAL_PREFIX, val_df_preprocessed)

üöÄ  Loading data...
üîç Data shape: (891, 12)
üîç Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
üöÄ  Preprocessing data...
üîç Features shape: (891, 12)
üîç Features: ['passenger_id', 'target', 'pclass', 'name', 'sex', 'age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
üíæ Saving preprocessed data...
üíæ Saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/data/train/train.csv
üíæ Saved S3: s3://retail-mlops-edu-2026/edu-2w/hjsong/data/train/train.csv
üíæ Saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/data/val/validation.csv
üíæ Saved S3: s3://retail-mlops-edu-2026/edu-2w/hjsong/data/val/validation.csv


### ÌïôÏäµ

In [14]:
# Îç∞Ïù¥ÌÑ∞ ÌïôÏäµ

print("üöÄ Train step started.")
train_path = sm_dir['train_path']
if os.path.isdir(train_path):
    train_path = os.path.join(train_path, "train.csv")

train_df = pd.read_csv(train_path)
target_col = "survived" if "survived" in train_df.columns else "target"
X_train = train_df.drop(target_col, axis=1)
y_train = train_df[target_col]

print("üéØ Training LightGBM tree model...")
model = lgb.LGBMClassifier(**hyperparameters)
model.fit(X_train, y_train)

print("‚úÖ Model training completed!")

print("üöÄ save model/artifact in model path.")
model_dir = sm_dir['model_path']
model_name=f"{model_name}.joblib"
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, model_name)
joblib.dump(model, model_path)
print(f"‚úÖ Model saved: {model_path}")

artifact_path = os.path.join(model_dir, "model.tar.gz")
with tarfile.open(artifact_path, "w:gz") as tar:
    tar.add(model_dir, arcname=".")
print(f"‚úÖ Model artifact created: {artifact_path}")


key = S3_MODEL_PREFIX + "/" + os.path.basename(artifact_path)
s3 = boto3.client("s3")
s3.upload_file(artifact_path, S3_BUCKET, key)
uploaded_uri = f"s3://{S3_BUCKET}/{key}"
print(f"‚úÖ Model artifact uploaded: {uploaded_uri}")

üöÄ Train step started.
üéØ Training LightGBM tree model...
‚úÖ Model training completed!
üöÄ save model/artifact in model path.
‚úÖ Model saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/model/titanic_model.joblib.joblib
‚úÖ Model artifact created: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/model/model.tar.gz
‚úÖ Model artifact uploaded: s3://retail-mlops-edu-2026/edu-2w/hjsong/model/model.tar.gz


### ÌèâÍ∞Ä

In [16]:
# Îç∞Ïù¥ÌÑ∞ ÌèâÍ∞Ä
from sklearn.metrics import accuracy_score

print("Evaluate step started.")


val_path = sm_dir['val_path']
model_path = sm_dir['model_path']
output_path = sm_dir['output_path']

if os.path.isdir(val_path):
    val_path = os.path.join(val_path, "validation.csv")
val_df = pd.read_csv(val_path)

target_col = "survived" if "survived" in val_df.columns else "target"
X_val = val_df.drop(target_col, axis=1)
y_val = val_df[target_col]

if os.path.isdir(model_path):
    model_path = os.path.join(model_path, model_name)
model = joblib.load(model_path)


preds = model.predict(X_val)
acc = accuracy_score(y_val, preds)
metrics = {"accuracy": acc}
print(f"‚úÖ Validation Accuracy: {acc:.4f}")


metrics = {
        "model_path": str(model_path),
        "metrics": metrics,
    }
os.makedirs(output_path, exist_ok=True)
metrics_path = os.path.join(output_path, "evaluation.json")
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=2)
print(f"‚úÖ Metrics saved: {metrics_path}")

Evaluate step started.
‚úÖ Validation Accuracy: 0.7765
‚úÖ Metrics saved: /home/ec2-user/SageMaker/gs-ds-env/lightgbm311/sm_docker/output/evaluation.json
