In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import yaml
import os
import dvc.api

# Basic setup for the DVC experiment
def setup_experiment(data_version):
    """Write the configuration for one data version to ``params.yaml``.

    The file is created in the current working directory and overwritten on
    every call. It contains a ``data`` section (the version label and the CSV
    path ``data/dataset_<version>.csv``) and a ``model`` section with fixed
    random-forest hyperparameters.

    Args:
        data_version: Version label interpolated into the dataset path.
    """
    data_section = dict(
        version=data_version,
        path=f'data/dataset_{data_version}.csv',
    )
    model_section = dict(n_estimators=100, max_depth=10, random_state=42)
    config = dict(data=data_section, model=model_section)

    # Create (or overwrite) the params.yaml file
    with open('params.yaml', 'w') as params_file:
        yaml.dump(config, params_file)

# Data preparation
def prepare_data(version):
    """Load one dataset version through DVC and split it into train/test sets.

    The CSV at ``data/dataset_<version>.csv`` is opened from the repository in
    the current directory at revision ``data-v<version>``. The ``target``
    column is used as the label and every other column as a feature.

    Args:
        version: Version label used in both the file path and the revision.

    Returns:
        The ``X_train, X_test, y_train, y_test`` split produced by
        ``train_test_split`` (``test_size=0.2``, ``random_state=42``), or
        ``None`` when the data cannot be opened or parsed.

    Raises:
        KeyError: If the loaded data has no ``target`` column.
    """
    # Fetch the requested data version via DVC
    data_path = f'data/dataset_{version}.csv'

    try:
        with dvc.api.open(
            data_path,
            repo='.',
            mode='r',
            rev=f'data-v{version}'
        ) as f:
            df = pd.read_csv(f)
    except Exception as exc:
        # Loading is best-effort: any DVC, I/O or parsing failure makes the
        # caller skip this version. Catching Exception (not a bare except)
        # lets KeyboardInterrupt/SystemExit propagate, and the cause is
        # reported instead of being silently discarded.
        print(f"Error: Cannot access version {version} data")
        print(f"Reason: {type(exc).__name__}: {exc}")
        return None

    X = df.drop('target', axis=1)
    y = df['target']

    return train_test_split(X, y, test_size=0.2, random_state=42)

# Model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test, params):
    """Fit a random-forest classifier and score it on the held-out split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        params: Mapping with a ``model`` entry providing ``n_estimators``,
            ``max_depth`` and ``random_state``.

    Returns:
        A ``(model, accuracy)`` tuple: the fitted classifier and its accuracy
        on ``(X_test, y_test)``.
    """
    hyper = params['model']
    classifier = RandomForestClassifier(
        n_estimators=hyper['n_estimators'],
        max_depth=hyper['max_depth'],
        random_state=hyper['random_state'],
    )
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    score = accuracy_score(y_test, predictions)
    return classifier, score

# Save experiment results
def save_results(version, accuracy):
    """Persist the accuracy of one experiment as a YAML metrics file.

    Writes ``results/metrics_<version>.yaml`` (creating the ``results``
    directory if needed) containing a single ``accuracy`` key.

    Args:
        version: Version label used in the output file name.
        accuracy: Accuracy score; any value convertible with ``float()``.
    """
    results_dir = 'results'
    os.makedirs(results_dir, exist_ok=True)

    # The score may arrive as a NumPy scalar. yaml.dump would serialize that
    # as a python/object-tagged blob that yaml.safe_load cannot read back, so
    # coerce it to a built-in float first.
    metrics = {'accuracy': float(accuracy)}

    with open(f'{results_dir}/metrics_{version}.yaml', 'w') as f:
        yaml.dump(metrics, f)

# Main experiment driver
def run_experiment(versions=None):
    """Run the train/evaluate pipeline once per data version and compare them.

    For each version this writes ``params.yaml``, loads the data, reads the
    parameters back from ``params.yaml``, trains and scores a model, and saves
    the accuracy. Versions whose data cannot be loaded are skipped. A summary
    of all collected accuracies is printed at the end.

    Args:
        versions: Iterable of data version labels to evaluate. Defaults to
            ``['1', '2']`` when omitted.

    Returns:
        Dict mapping each successfully evaluated version to its accuracy.
    """
    if versions is None:
        # Default: compare the two data versions
        versions = ['1', '2']
    results = {}

    for version in versions:
        print(f"\nRunning experiment with data version {version}")

        # Experiment configuration
        setup_experiment(version)

        # Data preparation; None means the version could not be loaded
        data = prepare_data(version)
        if data is None:
            continue
        X_train, X_test, y_train, y_test = data

        # Load the parameters back from the file that was just written
        with open('params.yaml', 'r') as f:
            params = yaml.safe_load(f)

        # Train and evaluate; the fitted model itself is not used afterwards
        _, accuracy = train_and_evaluate(X_train, X_test, y_train, y_test, params)

        # Persist and remember the result
        save_results(version, accuracy)
        results[version] = accuracy

        print(f"Data version {version} - Accuracy: {accuracy:.4f}")

    # Side-by-side comparison of all evaluated versions
    print("\nExperiment Results Comparison:")
    for version, accuracy in results.items():
        print(f"Data version {version}: {accuracy:.4f}")

    return results

# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_experiment()