In [1]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../artifacts/train.csv')
print("Data 'train.csv' berhasil dimuat.")

Data 'train.csv' berhasil dimuat.


In [None]:
drop_cols = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
df = df.drop(columns=drop_cols, errors='ignore')

In [4]:
# Menangani missing values di kolom kategorikal
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col].fillna('None', inplace=True)

In [5]:
# Menangani missing values di kolom numerik
for col in df.select_dtypes(include=['number']).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

In [6]:
#  Memisahkan Fitur (X) dan Target (y)
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Mengidentifikasi kolom numerik dan kategorikal
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [22]:
print(categorical_features)


['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


In [8]:
# Menggabungkan transformer menggunakan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [9]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Konfigurasi Model Regresi
models= {
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': 150,
            'max_depth': 15,
            'random_state': 42
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': 150,
            'learning_rate': 0.1,
            'max_depth': 5,
            'random_state': 42
        }
    }
}

In [11]:
def evaluate_model_regression(y_true, y_pred):
    """
    Calculates regression evaluation metrics (RMSE, MAE, R^2).
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return {
        'rmse': rmse,
        'mae': mae,
        'r2_score': r2
    }

In [12]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('House Price Prediction')

# Train and evaluate models
for model_name, model_info in models.items():
    with mlflow.start_run(run_name=model_name):
        # Log model parameters
        mlflow.log_params(model_info['params'])
        
        # Train model
        model = model_info['model']
        model.set_params(**model_info['params'])
        model.fit(X_train_processed, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_processed)
        
        # Calculate and log metrics
        metrics = evaluate_model_regression(y_test, y_pred)
        mlflow.log_metrics(metrics)
        
        # Log model
        mlflow.sklearn.log_model(model, 
                                 model_name,
                                 registered_model_name=f"House Price Prediction{model_name}")
        
        print(f"\nModel: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value:.4f}")

2025/08/13 00:49:20 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/08/13 00:49:20 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/08/13 00:49:45 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/08/13 00:49:45 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'House Price Predictionrandom_forest' already exists. Creating a new version of this model...
Created version '4' of model 'House Price Predictionrandom_forest'.



Model: random_forest
rmse: 28680.9981
mae: 17504.3937
r2_score: 0.8928





Model: gradient_boosting
rmse: 25880.5164
mae: 15902.6839
r2_score: 0.9127


Registered model 'House Price Predictiongradient_boosting' already exists. Creating a new version of this model...
Created version '4' of model 'House Price Predictiongradient_boosting'.


In [13]:
# Transition the best model to production
from mlflow.tracking import MlflowClient

def load_production_model(model_name):
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/House Price Prediction_{model_name}/Production"
    )
    return model



client = MlflowClient()

def transition_model_to_production(model_name):
    client = MlflowClient()
    latest_version = client.get_latest_versions(f"House Price Prediction{model_name}", stages=["None"])[0]
    client.transition_model_version_stage(
        name=f"House Price Prediction{model_name}",
        version=latest_version.version,
        stage="Production"
    )

In [14]:
# Example: Transition the best performing model to production
# Note: You should choose the best model based on your evaluation metrics
transition_model_to_production('gradient_boosting')

In [15]:
def get_all_runs():
    experiment = mlflow.get_experiment_by_name('House Price Prediction')
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    return runs

In [16]:
runs_df = get_all_runs()
metrics_comparison = runs_df[['tags.mlflow.runName', 'metrics.rmse', 'metrics.mae', 'metrics.r2_score']]
print("\nModel Performance Comparison:")
print(metrics_comparison)


Model Performance Comparison:
   tags.mlflow.runName  metrics.rmse   metrics.mae  metrics.r2_score
0    gradient_boosting  25880.516413  15902.683917          0.912676
1        random_forest  28680.998095  17504.393709          0.892756
2    gradient_boosting  25880.516413  15902.683917          0.912676
3        random_forest  28680.998095  17504.393709          0.892756
4    gradient_boosting  25880.516413  15902.683917          0.912676
5        random_forest  28680.998095  17504.393709          0.892756
6    gradient_boosting  25880.516413  15902.683917          0.912676
7        random_forest  28680.998095  17504.393709          0.892756
8        random_forest           NaN           NaN               NaN
9        random_forest           NaN           NaN               NaN
10       random_forest           NaN           NaN               NaN
11       random_forest           NaN           NaN               NaN
12       random_forest           NaN           NaN               NaN
13 

In [17]:
# Analyze feature importance for the best model (Random Forest)
best_model = models['gradient_boosting']['model']
trained_gradient_boosting = best_model.fit(X_train_processed, y_train)

ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = numerical_features + list(ohe_feature_names)

# Sort features by importance
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': trained_gradient_boosting.feature_importances_
})

# Display top 10 most important features
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
        feature  importance
0    MSSubClass    0.001535
1   LotFrontage    0.011707
2       LotArea    0.015965
3   OverallQual    0.557619
4   OverallCond    0.008011
5     YearBuilt    0.014199
6  YearRemodAdd    0.006843
7    MasVnrArea    0.001013
8    BsmtFinSF1    0.033361
9    BsmtFinSF2    0.000476
