Connect to MLFlow

In [None]:
import dagshub
import mlflow
from mlflow.models.signature import infer_signature

dagshub.init(repo_owner='lchik22', repo_name='first_assignment', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/lchik22/first_assignment.mlflow')

mlflow.sklearn.autolog(
    log_input_examples=True,
    log_model_signatures=True,
    log_models=True
)

Import Data

In [94]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

df = pd.read_csv('./data/train.csv')

In [2]:
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

Cleaning

In [4]:
class DataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, drop_duplicates=True, numerical_fillna_strategy='median'):
        self.drop_duplicates = drop_duplicates
        self.numerical_fillna_strategy = numerical_fillna_strategy
        
    def fit(self, X, y=None):
        self.num_features = X.select_dtypes(include='number').columns
        if self.numerical_fillna_strategy == 'median':
            self.fill_values_ = X[self.num_features].median()
        elif self.numerical_fillna_strategy == 'mean':
            self.fill_values_ = X[self.num_features].mean()
        return self
    
    def transform(self, X):
        X = X.copy()

        if self.numerical_fillna_strategy in ['median', 'mean']:
            X[self.num_features] = X[self.num_features].fillna(self.fill_values_)
        
        if self.drop_duplicates:
            X = X.drop_duplicates()
        
        return X

Feature Engineering

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features=None):
        self.cat_features = cat_features
        self.imputer_ = None
        self.encoder_ = None
        
    def fit(self, X, y=None):
        if self.cat_features is None:
            self.cat_features = X.select_dtypes(include=['object', 'category']).columns
            
        self.imputer_ = SimpleImputer(strategy='most_frequent')
        self.imputer_.fit(X[self.cat_features])
        
        self.encoder_ = OneHotEncoder(handle_unknown='ignore')
        self.encoder_.fit(self.imputer_.transform(X[self.cat_features]))
        
        return self
    
    def transform(self, X):
        X_cat = self.imputer_.transform(X[self.cat_features])
        
        X_encoded = self.encoder_.transform(X_cat)
        
        num_features = [col for col in X.columns if col not in self.cat_features]
        X_num = X[num_features].values if len(num_features) > 0 else None
        
        if X_num is not None:
            return np.hstack([X_encoded.toarray(), X_num])
        return X_encoded.toarray()

Feature Selection

In [6]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop = None

    def fit(self, X, y=None):
        corr_matrix = pd.DataFrame(X).corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        return pd.DataFrame(X).drop(columns=self.to_drop, errors="ignore")

Training

In [92]:
trainer = Pipeline([
        ('cleaning', DataCleaner()),
        ('feature_engineering', FeatureEngineer()),
        ('corr_filter',CorrelationFilter(threshold=0.75)),
        ('rfe', RFE(estimator=LinearRegression())),
        ('model', LinearRegression())
    ])

In [100]:
param_grid = {
    'corr_filter__threshold': [0.7, 0.8, 0.9],  
    'rfe__n_features_to_select': [
        0.1, 0.3, 0.5, 0.7,
        10, 20, 30      
    ]
}

grid_search = GridSearchCV(
    estimator=trainer,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',  
    refit=True, 
    n_jobs=-1,   
    verbose=0
)
grid_search.fit(X, y)

In [98]:
best_score = -grid_search.best_score_
print(f"Best mean RMSE from CV: {np.sqrt(best_score):.4f}")

Best mean RMSE from CV: 34916.6424


In [97]:
cv_results = grid_search.cv_results_

# Mean and std for each run (over the folds)
for mean, std, params in zip(
    cv_results["mean_test_score"], 
    cv_results["std_test_score"], 
    cv_results["params"]
):
    print(f"RMSE: {np.sqrt(-mean):.4f} ± {np.sqrt(std):.4f} for {params}")


RMSE: 46360.1843 ± 11801.9830 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 0.1}
RMSE: 36607.4627 ± 9573.7465 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 0.3}
RMSE: 35782.6308 ± 8131.8033 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 0.5}
RMSE: 34916.6424 ± 9587.4917 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 0.7}
RMSE: 56644.6987 ± 23835.6569 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 10}
RMSE: 47095.4401 ± 12434.3542 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 20}
RMSE: 41885.2419 ± 14696.3730 for {'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 30}
RMSE: 47087.5304 ± 10099.3997 for {'corr_filter__threshold': 0.8, 'rfe__n_features_to_select': 0.1}
RMSE: 37065.9048 ± 10002.0337 for {'corr_filter__threshold': 0.8, 'rfe__n_features_to_select': 0.3}
RMSE: 36135.4355 ± 12069.5371 for {'corr_filter__threshold': 0.8, 'rfe__n_features_to_select': 0.5}
RMSE: 

In [96]:
import pandas as pd

results_df = pd.DataFrame(grid_search.cv_results_)
# Convert negative MSE to positive RMSE
results_df["mean_rmse"] = (-results_df["mean_test_score"]) ** 0.5

# See top 5 results
results_df[["params", "mean_rmse", "std_test_score"]].sort_values("mean_rmse").head()

Unnamed: 0,params,mean_rmse,std_test_score
3,"{'corr_filter__threshold': 0.7, 'rfe__n_featur...",34916.642413,91920000.0
17,"{'corr_filter__threshold': 0.9, 'rfe__n_featur...",35078.605888,207651400.0
10,"{'corr_filter__threshold': 0.8, 'rfe__n_featur...",35474.571701,154386900.0
16,"{'corr_filter__threshold': 0.9, 'rfe__n_featur...",35570.604788,179782100.0
2,"{'corr_filter__threshold': 0.7, 'rfe__n_featur...",35782.630764,66126230.0


In [None]:
mlflow.set_experiment("House Price Prediction | Linear Regression")

with mlflow.start_run(run_name="LinearRegression_Newbie") as run:
    trainer.fit(X_train, y_train)
    
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)
    
    y_pred = trainer.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mlflow.log_metric("rmse", rmse)
    
    signature = infer_signature(X_train, y_pred)
    mlflow.sklearn.log_model(
        sk_model=trainer,
        artifact_path="house_price_model_linear_regression",
        signature=signature,
        registered_model_name="HousePricePredictor_LinearRegression"
    )

Upload Model

In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

client.transition_model_version_stage(
    name="HousePricePredictor_LinearRegression",
    version=1,
    stage="Staging"
)