Connect to MLFlow

In [1]:
import dagshub
import mlflow
from mlflow.models.signature import infer_signature

dagshub.init(repo_owner='lchik22', repo_name='first_assignment', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/lchik22/first_assignment.mlflow')

mlflow.sklearn.autolog(
    log_input_examples=True,
    log_model_signatures=True,
    log_models=True
)

Import Data

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score

pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

df = pd.read_csv('./data/train.csv')

In [3]:
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

Cleaning

In [4]:
class DataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8, drop_duplicates=True, numerical_fillna_strategy='median'):
        self.drop_duplicates = drop_duplicates
        self.numerical_fillna_strategy = numerical_fillna_strategy
        self.threshold = threshold
        
    def fit(self, X, y=None):
        self.removed_features = X.isna().mean()
        self.removed_features = self.removed_features[self.removed_features > self.threshold].index.tolist()
        self.num_features = [col for col in X.select_dtypes(include='number').columns if col not in self.removed_features]
        if self.numerical_fillna_strategy == 'median':
            self.fill_values_ = X[self.num_features].median()
        elif self.numerical_fillna_strategy == 'mean':
            self.fill_values_ = X[self.num_features].mean()
        return self
    
    def transform(self, X):
        X = X.copy()

        X = X.drop(columns=self.removed_features)

        if self.numerical_fillna_strategy in ['median', 'mean']:
            X[self.num_features] = X[self.num_features].fillna(self.fill_values_)
        
        if self.drop_duplicates:
            X = X.drop_duplicates()
        
        return X

Feature Engineering

In [5]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from scipy import sparse

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features=None, scale_numeric=True, log_transform_skewed=True):
        self.cat_features = cat_features
        self.scale_numeric = scale_numeric
        self.log_transform_skewed = log_transform_skewed

    def fit(self, X, y=None):
        X = X.copy()

        if self.cat_features is None:
            self.cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        self.num_features_ = [col for col in X.columns if col not in self.cat_features]

        self.cat_imputer_ = SimpleImputer(strategy='most_frequent')
        self.cat_imputer_.fit(X[self.cat_features])
        
        if self.log_transform_skewed:
            skewness = X[self.num_features_].skew().abs()
            self.skewed_cols_ = skewness[skewness > 1].index.tolist()
        else:
            self.skewed_cols_ = []

        if self.scale_numeric:
            X_num = X[self.num_features_].copy()
            if self.skewed_cols_:
                X_num[self.skewed_cols_] = np.log1p(X_num[self.skewed_cols_])
            self.scaler_ = StandardScaler()
            self.scaler_.fit(X_num)

        X_cat = self.cat_imputer_.transform(X[self.cat_features])
        self.encoder_ = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
        self.encoder_.fit(X_cat)

        self.encoded_cat_features_ = self.encoder_.get_feature_names_out(self.cat_features)

        return self

    def transform(self, X):
        X = X.copy()
        
        X_cat = self.cat_imputer_.transform(X[self.cat_features])
        X_cat_encoded = self.encoder_.transform(X_cat)
        
        X_num = X[self.num_features_].copy()
        
        if self.skewed_cols_:
            for col in self.skewed_cols_:
                if col in X_num.columns:
                    X_num[col] = np.log1p(X_num[col])

        if self.scale_numeric:
            X_num_scaled = self.scaler_.transform(X_num)
        else:
            X_num_scaled = X_num.values
        
        X_transformed = sparse.hstack([X_cat_encoded, sparse.csr_matrix(X_num_scaled)]).tocsr()

        all_feature_names = list(self.encoded_cat_features_) + list(self.num_features_)
        return pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=all_feature_names)

Feature Selection

In [6]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop = None

    def fit(self, X, y=None):
        corr_matrix = pd.DataFrame(X).corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return pd.DataFrame(X).drop(columns=self.to_drop, errors="ignore")

Training

In [7]:
trainer = Pipeline([
        ('cleaning', DataCleaner()),
        ('feature_engineering', FeatureEngineer()),
        ('corr_filter',CorrelationFilter(threshold=0.75)),
        ('rfe', RFE(estimator=LinearRegression())),
        ('model', LinearRegression())
    ])

In [8]:
param_grid = {
    'cleaning__threshold': [0.05, 0.4, 0.8],
    'cleaning__numerical_fillna_strategy': ['mean', 'median'],
    'corr_filter__threshold': [0.7, 0.8, 0.9],  
    'rfe__n_features_to_select': [0.3, 0.7, 10]
}

In [9]:
mlflow.set_experiment("House Price Prediction | Linear Regression")

with mlflow.start_run(run_name="LinearRegression_Pro") as run:
    mlflow.log_param("cv_folds", 5)
    mlflow.log_param("grid_search_strategy", "exhaustive")
    
    grid_search = GridSearchCV(
        estimator=trainer,
        param_grid=param_grid,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='neg_mean_squared_error',  
        refit=True, 
        n_jobs=-1,   
        verbose=2,
        return_train_score=True
    )
    
    grid_search.fit(X, y)
    
    best_params = grid_search.best_params_
    for param, value in best_params.items():
        mlflow.log_param(f"best_{param}", value)

    cv_neg_mse = grid_search.best_score_
    cv_rmse = np.sqrt(-cv_neg_mse)
    
    best_model = grid_search.best_estimator_
    cv_mae = -np.mean(cross_val_score(best_model, X, y, 
                      cv=5, scoring='neg_mean_absolute_error'))
    cv_r2 = np.mean(cross_val_score(best_model, X, y, 
                    cv=5, scoring='r2'))
    
    mlflow.log_metric("cv_rmse", cv_rmse)
    mlflow.log_metric("cv_mae", cv_mae)
    mlflow.log_metric("cv_r2", cv_r2)
    
    signature = infer_signature(X, y)
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="house_price_model_linear_regression",
        signature=signature,
        registered_model_name="HousePricePredictor_LinearRegression"
    )
    
    results = pd.DataFrame(grid_search.cv_results_)
    
    results.to_csv("grid_search_results.csv", index=False)
    mlflow.log_artifact("grid_search_results.csv")
    
    print(f"Best parameters: {best_params}")
    print(f"RMSE: {cv_rmse:.4f}")
    print(f"MAE: {cv_mae:.4f}")
    print(f"R²: {cv_r2:.4f}")



Fitting 5 folds for each of 54 candidates, totalling 270 fits


2025/04/10 16:35:47 INFO mlflow.sklearn.utils: Logging the 5 best runs, 49 runs will be omitted.


🏃 View run treasured-deer-350 at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1/runs/f8085fbfa83543f7b623960e120ad9ba
🧪 View experiment at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1
🏃 View run hilarious-shrimp-441 at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1/runs/c50d0c25ad51444183cb28200d6ba9c8
🧪 View experiment at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1
🏃 View run bustling-frog-299 at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1/runs/6703111fefa344bc936c8823b1ca18eb
🧪 View experiment at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1
🏃 View run chill-sponge-133 at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1/runs/44b66813fc9c4f7790021b1ff40da67a
🧪 View experiment at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1
🏃 View run unequaled-elk-131 at: https://dagshub.com/lchik22/first_assignment.mlf

Registered model 'HousePricePredictor_LinearRegression' already exists. Creating a new version of this model...
2025/04/10 16:36:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: HousePricePredictor_LinearRegression, version 6
Created version '6' of model 'HousePricePredictor_LinearRegression'.


Best parameters: {'cleaning__numerical_fillna_strategy': 'mean', 'cleaning__threshold': 0.05, 'corr_filter__threshold': 0.7, 'rfe__n_features_to_select': 0.7}
RMSE: 33854.6252
MAE: 19357.6380
R²: 0.8323
🏃 View run LinearRegression_Pro at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1/runs/55a0999668a84af494c7a7b5b4aa0cbd
🧪 View experiment at: https://dagshub.com/lchik22/first_assignment.mlflow/#/experiments/1
