In [65]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [66]:
train_df = pd.read_csv("../data/train.csv")

In [67]:
df = train_df.copy()

In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [69]:
train_df["MiscFeature"].unique()

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [70]:
train_df["MasVnrArea"].describe()

count    1452.000000
mean      103.685262
std       181.066207
min         0.000000
25%         0.000000
50%         0.000000
75%       166.000000
max      1600.000000
Name: MasVnrArea, dtype: float64

In [71]:
none_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
             'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
             'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

df[none_cols] = df[none_cols].fillna('None')

zero_cols = ['MasVnrArea', 'GarageYrBlt','LotFrontage']
df[zero_cols] = df[zero_cols].fillna(0)

# Electrical - fill with mode
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

# Confirm no missing values
print(df.isnull().sum().sort_values(ascending=False))


Id             0
CentralAir     0
GarageYrBlt    0
GarageType     0
FireplaceQu    0
              ..
MasVnrArea     0
MasVnrType     0
Exterior2nd    0
Exterior1st    0
SalePrice      0
Length: 81, dtype: int64


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [73]:
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']


In [74]:
df['TotalBath'] = (df['BsmtFullBath'] + df['FullBath'] + 
                   0.5 * (df['BsmtHalfBath'] + df['HalfBath']))


In [75]:
df['HouseAge'] = df['YrSold'] - df['YearBuilt']
df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']


In [76]:
df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']
df['GarageAge'] = df['GarageAge'].replace(0, df['GarageAge'].median())  # Replace 0 if needed


In [77]:
df['IsRemodeled'] = (df['YearBuilt'] != df['YearRemodAdd']).astype(int)


In [78]:
df['TotalPorchSF'] = (df['OpenPorchSF'] + df['EnclosedPorch'] +
                      df['3SsnPorch'] + df['ScreenPorch'])


In [79]:
df['OverallScore'] = df['OverallQual'] * df['OverallCond']


In [80]:
df['HasPool'] = (df['PoolArea'] > 0).astype(int)
df['Has2ndFloor'] = (df['2ndFlrSF'] > 0).astype(int)
df['HasGarage'] = (df['GarageArea'] > 0).astype(int)
df['HasBsmt'] = (df['TotalBsmtSF'] > 0).astype(int)
df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)


In [81]:
# Set 'Id' column as the index and drop it from the dataset
df.set_index('Id', inplace=True)


In [82]:
# Drop Features
cols_to_drop = [
    'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'PoolArea', 'GarageArea', 'GarageCars', 'Fireplaces'
]

# Optional: Drop high-missing-value features if not encoded yet
optional_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']  

df = df.drop(columns=cols_to_drop + optional_drop)


In [83]:
df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,RemodAge,GarageAge,IsRemodeled,TotalPorchSF,OverallScore,HasPool,Has2ndFloor,HasGarage,HasBsmt,HasFireplace
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,5,5.0,0,61,35,0,1,1,1,0
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,31,31.0,0,0,48,0,0,1,1,1
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,6,7.0,1,42,35,0,1,1,1,1
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,36,8.0,1,307,35,0,1,1,1,1
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,8,8.0,0,84,40,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,7,8.0,1,40,30,0,1,1,1,1
1457,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,22,32.0,1,0,36,0,0,1,1,1
1458,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,4,69.0,1,60,63,0,1,1,1,1
1459,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,14,60.0,1,112,30,0,0,1,1,0


In [84]:
cdata =df.select_dtypes(include=['float64', 'int64'])

In [85]:
corr = cdata.corr()['SalePrice'].sort_values(ascending=False)

In [86]:
corr

SalePrice       1.000000
OverallQual     0.790982
TotalSF         0.782260
GrLivArea       0.708624
TotalBath       0.631731
OverallScore    0.565294
TotRmsAbvGrd    0.533723
MasVnrArea      0.472614
HasFireplace    0.471908
BsmtFinSF1      0.386420
WoodDeckSF      0.324413
LotArea         0.263843
HasGarage       0.236832
BsmtUnfSF       0.214479
LotFrontage     0.209624
TotalPorchSF    0.195739
BedroomAbvGr    0.168213
HasBsmt         0.152829
Has2ndFloor     0.137656
HasPool         0.093708
MoSold          0.046432
BsmtFinSF2     -0.011378
MiscVal        -0.021190
IsRemodeled    -0.021933
LowQualFinSF   -0.025606
YrSold         -0.028923
OverallCond    -0.077856
MSSubClass     -0.084284
KitchenAbvGr   -0.135907
GarageAge      -0.257150
RemodAge       -0.509079
HouseAge       -0.523350
Name: SalePrice, dtype: float64

In [87]:
weak_corr_features = corr[(corr > -0.2) & (corr < 0.2)].index.tolist()

In [88]:
weak_corr_features

['TotalPorchSF',
 'BedroomAbvGr',
 'HasBsmt',
 'Has2ndFloor',
 'HasPool',
 'MoSold',
 'BsmtFinSF2',
 'MiscVal',
 'IsRemodeled',
 'LowQualFinSF',
 'YrSold',
 'OverallCond',
 'MSSubClass',
 'KitchenAbvGr']

In [89]:
weak_corr_features = [feat for feat in weak_corr_features if feat != 'SalePrice']

# Drop from dataframe
df = df.drop(columns=weak_corr_features)

print(f"Dropped features due to weak correlation: {weak_corr_features}")

Dropped features due to weak correlation: ['TotalPorchSF', 'BedroomAbvGr', 'HasBsmt', 'Has2ndFloor', 'HasPool', 'MoSold', 'BsmtFinSF2', 'MiscVal', 'IsRemodeled', 'LowQualFinSF', 'YrSold', 'OverallCond', 'MSSubClass', 'KitchenAbvGr']


In [90]:
X = df.drop(columns=['SalePrice'])  # Features only
y = df['SalePrice']  # Target

In [91]:
import category_encoders as ce

# Identify categorical columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Apply Target Encoding
encoder = ce.TargetEncoder(cols=cat_cols)
X_encoded = encoder.fit_transform(X, y)

In [92]:
X_encoded

Unnamed: 0_level_0,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,SaleType,SaleCondition,TotalSF,TotalBath,HouseAge,RemodAge,GarageAge,OverallScore,HasGarage,HasFireplace
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,191004.994787,65.0,8450,181130.538514,164754.818378,180183.746758,180950.95682,176938.047529,179956.799566,197965.734807,...,173401.836622,175202.219533,2566,3.5,5,5,5.0,35,1,0
2,191004.994787,80.0,9600,181130.538514,164754.818378,180183.746758,180950.95682,178122.652042,179956.799566,197643.209810,...,173401.836622,175202.219533,2524,2.5,31,31,31.0,48,1,1
3,191004.994787,68.0,11250,181130.538514,206101.665289,180183.746758,180950.95682,176938.047529,179956.799566,197965.734807,...,173401.836622,175202.219533,2706,3.5,7,6,7.0,35,1,1
4,191004.994787,60.0,9550,181130.538514,206101.665289,180183.746758,180950.95682,181623.425855,179956.799566,209344.287867,...,173401.836622,146537.060693,2473,2.0,91,36,8.0,35,1,1
5,191004.994787,84.0,14260,181130.538514,206101.665289,180183.746758,180950.95682,178122.652042,179956.799566,318453.591177,...,173401.836622,175202.219533,3343,3.5,8,8,8.0,40,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,191004.994787,62.0,7917,181130.538514,164754.818378,180183.746758,180950.95682,176938.047529,179956.799566,192821.904993,...,173401.836622,175202.219533,2600,2.5,8,7,8.0,30,1,1
1457,191004.994787,85.0,13175,181130.538514,164754.818378,180183.746758,180950.95682,176938.047529,179956.799566,189009.693995,...,173401.836622,175202.219533,3615,3.0,32,22,32.0,36,1,1
1458,191004.994787,66.0,9042,181130.538514,164754.818378,180183.746758,180950.95682,176938.047529,179956.799566,209344.287867,...,173401.836622,175202.219533,3492,2.0,69,4,69.0,63,1,1
1459,191004.994787,68.0,9717,181130.538514,164754.818378,180183.746758,180950.95682,176938.047529,179956.799566,145847.080044,...,173401.836622,175202.219533,2156,2.0,60,14,60.0,30,1,0


In [93]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize numerical features
X_scaled = StandardScaler().fit_transform(X_encoded.select_dtypes(include=['float64', 'int64']))

# Apply PCA
pca = PCA(n_components=0.96)  # Keep 95% variance
pca_data = pca.fit_transform(X_scaled)


In [94]:
pca_data

array([[ 2.79338243, -0.55241438, -1.61470775, ...,  0.13068706,
         0.17193421, -0.24526956],
       [ 0.30444756,  1.43222835,  1.32666082, ...,  0.30417791,
        -0.11803226, -0.20536757],
       [ 3.45350246, -0.34427682, -0.77719934, ..., -0.03551073,
         0.12497743,  0.18017756],
       ...,
       [ 3.52791833, -0.94236037,  0.52256297, ..., -0.8793637 ,
         1.57732392,  0.73567831],
       [-2.11093634,  1.35543631, -0.13793843, ..., -1.01449226,
         0.28426232, -0.84987272],
       [-1.00616536,  1.85039307,  0.1759083 , ..., -0.28754582,
         0.65525238,  1.37697017]], shape=(1460, 41))

In [95]:
# Convert PCA data back to DataFrame
pca_df = pd.DataFrame(pca_data, index=df.index, 
                      columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])

# Keep categorical features (if any)
cat_cols = X_encoded.select_dtypes(include=['object']).columns
final_df = pd.concat([pca_df, df[cat_cols]], axis=1)


In [96]:
final_df

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,PC41
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.793382,-0.552414,-1.614708,0.281985,-1.121446,-0.953057,-2.172451,-1.054694,0.164744,-0.276547,...,-0.500974,-0.085641,0.043880,-0.027480,-0.236786,-0.195111,0.329710,0.130687,0.171934,-0.245270
2,0.304448,1.432228,1.326661,1.418364,1.236119,0.957252,-0.485195,0.337887,0.499275,-1.818600,...,-0.208891,1.081969,0.955490,0.150911,1.493335,-0.014414,-1.273638,0.304178,-0.118032,-0.205368
3,3.453502,-0.344277,-0.777199,0.580173,-0.520067,-1.211740,-0.395664,-0.239754,-0.407653,-0.422813,...,0.549880,0.007416,0.442793,0.457162,-0.219378,-0.177382,0.742017,-0.035511,0.124977,0.180178
4,-0.967781,1.300965,1.614147,-0.254532,2.023425,-1.472367,0.183380,0.495970,0.289678,0.609367,...,1.605931,-0.789343,0.470715,-0.383270,0.431809,0.245027,0.964288,0.060649,0.439779,-0.090239
5,5.108256,-0.831599,0.927013,0.379447,-0.629529,-1.354774,-0.817845,-0.565492,-0.215722,0.047237,...,0.772894,-0.141546,0.578320,0.932040,0.441183,-0.457302,-0.062882,-0.251752,0.581029,-0.557546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,1.381634,0.127719,-1.419896,-1.278468,0.103784,-1.854159,0.351669,-0.440352,0.922477,-0.218378,...,-0.143231,0.053874,0.181411,0.597286,0.259673,-0.113010,-0.316564,0.223081,-0.837837,-0.185361
1457,0.704403,1.105543,2.476818,-0.354137,-1.266441,0.153510,-0.383346,1.086878,1.860535,-0.540879,...,0.405127,-0.783990,-0.170366,0.164819,-0.726229,-0.780750,0.290882,1.529096,-0.596640,-0.168195
1458,3.527918,-0.942360,0.522563,-1.696351,2.366809,-1.967799,-0.692708,-0.951724,-0.412351,-1.598156,...,0.360721,-0.340175,0.569834,-0.642885,0.181975,-0.943703,0.640187,-0.879364,1.577324,0.735678
1459,-2.110936,1.355436,-0.137938,0.611714,-0.550074,1.274630,-0.455907,0.082623,-0.567236,-0.631343,...,0.295753,0.201721,-0.202307,1.262020,-0.405852,-0.436644,2.264773,-1.014492,0.284262,-0.849873


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(final_df,y)

print("Best Params:", grid_search.best_params_)


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Params: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [101]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Import your custom modules
from feature_engineering import FeatureEngineer
from tencoder import TargetEncoderWrapper

# Prepare your data
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# Apply feature engineering first to get categorical columns
fe = FeatureEngineer()
X_fe = fe.fit_transform(X)
cat_cols = X_fe.select_dtypes(include=['object']).columns.tolist()

# Create the full pipeline
pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('target_encoding', TargetEncoderWrapper(cols=cat_cols)),
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=0.96)),
    ('model', RandomForestRegressor(n_estimators=500, random_state=42, min_samples_split=2, min_samples_leaf=1, max_features="log2"))
])

# Split data for training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Start MLflow run
with mlflow.start_run(run_name="house_price_prediction_pipeline"):
    
    # Log parameters
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 500)
    mlflow.log_param("max_depth", "None")
    mlflow.log_param("max_features", "log2")
    mlflow.log_param("min_samples_leaf", 1)
    mlflow.log_param("min_samples_split", 2)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("pca_components", 0.96)
    mlflow.log_param("imputer_strategy", "median")
    mlflow.log_param("scaler_type", "StandardScaler")
    mlflow.log_param("categorical_columns", cat_cols)
    
    # Fit the pipeline
    print("Training pipeline...")
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_val = pipeline.predict(X_val)
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, y_pred_train)
    val_mse = mean_squared_error(y_val, y_pred_val)
    train_rmse = np.sqrt(train_mse)
    val_rmse = np.sqrt(val_mse)
    train_r2 = r2_score(y_train, y_pred_train)
    val_r2 = r2_score(y_val, y_pred_val)
    
    # Log metrics
    mlflow.log_metric("train_mse", train_mse)
    mlflow.log_metric("val_mse", val_mse)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("val_r2", val_r2)
    
    # Log the pipeline model
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="house_price_prediction_pipeline"
    )
    
    # Log additional artifacts if needed
    # You can save feature importance, plots, etc.
    if hasattr(pipeline.named_steps['model'], 'feature_importances_'):
        import matplotlib.pyplot as plt
        
        # Get feature names after all transformations
        feature_names = [f"feature_{i}" for i in range(len(pipeline.named_steps['model'].feature_importances_))]
        
        # Create feature importance plot
        plt.figure(figsize=(10, 6))
        indices = np.argsort(pipeline.named_steps['model'].feature_importances_)[::-1][:20]
        plt.bar(range(len(indices)), pipeline.named_steps['model'].feature_importances_[indices])
        plt.title('Top 20 Feature Importances')
        plt.xlabel('Features')
        plt.ylabel('Importance')
        plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=45)
        plt.tight_layout()
        plt.savefig("feature_importance.png")
        mlflow.log_artifact("feature_importance.png")
        plt.close()
    
    # Log pipeline configuration as text
    pipeline_info = f"""
    Pipeline Steps:
    1. Feature Engineering: {type(pipeline.named_steps['feature_engineering']).__name__}
    2. Target Encoding: {type(pipeline.named_steps['target_encoding']).__name__}
    3. Scaling: {type(pipeline.named_steps['scaler']).__name__}
    4. Imputation: {type(pipeline.named_steps['imputer']).__name__}
    5. PCA: {type(pipeline.named_steps['pca']).__name__}
    6. Model: {type(pipeline.named_steps['model']).__name__}
    
    Categorical Columns: {cat_cols}
    """
    
    with open("pipeline_info.txt", "w") as f:
        f.write(pipeline_info)
    mlflow.log_artifact("pipeline_info.txt")
    
    print(f"Pipeline saved successfully!")
    print(f"Run ID: {mlflow.active_run().info.run_id}")
    print(f"Validation RMSE: {val_rmse:.4f}")
    print(f"Validation R²: {val_r2:.4f}")

# Optional: Load the model later
def load_saved_pipeline(run_id):
    """Load a saved pipeline from MLflow"""
    model_uri = f"runs:/{run_id}/model"
    loaded_pipeline = mlflow.sklearn.load_model(model_uri)
    return loaded_pipeline

# Example of loading and using the saved pipeline
# loaded_pipeline = load_saved_pipeline("your_run_id_here")
# predictions = loaded_pipeline.predict(new_data)

Training pipeline...


Registered model 'house_price_prediction_pipeline' already exists. Creating a new version of this model...
Created version '20' of model 'house_price_prediction_pipeline'.


Pipeline saved successfully!
Run ID: 6209c932dfc94d88a8457683182f336d
Validation RMSE: 31015.7382
Validation R²: 0.8746


In [100]:
!mlflow ui

[2025-07-11 19:19:01 +0530] [57603] [INFO] Starting gunicorn 23.0.0
[2025-07-11 19:19:01 +0530] [57603] [INFO] Listening at: http://127.0.0.1:5000 (57603)
[2025-07-11 19:19:01 +0530] [57603] [INFO] Using worker: sync
[2025-07-11 19:19:01 +0530] [57604] [INFO] Booting worker with pid: 57604
[2025-07-11 19:19:01 +0530] [57608] [INFO] Booting worker with pid: 57608
[2025-07-11 19:19:01 +0530] [57609] [INFO] Booting worker with pid: 57609
[2025-07-11 19:19:02 +0530] [57610] [INFO] Booting worker with pid: 57610
^C
[2025-07-11 19:20:05 +0530] [57603] [INFO] Handling signal: int
[2025-07-11 19:20:06 +0530] [57608] [INFO] Worker exiting (pid: 57608)
[2025-07-11 19:20:06 +0530] [57604] [INFO] Worker exiting (pid: 57604)
[2025-07-11 19:20:06 +0530] [57609] [INFO] Worker exiting (pid: 57609)
[2025-07-11 19:20:06 +0530] [57610] [INFO] Worker exiting (pid: 57610)


In [49]:
import joblib

In [102]:
joblib.dump(pipeline, '../models/full_model_pipeline.pkl')

['../models/full_model_pipeline.pkl']