# Importing Libraries

In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
[0mCollecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.3->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)


In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import  GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import category_encoders as ce
import dill
from sklearn.decomposition import PCA

import mlflow
import mlflow.sklearn

# pip install --upgruad scikit-learn
import platform
print(platform.python_version())


# Check for GPU availability
try:
    import torch
    GPU_AVAILABLE = torch.cuda.is_available()
    # GPU_AVAILABLE=False
    print(f"GPU Available: {GPU_AVAILABLE}")
except ImportError:
    GPU_AVAILABLE = False
    print("PyTorch not installed; assuming no GPU support.")

# GPU-Compatible Libraries
if GPU_AVAILABLE:
    from cuml.ensemble import RandomForestRegressor
    from cuml.svm import SVR
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    from cuml.preprocessing import PolynomialFeatures
    from cuml.linear_model import LinearRegression
    from cuml.preprocessing import StandardScaler, OneHotEncoder
    
    print("Using GPU-accelerated libraries.")
else:
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
   
    import sklearn
    # from sklearn.model_selection import train_test_split
    print("Using CPU-based libraries.")

3.10.12
GPU Available: True
Using GPU-accelerated libraries.


# Data Preparation

In [3]:
# Load the data
file_path = '/kaggle/input/data-property-v5/Egypt_Houses_Price_Cleaned.csv'
df = pd.read_csv(file_path)  

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20591 entries, 0 to 20590
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Type            20591 non-null  object 
 1   Price           20591 non-null  float64
 2   Bedrooms        20591 non-null  float64
 3   Bathrooms       20591 non-null  float64
 4   Area            20591 non-null  float64
 5   Furnished       20591 non-null  object 
 6   Level           20591 non-null  float64
 7   Compound        20591 non-null  object 
 8   Payment_Option  20591 non-null  object 
 9   Delivery_Date   20591 non-null  float64
 10  Delivery_Term   20591 non-null  object 
 11  City            20591 non-null  object 
 12  Price_Category  20591 non-null  object 
 13  Area_Category   20591 non-null  object 
 14  Price_per_sqm   20591 non-null  float64
 15  in_Compound     20591 non-null  int64  
 16  Immediate_Move  20591 non-null  int64  
dtypes: float64(7), int64(2), object

In [5]:
df.nunique()

Type                 7
Price             3429
Bedrooms             8
Bathrooms            8
Area               726
Furnished            3
Level               17
Compound           539
Payment_Option       4
Delivery_Date       19
Delivery_Term        5
City               175
Price_Category       6
Area_Category        4
Price_per_sqm     9265
in_Compound          2
Immediate_Move       2
dtype: int64

In [6]:
df.head()


Unnamed: 0,Type,Price,Bedrooms,Bathrooms,Area,Furnished,Level,Compound,Payment_Option,Delivery_Date,Delivery_Term,City,Price_Category,Area_Category,Price_per_sqm,in_Compound,Immediate_Move
0,Duplex,4000000.0,3.0,3.0,400.0,No,7.0,Unknown,Cash,0.0,Finished,Nasr City,Premium,Very Large,10000.0,0,1
1,Apartment,4000000.0,3.0,3.0,160.0,No,10.0,Unknown,Cash,0.0,Finished,Camp Caesar,Premium,Medium,25000.0,0,1
2,Apartment,2250000.0,3.0,2.0,165.0,No,1.0,Unknown,Cash,0.0,Finished,Smoha,Premium,Large,13636.363636,0,1
3,Apartment,1900000.0,3.0,2.0,230.0,No,10.0,Unknown,Cash,0.0,Finished,Nasr City,Premium,Large,8260.869565,0,1
4,Apartment,1844900.0,4.0,3.0,222.0,No,1.0,Beit Al Watan,Cash or Installment,36.0,Semi Finished,New Cairo - El Tagamoa,Premium,Large,8310.36036,1,0


In [7]:
# df=df.drop(['Unnamed: 0','price'],axis=1)
df=df.drop(["Price_Category","Price_per_sqm","Area_Category","in_Compound","Immediate_Move"],axis=1)

In [8]:
df.isnull().sum()

Type              0
Price             0
Bedrooms          0
Bathrooms         0
Area              0
Furnished         0
Level             0
Compound          0
Payment_Option    0
Delivery_Date     0
Delivery_Term     0
City              0
dtype: int64

In [9]:
# Log transformation
df["Log_Price"] = np.log1p(df["Price"])

In [10]:
df.head()

Unnamed: 0,Type,Price,Bedrooms,Bathrooms,Area,Furnished,Level,Compound,Payment_Option,Delivery_Date,Delivery_Term,City,Log_Price
0,Duplex,4000000.0,3.0,3.0,400.0,No,7.0,Unknown,Cash,0.0,Finished,Nasr City,15.201805
1,Apartment,4000000.0,3.0,3.0,160.0,No,10.0,Unknown,Cash,0.0,Finished,Camp Caesar,15.201805
2,Apartment,2250000.0,3.0,2.0,165.0,No,1.0,Unknown,Cash,0.0,Finished,Smoha,14.626441
3,Apartment,1900000.0,3.0,2.0,230.0,No,10.0,Unknown,Cash,0.0,Finished,Nasr City,14.457365
4,Apartment,1844900.0,4.0,3.0,222.0,No,1.0,Beit Al Watan,Cash or Installment,36.0,Semi Finished,New Cairo - El Tagamoa,14.427936


In [11]:
df.columns

Index(['Type', 'Price', 'Bedrooms', 'Bathrooms', 'Area', 'Furnished', 'Level',
       'Compound', 'Payment_Option', 'Delivery_Date', 'Delivery_Term', 'City',
       'Log_Price'],
      dtype='object')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20591 entries, 0 to 20590
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Type            20591 non-null  object 
 1   Price           20591 non-null  float64
 2   Bedrooms        20591 non-null  float64
 3   Bathrooms       20591 non-null  float64
 4   Area            20591 non-null  float64
 5   Furnished       20591 non-null  object 
 6   Level           20591 non-null  float64
 7   Compound        20591 non-null  object 
 8   Payment_Option  20591 non-null  object 
 9   Delivery_Date   20591 non-null  float64
 10  Delivery_Term   20591 non-null  object 
 11  City            20591 non-null  object 
 12  Log_Price       20591 non-null  float64
dtypes: float64(7), object(6)
memory usage: 2.0+ MB


In [13]:
# Define features and target
target_column="Price"
X = df.drop(columns=[target_column,"Log_Price"])  

y = df["Log_Price"].astype("float32")  

In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Splitting data features into numeric and categorical
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

In [16]:
# Custom Transformer to Convert Output to Float32
from sklearn.base import BaseEstimator, TransformerMixin

class Float32Transformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.astype('float32')

In [17]:
#Define preprocessing for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
   
])


# Ensure the data passed to cuML's encoder is a pandas DataFrame
from sklearn.preprocessing import FunctionTransformer

# This function makes sure the output is a pandas DataFrame
def ensure_dataframe(X):
    return pd.DataFrame(X)

ensure_dataframe_transformer = FunctionTransformer(ensure_dataframe)

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent"))
    ,("binary_encode", ce.BinaryEncoder())
    
    
])



# Combine preprocessors in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
         ("cat",categorical_transformer,categorical_features)
        ,('num', numerical_transformer, numeric_features)
    ]
)


In [18]:
preprocessor

In [19]:
# print(pipeline_RandomForest.steps)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20591 entries, 0 to 20590
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Type            20591 non-null  object 
 1   Bedrooms        20591 non-null  float64
 2   Bathrooms       20591 non-null  float64
 3   Area            20591 non-null  float64
 4   Furnished       20591 non-null  object 
 5   Level           20591 non-null  float64
 6   Compound        20591 non-null  object 
 7   Payment_Option  20591 non-null  object 
 8   Delivery_Date   20591 non-null  float64
 9   Delivery_Term   20591 non-null  object 
 10  City            20591 non-null  object 
dtypes: float64(5), object(6)
memory usage: 1.7+ MB


In [20]:
preprocessor.fit_transform(X_train)

array([[ 0.        ,  0.        ,  1.        , ..., -0.7438756 ,
        -0.71561294, -0.29751831],
       [ 0.        ,  1.        ,  0.        , ...,  0.41070453,
        -0.71561294,  1.81846851],
       [ 0.        ,  1.        ,  1.        , ..., -0.23431231,
        -0.24450561,  1.81846851],
       ...,
       [ 1.        ,  1.        ,  0.        , ..., -0.20206147,
         0.22660171,  3.35736801],
       [ 0.        ,  1.        ,  1.        , ...,  0.53970789,
         0.22660171, -0.48988075],
       [ 0.        ,  1.        ,  0.        , ...,  1.62333617,
        -0.71561294, -0.48988075]])

In [21]:
df.head()

Unnamed: 0,Type,Price,Bedrooms,Bathrooms,Area,Furnished,Level,Compound,Payment_Option,Delivery_Date,Delivery_Term,City,Log_Price
0,Duplex,4000000.0,3.0,3.0,400.0,No,7.0,Unknown,Cash,0.0,Finished,Nasr City,15.201805
1,Apartment,4000000.0,3.0,3.0,160.0,No,10.0,Unknown,Cash,0.0,Finished,Camp Caesar,15.201805
2,Apartment,2250000.0,3.0,2.0,165.0,No,1.0,Unknown,Cash,0.0,Finished,Smoha,14.626441
3,Apartment,1900000.0,3.0,2.0,230.0,No,10.0,Unknown,Cash,0.0,Finished,Nasr City,14.457365
4,Apartment,1844900.0,4.0,3.0,222.0,No,1.0,Beit Al Watan,Cash or Installment,36.0,Semi Finished,New Cairo - El Tagamoa,14.427936


In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20591 entries, 0 to 20590
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Type            20591 non-null  object 
 1   Bedrooms        20591 non-null  float64
 2   Bathrooms       20591 non-null  float64
 3   Area            20591 non-null  float64
 4   Furnished       20591 non-null  object 
 5   Level           20591 non-null  float64
 6   Compound        20591 non-null  object 
 7   Payment_Option  20591 non-null  object 
 8   Delivery_Date   20591 non-null  float64
 9   Delivery_Term   20591 non-null  object 
 10  City            20591 non-null  object 
dtypes: float64(5), object(6)
memory usage: 1.7+ MB


In [23]:
train_accs = {}
test_accs = {}

# Random Forest

In [24]:
# Define the model
if GPU_AVAILABLE:
    model = RandomForestRegressor(n_streams=1,random_state=42)
else :
    model = RandomForestRegressor(random_state=42)

# Create a pipeline that includes preprocessing and the model
pipeline_RandomForest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('to_float32', Float32Transformer()),
    ('model', model)
])


# Define hyperparameter grid for model tuning
param_grid = {
    'model__n_estimators': [100,200,300,400]
     ,
    'model__max_depth': [ 30,40,50,60],
   'model__min_samples_split': [5,7, 10,15],
  'model__min_samples_leaf': [1,2,4,6]
}

# param_grid = {
#     'model__n_estimators': [400],
#     'model__max_depth': [30],
#    'model__min_samples_split': [5],
#   'model__min_samples_leaf': [1]
# }

# Use GridSearchCV to find the best parameters  
grid_search = GridSearchCV(pipeline_RandomForest, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=1)

# Fit the pipeline to the training data  
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [25]:
# Evaluate the model on the train set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['Random Forest'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

Best Parameters: {'model__max_depth': 40, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 400}
Train RMSE: 0.53
Train R2 Score: 0.80


In [26]:
# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['Random Forest'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.70
Test R2 Score: 0.65


In [27]:
# Tracking model
from mlflow.models.signature import infer_signature

input_example = X_train.iloc[:5]
pred_example = best_model.predict(input_example)
signature = infer_signature(input_example, pred_example)

with mlflow.start_run(run_name="RandomForestRegressor"):
    
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # ✅ Updated model logging
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="RandomForest_model",
        input_example=input_example,
        signature=signature
    )

In [28]:
# Saving the best model
with open('/kaggle/working/random_forest_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("Model saved successfully with dill.")

Model saved successfully with dill.


In [29]:
# Define PCA transformer
pca = PCA(n_components=0.80)  # Retain 95% of the variance

# Define the model (Random Forest Regressor)
model = RandomForestRegressor(random_state=42)

# Create a pipeline that includes preprocessing (scaling, PCA) and the model
pipeline_RandomForest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('to_float32', Float32Transformer()),
    ('pca', pca),
    ('model', model)
])

# Define hyperparameter grid for model tuning
param_grid = {
    'model__n_estimators': [100, 200, 300, 400],
    'model__max_depth': [30, 40, 50, 60],
    'model__min_samples_split': [5, 7, 10, 15],
    'model__min_samples_leaf': [1, 2, 4, 6]
}

# param_grid = {
#     'model__n_estimators': [300],
#     'model__max_depth': [30],
#    'model__min_samples_split': [5],
#   'model__min_samples_leaf': [2]
# }

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(pipeline_RandomForest, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit the pipeline to the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


  return func(**kwargs)


In [30]:
# Evaluate the model on the train set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train)

# Convert predictions if needed (e.g., if using GPU-based libraries like cuML)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

# Compute evaluation metrics
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['Random Forest(PCA)'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

Best Parameters: {'model__max_depth': 30, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 300}
Train RMSE: 0.42
Train R2 Score: 0.87


In [31]:
# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Convert predictions if needed (e.g., if using GPU-based libraries like cuML)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

# Compute evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['Random Forest(PCA)'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.81
Test R2 Score: 0.53


In [32]:
# Tracking model
with mlflow.start_run(run_name="Random Forest(PCA)"):
    

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "Random Forest(PCA)_model",pip_requirements=[])



In [33]:
# Saving the best model using dill
with open('/kaggle/working/random_forest_pca_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("Model saved successfully with dill.")

Model saved successfully with dill.


# Support Vector Regressor (SVR)

In [34]:
model = SVR()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameter grid for model tuning
param_grid = {
    'model__kernel': ['linear', 'rbf', 'poly']
    ,'model__C': [0.1, 1, 10]
    ,'model__epsilon': [0.1, 0.2, 0.5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[W] [02:33:44.108628] SVR with the linear kernel can be much faster using the specialized solver provided by LinearSVR. Consider switching to LinearSVR if tranining takes too long.


In [35]:
#Evaluating the model on training set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred



mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['SVR'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

Best Parameters: {'model__C': 10, 'model__epsilon': 0.2, 'model__kernel': 'linear'}
Train RMSE: 0.84
Train R2 Score: 0.49


In [36]:
#Evaluating the model on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred



mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['SVR'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.83
Test R2 Score: 0.50


In [37]:
# Tracking model
with mlflow.start_run(run_name="svr"):
    

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "svr_model")



In [38]:
# Saving the best model
with open('/kaggle/working/svr_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("Model saved successfully with dill.")

Model saved successfully with dill.


# XGBoost

In [39]:
# Dynamically set the device parameter
device = 'cuda' if GPU_AVAILABLE else 'cpu'


# Define the model
model = XGBRegressor(tree_method='hist', device=device, random_state=42)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

# param_grid = {
#     'model__n_estimators': [ 200],
#     'model__learning_rate': [0.1],
#     'model__max_depth': [7]
# }

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [40]:
#Evaluting the model on training set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['XGBoost'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}
Train RMSE: 0.54
Train R2 Score: 0.79


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [41]:
#Evaluting the model on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['XGBoost'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.70
Test R2 Score: 0.65


In [42]:
# Tracking model
with mlflow.start_run(run_name="xgboost"):
    

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "xgboost_model")



In [43]:
# Saving the best model
with open('/kaggle/working/xgboost_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("Model saved successfully with dill.")

Model saved successfully with dill.


# LightGBM

In [44]:
model = LGBMRegressor(device='gpu' if GPU_AVAILABLE else 'cpu',random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [-1, 10, 20]
}

# param_grid = {
#     'model__n_estimators': [200],
#     'model__learning_rate': [0.2],
#     'model__max_depth': [10]
# }

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 16472, number of used features: 33
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 12 dense feature groups (0.19 MB) transferred to GPU in 0.001203 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 14.539200


In [45]:
#Evaluating the model on training set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_train)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['LightGBM'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

Best Parameters: {'model__learning_rate': 0.2, 'model__max_depth': 10, 'model__n_estimators': 200}
Train RMSE: 0.57
Train R2 Score: 0.76


In [46]:
#Evaluating the model on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['LightGBM'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.70
Test R2 Score: 0.65


In [47]:
# Tracking model
with mlflow.start_run(run_name="lightgbm"):
    

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "lightgbm_model")



In [48]:
# Saving the best model
with open('/kaggle/working/lightgbm_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("Model saved successfully with dill.")

Model saved successfully with dill.


# Neural Network

In [49]:
import numpy as np
import dill
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    try:
        for device in gpu_devices:
            tf.config.experimental.set_virtual_device_configuration(
                device, [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096*20)]
            )
        print("✅ GPU detected and memory allocated:", gpu_devices)
    except RuntimeError as e:
        print(f"⚠️ GPU Initialization Error: {e}")
else:
    print("⚠️ No GPU detected. Running on CPU.")


np.random.seed(42)
tf.random.set_seed(42)

# ✅ Data Preprocessing
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# ✅ Get Correct Input Shape
input_dim = X_train_scaled.shape[1]

# ✅ Define Model Function (GPU Optimized)
def build_model(hidden_layer_sizes=(50,), activation='relu', optimizer='adam', alpha=0.0001):
    model = Sequential()
    
    # Input layer
    model.add(Input(shape=(input_dim,)))  # ✅ Explicitly define input shape
    
    # Hidden layers
    for units in hidden_layer_sizes:
        model.add(Dense(units, activation=activation, kernel_regularizer=l2(alpha)))
    
    # Output layer
    model.add(Dense(1, dtype="float32"))  # ✅ Ensure float32 output for stability
    
    # Compile model
    model.compile(optimizer=optimizer, loss='mse')
    
    return model

# ✅ Define Hyperparameter Options (Manual Search)
hidden_layer_options = [(50,), (100,), (50, 50)]
activation_options = ['relu', 'tanh']
optimizer_options = ['adam']#, 'sgd']
alpha_options = [0.0001, 0.001, 0.01]

best_rmse = float("inf")
best_model = None
best_params = None

# ✅ Manual Hyperparameter Search (GPU Execution)
for hidden_layers in hidden_layer_options:
    for activation in activation_options:
        for optimizer in optimizer_options:
            for alpha in alpha_options:
                print(f"Training model with: hidden_layers={hidden_layers}, activation={activation}, optimizer={optimizer}, alpha={alpha}")

                # ✅ Train Model on GPU (TensorFlow automatically selects GPU if available)
                model = build_model(hidden_layers, activation, optimizer, alpha)

                # Train model
                model.fit(X_train_scaled, y_train, epochs=100, batch_size=8, verbose=0)

                # Predict on test set
                y_pred = model.predict(X_test_scaled)

                # Evaluate performance
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, y_pred)

                print(f"RMSE: {rmse:.2f}, R2: {r2:.2f}")

                # Save best model
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = model
                    best_params = {
                        "hidden_layers": hidden_layers,
                        "activation": activation,
                        "optimizer": optimizer,
                        "alpha": alpha

                    
                }

✅ GPU detected and memory allocated: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Training model with: hidden_layers=(50,), activation=relu, optimizer=adam, alpha=0.0001
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSE: 0.77, R2: 0.58
Training model with: hidden_layers=(50,), activation=relu, optimizer=adam, alpha=0.001
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
RMSE: 0.76, R2: 0.59
Training model with: hidden_layers=(50,), activation=relu, optimizer=adam, alpha=0.01
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
RMSE: 0.77, R2: 0.57
Training model with: hidden_layers=(50,), activation=tanh, optimizer=adam, alpha=0.0001
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
RMSE: 0.75, R2: 0.60
Training model with: hidden_layers=(50,), activation=tanh, optimizer=adam, alpha=0.001
[1m129/12

In [50]:
# Model Evaluation on training set
y_pred = best_model.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['Neural Network'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")

[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Train RMSE: 0.70
Train R2 Score: 0.64


In [51]:
# Model Evaluation on test set
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['Neural Network'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Test RMSE: 0.75
Test R2 Score: 0.60


In [52]:
# Tracking model
with mlflow.start_run(run_name="Neural Network"):
    

    mlflow.log_params(best_params)
    mlflow.log_metric("rmse", best_rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "Neural Network_model")



In [53]:
# ✅ Saving the Best Model
with open('/kaggle/working/neural_network_best_model.pkl', 'wb') as f:
    dill.dump(best_model, f)
print("✅ Model saved successfully with dill.")

✅ Model saved successfully with dill.


# Stacking

In [54]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import torch

# Check GPU Availability
def is_gpu_available():
    return torch.cuda.is_available()

GPU_AVAILABLE = is_gpu_available()
print(f"GPU Available: {GPU_AVAILABLE}")

# Load Data
df_ = df.copy()

# Preprocessing Pipeline
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)])

# Base Models (GPU-Enabled)
rf = RandomForestRegressor(n_estimators=300, max_depth=20, min_samples_split=5, min_samples_leaf=2, random_state=42)
xgb = XGBRegressor(n_estimators=400, max_depth=10, learning_rate=0.05, tree_method='gpu_hist' if GPU_AVAILABLE else 'hist', random_state=42)
lgb = LGBMRegressor(n_estimators=400, max_depth=-1, learning_rate=0.05, device='gpu' if GPU_AVAILABLE else 'cpu', random_state=42)
cat = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.05, task_type='GPU' if GPU_AVAILABLE else 'CPU', verbose=0, random_state=42)

# Stacking Model
stacking_model = StackingRegressor(
    estimators=[('rf', rf), ('xgb', xgb), ('lgb', lgb), ('cat', cat)],
    final_estimator=GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
)



        
# Pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", stacking_model)])

# Train Model
pipeline.fit(X_train, y_train)

# Save Model

df = df_.copy()


GPU Available: True


  return func(**kwargs)
  return func(**kwargs)
  ret = func(*args, **kwargs)

    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 608
[LightGBM] [Info] Number of data points in the train set: 16472, number of used features: 157
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (0.19 MB) transferred to GPU in 0.000951 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 14.539200


  return func(**kwargs)
  ret = func(*args, **kwargs)

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 586
[LightGBM] [Info] Number of data points in the train set: 13177, number of used features: 146
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (0.15 MB) transferred to GPU in 0.000825 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 14.545519
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 573
[LightGBM] [Info] Number of data points in the train set: 13177, number of used features: 140
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature 

In [55]:
# Model Evaluation on training set
y_pred = pipeline.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

train_accs['Stacking'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Train RMSE: {rmse:.2f}")
print(f"Train R2 Score: {r2:.2f}")


    E.g. tree_method = "hist", device = "cuda"



Train RMSE: 0.60
Train R2 Score: 0.74


In [56]:
# Model Evaluation on test set
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

test_accs['Stacking'] = {'RMSE': rmse, 'R2_Score': r2}

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2 Score: {r2:.2f}")

Test RMSE: 0.70
Test R2 Score: 0.65


In [57]:
# Tracking model
with mlflow.start_run(run_name="stacking"):
    

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(best_model, "stacking_model")



In [58]:
rows = []
for model in train_accs:
    train_rmse = train_accs[model]['RMSE']
    test_rmse = test_accs[model]['RMSE']
    train_r2 = train_accs[model]['R2_Score']
    test_r2 = test_accs[model]['R2_Score']
    
    rows.append({
        'Model': model,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train R2_Score': train_r2,
        'Test R2_Score': test_r2,
    })

# Create the DataFrame
models_df = pd.DataFrame(rows)
print(models_df)

                Model  Train RMSE  Test RMSE  Train R2_Score  Test R2_Score
0       Random Forest    0.531282   0.695523        0.796620       0.651308
1  Random Forest(PCA)    0.417449   0.809764        0.874436       0.527354
2                 SVR    0.837547   0.833075        0.494553       0.499749
3             XGBoost    0.536989   0.700816        0.792227       0.645980
4            LightGBM    0.572864   0.701483        0.763538       0.645307
5      Neural Network    0.702107   0.745891        0.644807       0.598976
6            Stacking    0.598100   0.698591        0.742247       0.648225
