In [1]:
import pandas as pd
# Load the prepared dataset; the feature lists and the modelling frame in the
# following cells are all derived from this DataFrame.
df1 = pd.read_csv('final.csv')

In [2]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Partition the columns of df1 by dtype: numeric columns on one side,
# everything else (the columns that get label-encoded below) on the other.
_numeric_columns = df1.select_dtypes(include=[np.number]).columns
_non_numeric_columns = df1.select_dtypes(exclude=[np.number]).columns
numerical_features = list(_numeric_columns)
categorical_features = list(_non_numeric_columns)

In [4]:
# Fit one LabelEncoder per categorical column and replace that column in df1
# with its integer codes. The fitted encoders are kept, keyed by column name,
# so the same label mapping can be saved and reused.
#
# This cell mutates df1 in place. Running it a second time would fit the
# encoders on the integer codes instead of the original labels and silently
# throw away the real label mapping, so refuse to do that before any state
# (df1 or encoders) is touched.
_numeric_now = set(df1.select_dtypes(include=[np.number]).columns)
_already_encoded = [name for name in categorical_features if name in _numeric_now]
if _already_encoded:
    raise RuntimeError(
        f"Columns already label-encoded: {_already_encoded}. "
        "Reload df1 from the CSV before re-running this cell."
    )

encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    # fit_transform learns the label set and returns the integer codes.
    df1[col] = le.fit_transform(df1[col])
    encoders[col] = le

In [5]:
# Display the names of the columns that were label-encoded.
categorical_features

['gold', 'ClientType', 'Micromarket', 'DayOfWeek']

In [6]:
# Display the names of the columns detected as numeric.
numerical_features

['inventory_id', 'Capacity', 'pre_booking', 'valid_from_hour', 'price']

In [None]:
import pickle
import os
import re


def clean_filename(filename):
    """Make a string safe to use as a file name.

    Drops each of the characters ``< > : " / \\ | ? *`` and turns every
    space into an underscore; all other characters are kept unchanged.
    """
    forbidden = '<>:"/\\|?*'
    kept = "".join(ch for ch in filename if ch not in forbidden)
    # Splitting on a single space and re-joining with "_" swaps each space
    # one-for-one, including leading, trailing and repeated spaces.
    return "_".join(kept.split(" "))

# Persist every fitted encoder to its own pickle file inside encoder_dir so
# the same label mapping can be loaded again later.
encoder_dir = "label_encoders_1"
os.makedirs(encoder_dir, exist_ok=True)

for col in encoders:
    encoder = encoders[col]
    # The column name is sanitised before it becomes part of the file name.
    pickle_path = f"{encoder_dir}/{clean_filename(col)}_encoder.pkl"
    with open(pickle_path, "wb") as f:
        pickle.dump(encoder, f)


In [None]:
# Modelling frame: the numeric columns first, then the label-encoded ones.
feature_order = numerical_features + categorical_features
final_df = df1[feature_order]

In [None]:
# List the columns in which every single value is missing.
_all_missing = final_df.isnull().all()
empty_columns = list(final_df.columns[_all_missing])
print(empty_columns)

In [None]:
# Count the missing values in each column of the modelling frame.
missing_values = final_df.isna().sum()
print(missing_values)

In [None]:
# Show the rows whose target column ('price') is missing.
_price_is_missing = final_df['price'].isna()
missing_price_rows = final_df[_price_is_missing]
print(missing_price_rows)

In [None]:
import pandas as pd
from pycaret.regression import *


# Initialise the PyCaret regression experiment on final_df with 'price' as
# the target, a fixed session_id and normalisation switched on.
reg = setup(
    data=final_df,
    target='price',
    session_id=123,
    normalize=True,
)

# Let PyCaret compare its candidate regressors and keep what it returns.
best_model = compare_models()
print(best_model)


In [None]:
# Show the column names of the modelling frame.
final_df.columns

In [None]:
# Display the estimator returned by compare_models().
best_model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# finalize_model is provided by the star import from pycaret.regression.
final_model = finalize_model(best_model)

target = 'price'

# Rows with a missing target cannot be used for fitting or scoring (the
# scikit-learn metrics reject NaN in y), so drop them before splitting.
# When 'price' has no missing values this leaves the data unchanged.
df = final_df.dropna(subset=[target])

X = df.drop(columns=[target])
y = df[target]

# Hold out 20% of the rows; random_state pins the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): finalize_model has presumably already fitted on the full
# dataset; this call refits on the training split only, which is what the
# held-out scores below rely on - confirm that is the intent.
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")


In [None]:
import mlflow
import mlflow.sklearn

# Record one fit/evaluate pass as an MLflow run.
with mlflow.start_run():

    # Fit on the training split, then predict the held-out split.
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    # Held-out scores; RMSE is derived from MSE.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Run parameters: the values mirror the train_test_split call made earlier.
    for param_name, param_value in (("test_size", 0.2), ("random_state", 42)):
        mlflow.log_param(param_name, param_value)

    # Run metrics.
    for metric_name, metric_value in (("R2 Score", r2), ("MSE", mse), ("RMSE", rmse)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model with the run under the artifact path "model".
    mlflow.sklearn.log_model(final_model, "model")

    print(
        f"R² Score: {r2:.4f}",
        f"Mean Squared Error: {mse:.4f}",
        f"Root Mean Squared Error: {rmse:.4f}",
        sep="\n",
    )

# The run is closed once the with-block exits.
print("Training logged in MLflow!")


In [None]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; prompts for the token without echoing it

# DagsHub-hosted MLflow tracking server for this project.
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Credentials come from the environment instead of being written into the
# notebook: a token stored in source is readable by anyone who can see the
# file or its history. Any access token that was previously committed in this
# cell should be treated as leaked and revoked in DagsHub.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    # No token exported before starting the kernel: ask for it interactively.
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Point MLflow at the remote tracking server.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Runs started after this call are recorded under this experiment.
mlflow.set_experiment("House_Price_Prediction")

# Record one fit/evaluate pass as an MLflow run on the configured server.
with mlflow.start_run() as run:
    # Fit on the training split, then predict the held-out split.
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    # Held-out scores; RMSE is derived from MSE.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Run parameters: the values mirror the train_test_split call made earlier.
    for param_name, param_value in (("test_size", 0.2), ("random_state", 42)):
        mlflow.log_param(param_name, param_value)

    # Run metrics; "R2 Score" is the key the registry comparison reads back.
    for metric_name, metric_value in (("R2 Score", r2), ("MSE", mse), ("RMSE", rmse)):
        mlflow.log_metric(metric_name, metric_value)

    # Keep the returned info object: its model_uri is needed for registration.
    model_info = mlflow.sklearn.log_model(final_model, "model")

    print(f"Model logged with R² Score: {r2:.4f}")

# Name under which versions of this model are kept in the MLflow registry.
model_name = "HousePriceModel"

client = mlflow.tracking.MlflowClient()

# Look at every version registered under model_name. search_model_versions
# is used instead of get_latest_versions because the latter only reports the
# newest version per stage and fails when the model has never been
# registered, which is exactly the situation on the first run. An empty
# result simply leaves best_r2 at -inf so the new model gets registered.
registered_versions = client.search_model_versions(f"name='{model_name}'")

best_r2 = -float("inf")
best_version = None

for version in registered_versions:
    if version.run_id:  # a version without a run has no metrics to compare
        metrics = client.get_run(version.run_id).data.metrics
        if "R2 Score" in metrics and metrics["R2 Score"] > best_r2:
            best_r2 = metrics["R2 Score"]
            best_version = version.version

print(f"Best previous model R² Score: {best_r2:.4f}")

# Register the new model only if it beats every previously registered version.
if r2 > best_r2:
    print(f"New model is better (R²: {r2:.4f}), updating the registry.")

    registered_model = mlflow.register_model(model_info.model_uri, model_name)

    print(f"Model registered as version {registered_model.version}")

    # Promote the freshly registered version to the "Production" stage.
    # NOTE(review): model stages are deprecated in recent MLflow releases;
    # confirm the installed version and the server still support this call.
    client.transition_model_version_stage(name=model_name, version=registered_model.version, stage="Production")

else:
    print(f"New model is worse (R²: {r2:.4f}), keeping version {best_version}.")