In [1]:
import dagshub
# Point the dagshub client at the 'kaarthikkvishwa/app' repository, with the
# mlflow option switched on.
dagshub.init(
    repo_owner='kaarthikkvishwa',
    repo_name='app',
    mlflow=True,
)

In [2]:
import dagshub
# Same dagshub.init call as the first cell (same owner, repo and mlflow flag).
dagshub.init(
    repo_owner='kaarthikkvishwa',
    repo_name='app',
    mlflow=True,
)

In [3]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; used to prompt for the token instead of hard-coding it

# Set your DagsHub MLflow Tracking Server
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Authenticate with DagsHub.
# SECURITY: an access token must never be stored in the notebook. The token
# that was previously hard-coded here should be considered leaked and revoked.
# Values already present in the environment are kept; the token is only
# prompted for when MLFLOW_TRACKING_PASSWORD is missing or empty.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Set the tracking URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Define Experiment Name
mlflow.set_experiment("House_Price_Prediction")

# NOTE: final_model, X_train, X_test, y_train and y_test are first assigned in
# a later cell (the train/test split cell), so on a fresh kernel this cell
# raises NameError unless that cell has been run first.
with mlflow.start_run():

    # Train the model
    final_model.fit(X_train, y_train)

    # Make predictions
    y_pred = final_model.predict(X_test)

    # Calculate Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Log Parameters (must match the values used for train_test_split)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("R2 Score", r2)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # Log Model in DagsHub
    model_info = mlflow.sklearn.log_model(final_model, "model")

    # Register the model in MLflow
    model_uri = model_info.model_uri
    registered_model = mlflow.register_model(model_uri, "HousePriceModel")

    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")

print("Training logged and model registered in MLflow on DagsHub!")

In [4]:
import pandas as pd
# Load the dataset from 'final.csv' (path is relative to the working directory).
df1 = pd.read_csv('final.csv')

In [5]:
# Show the frame read from final.csv as the cell output.
df1

In [6]:
import pandas as pd
import os

# `df` is a second name for the same object as `df1` (no copy is made), so
# in-place edits made through `df` are also visible through `df1`.
df = df1

# Output location for the inventory listing.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
folder_path = "C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/app/inventory_ids"
os.makedirs(folder_path, exist_ok=True)
file_path = os.path.join(folder_path, "unique_inventory.csv")

# One row per distinct (inventory_id, Capacity) pair, sorted by both columns.
key_columns = ['inventory_id', 'Capacity']
unique_inventory = df[key_columns].drop_duplicates().sort_values(by=key_columns)

unique_inventory.to_csv(file_path, index=False)

In [7]:
# Show the de-duplicated (inventory_id, Capacity) pairs as the cell output.
unique_inventory

In [8]:
# Number of distinct Capacity values recorded for each inventory_id.
capacity_counts = unique_inventory.groupby('inventory_id')['Capacity'].nunique()

# Keep only the ids that appear with more than one capacity.
multi_capacity_inventory = capacity_counts[capacity_counts > 1]

# Print details
if multi_capacity_inventory.empty:
    print("No inventory IDs have multiple capacities.")
else:
    print("Inventory IDs with multiple capacities:")
    for inventory_id in multi_capacity_inventory.index:
        id_rows = unique_inventory['inventory_id'] == inventory_id
        capacities = unique_inventory[id_rows]['Capacity'].tolist()
        print(f"Inventory ID: {inventory_id}, Capacities: {capacities}")

In [9]:
# Write every distinct Micromarket value to a text file, one value per line.
# NOTE: if 'Micromarket' is one of the non-numeric columns, run this before the
# label-encoding cell; that cell overwrites such columns in `df` with codes.
unique_values = df['Micromarket'].unique()

# The encoding is stated explicitly so the file content does not depend on the
# platform default; readers of this file should open it as UTF-8 as well.
with open("app/unique_micromarkets.txt", "w", encoding="utf-8") as f:
    for value in unique_values:
        f.write(str(value) + "\n")

In [10]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Split the column names of df1 by dtype: numeric vs. everything else.
# NOTE: the result depends on the current dtypes of df1, so re-running this
# after the label-encoding cell can change which columns land in each list.
numeric_columns = df1.select_dtypes(include=[np.number]).columns
non_numeric_columns = df1.select_dtypes(exclude=[np.number]).columns

numerical_features = numeric_columns.tolist()
categorical_features = non_numeric_columns.tolist()

In [11]:
# One LabelEncoder per non-numeric column, keyed by column name.
encoders = {col: LabelEncoder() for col in categorical_features}

# Fit each encoder on its column and overwrite the column with the integer
# codes. This edits `df` in place, so the cell is not idempotent: a second run
# would fit the encoders on the codes rather than on the original values.
for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

In [12]:
# Show the list of non-numeric column names computed from df1.
categorical_features

In [13]:
# Show the list of numeric column names computed from df1.
numerical_features

In [14]:
import pickle
import os
import re


def clean_filename(filename):
    """Return `filename` made safe for use as a file name.

    The characters < > : " / \\ | ? * are dropped, then every space is
    replaced by an underscore. All other characters are left untouched.
    """
    without_reserved = re.sub(r'[<>:"/\\|?*]', '', filename)
    return without_reserved.replace(" ", "_")

# Directory that receives one pickle file per fitted encoder.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
encoder_dir = "C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/app/label_encoders_full"
os.makedirs(encoder_dir, exist_ok=True)

# File name pattern: <sanitised column name>_encoder.pkl
for col, encoder in encoders.items():
    encoder_path = f"{encoder_dir}/{clean_filename(col)}_encoder.pkl"
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)

In [15]:
# Modelling frame: numeric columns first, then the (encoded) non-numeric ones.
model_columns = [*numerical_features, *categorical_features]
final_df = df1[model_columns]

In [16]:
# Print the pandas summary (columns, non-null counts, dtypes) of final_df.
final_df.info()

In [17]:
# Columns of final_df in which every single value is null.
all_null_mask = final_df.isnull().all()
empty_columns = all_null_mask[all_null_mask].index.tolist()
print(empty_columns)

In [18]:
# Null count per column of final_df.
null_flags = final_df.isnull()
missing_values = null_flags.sum()
print(missing_values)

In [19]:
# Rows of final_df whose 'price' value is null.
price_is_missing = final_df['price'].isnull()
missing_price_rows = final_df[price_is_missing]
print(missing_price_rows)

In [20]:
import pandas as pd
from pycaret.regression import *


# Configure the PyCaret regression experiment on final_df with 'price' as the
# target. `setup` and `compare_models` come from the wildcard
# `from pycaret.regression import *` at the top of this cell.
reg = setup(
    data=final_df,
    target='price',
    session_id=123,
    normalize=True,
)

# Keep whatever compare_models() returns as the candidate model and print it.
best_model = compare_models()
print(best_model)

In [21]:
# Show the column labels of final_df.
final_df.columns

In [22]:
# Show the object returned by compare_models().
best_model

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Model object produced by PyCaret's finalize_model for the selected candidate.
final_model = finalize_model(best_model)

# From here on `df` refers to the modelling frame.
df = final_df
target = 'price'

# Features are every column except the target.
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training part, then predict the held-out part.
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# RMSE is derived from MSE so the two reported values stay consistent.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

In [24]:
import pickle

# Serialise the fitted model with pickle (path is relative to the working directory).
model_path = "app/final_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(final_model, model_file)

print("Model saved as final_model.pkl")

In [25]:
# Show the column labels of final_df (same check as in the earlier cell).
final_df.columns

In [26]:
import mlflow
import mlflow.sklearn

# Record one MLflow run: refit, evaluate on the hold-out set, log everything.
# No tracking URI is set in this cell, so MLflow uses whatever is configured
# at the time the cell runs.
with mlflow.start_run():

    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    # Split settings (must match the values used for train_test_split).
    for param_name, param_value in (("test_size", 0.2), ("random_state", 42)):
        mlflow.log_param(param_name, param_value)

    # Evaluation metrics.
    for metric_name, metric_value in (("R2 Score", r2), ("MSE", mse), ("RMSE", rmse)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model under the "model" artifact path.
    mlflow.sklearn.log_model(final_model, "model")

    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")

print("Training logged in MLflow!")

In [27]:
import dagshub
# Same dagshub.init call as at the top of the notebook, issued again before
# the DagsHub-backed MLflow cells that follow.
dagshub.init(
    repo_owner='kaarthikkvishwa',
    repo_name='app',
    mlflow=True,
)

In [28]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; used to prompt for the token instead of hard-coding it

# Set your DagsHub MLflow Tracking Server
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Authenticate with DagsHub.
# SECURITY: an access token must never be stored in the notebook. The token
# that was previously hard-coded here should be considered leaked and revoked.
# Values already present in the environment are kept; the token is only
# prompted for when MLFLOW_TRACKING_PASSWORD is missing or empty.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Set the tracking URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Define Experiment Name
mlflow.set_experiment("House_Price_Prediction")

with mlflow.start_run():

    # Train the model
    final_model.fit(X_train, y_train)

    # Make predictions
    y_pred = final_model.predict(X_test)

    # Calculate Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Log Parameters (must match the values used for train_test_split)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("R2 Score", r2)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # Log Model in DagsHub
    model_info = mlflow.sklearn.log_model(final_model, "model")

    # Register the model in MLflow (every run of this cell adds a new version)
    model_uri = model_info.model_uri
    registered_model = mlflow.register_model(model_uri, "HousePriceModel")

    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")

print("Training logged and model registered in MLflow on DagsHub!")

In [29]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; used to prompt for the token instead of hard-coding it

# Set DagsHub Tracking URI
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Authenticate with DagsHub using Access Token.
# SECURITY: an access token must never be stored in the notebook. The token
# that was previously hard-coded here should be considered leaked and revoked.
# Values already present in the environment are kept; the token is only
# prompted for when MLFLOW_TRACKING_PASSWORD is missing or empty.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Set MLflow tracking server
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Define Experiment Name
mlflow.set_experiment("House_Price_Prediction")

# Start MLflow Run
with mlflow.start_run():
    # Train the model
    final_model.fit(X_train, y_train)

    # Make predictions
    y_pred = final_model.predict(X_test)

    # Compute Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Log Parameters (must match the values used for train_test_split)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("R2 Score", r2)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # Log Model to MLflow
    model_info = mlflow.sklearn.log_model(final_model, "model")

    print(f"Model logged with R² Score: {r2:.4f}")

# Model Name in MLflow
model_name = "HousePriceModel"

# Collect the already-registered versions BEFORE registering the new model.
# Registering first (as this cell used to do) puts the new model into the list
# it is compared against, so it could never beat "the best previous model",
# and a better model ended up registered twice.
# search_model_versions is used so that every version is considered, not only
# the newest version of each stage.
client = mlflow.tracking.MlflowClient()
latest_versions = client.search_model_versions(f"name='{model_name}'")

best_r2 = -float("inf")
best_version = None

for version in latest_versions:
    if not version.run_id:
        # No source run recorded for this version, so no metric to compare.
        continue
    metrics = client.get_run(version.run_id).data.metrics
    if "R2 Score" in metrics and metrics["R2 Score"] > best_r2:
        best_r2 = metrics["R2 Score"]
        best_version = version.version

print(f"Best previous model R² Score: {best_r2:.4f}")

if r2 > best_r2:
    print(f"New model is better (R²: {r2:.4f}), updating the registry.")
    # A registration failure now propagates as MlflowException instead of
    # being reduced to a printed message.
    latest_version = mlflow.register_model(model_info.model_uri, model_name)
    print(f"New model registered as version {latest_version.version}")
else:
    print(f"New model is worse (R²: {r2:.4f}), keeping version {best_version}.")

In [30]:
from mlflow.tracking import MlflowClient

model_name = "HousePriceModel"
client = MlflowClient()

# Get registered model details and print one summary line per returned version.
versions = client.get_latest_versions(model_name)

for v in versions:
    summary = f"Version: {v.version}, Stage: {v.current_stage}, Run ID: {v.run_id}"
    print(summary)

In [31]:
from mlflow.tracking import MlflowClient

model_name = "HousePriceModel"
client = MlflowClient()

# Delete the incorrect version (Example: Version 2).
# NOTE(review): destructive and tied to a specific version number - confirm
# the number against the version listing before re-running this cell.
client.delete_model_version(
    name=model_name,
    version=2,
)

print("Deleted incorrectly registered model version.")

In [32]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; used to prompt for the token instead of hard-coding it

# Set DagsHub Tracking URI
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Authenticate with DagsHub using Access Token.
# SECURITY: an access token must never be stored in the notebook. The token
# that was previously hard-coded here should be considered leaked and revoked.
# Values already present in the environment are kept; the token is only
# prompted for when MLFLOW_TRACKING_PASSWORD is missing or empty.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Set MLflow tracking server
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Define Experiment Name
mlflow.set_experiment("House_Price_Prediction")

# Start MLflow Run
with mlflow.start_run() as run:
    # Train the model
    final_model.fit(X_train, y_train)

    # Make predictions
    y_pred = final_model.predict(X_test)

    # Compute Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Log Parameters (must match the values used for train_test_split)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("R2 Score", r2)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # Log Model
    model_info = mlflow.sklearn.log_model(final_model, "model")

    print(f"Model logged with R² Score: {r2:.4f}")

# Model Name in MLflow
model_name = "HousePriceModel"

# Fetch the Best Previous Model.
# search_model_versions is used so that every registered version is compared,
# not only the newest version of each stage.
client = mlflow.tracking.MlflowClient()
latest_versions = client.search_model_versions(f"name='{model_name}'")

best_r2 = -float("inf")
best_version = None

for version in latest_versions:
    if version.run_id:  # Ensure run_id is valid
        metrics = client.get_run(version.run_id).data.metrics
        if "R2 Score" in metrics and metrics["R2 Score"] > best_r2:
            best_r2 = metrics["R2 Score"]
            best_version = version.version

print(f"Best previous model R² Score: {best_r2:.4f}")

# Register the new model only if it is strictly better than every earlier one.
if r2 > best_r2:
    print(f"New model is better (R²: {r2:.4f}), updating the registry.")

    registered_model = mlflow.register_model(model_info.model_uri, model_name)

    print(f"Model registered as version {registered_model.version}")

    # Optionally, move the best version to "Production"
    client.transition_model_version_stage(name=model_name, version=registered_model.version, stage="Production")

else:
    print(f"New model is worse (R²: {r2:.4f}), keeping version {best_version}.")

In [33]:
from mlflow.tracking import MlflowClient

model_name = "HousePriceModel"
client = MlflowClient()

# Get registered model details and print one summary line per returned version.
versions = client.get_latest_versions(model_name)

for v in versions:
    summary = f"Version: {v.version}, Stage: {v.current_stage}, Run ID: {v.run_id}"
    print(summary)

In [34]:
import mlflow
import mlflow.sklearn
import os
from sklearn.metrics import r2_score, mean_squared_error

import getpass  # stdlib; used to prompt for the token instead of hard-coding it

# Set DagsHub Tracking URI
MLFLOW_TRACKING_URI = "https://dagshub.com/kaarthikkvishwa/app.mlflow"

# Authenticate with DagsHub using Access Token.
# SECURITY: an access token must never be stored in the notebook. The token
# that was previously hard-coded here should be considered leaked and revoked.
# Values already present in the environment are kept; the token is only
# prompted for when MLFLOW_TRACKING_PASSWORD is missing or empty.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaarthikkvishwa")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

# Set MLflow tracking server
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Define Experiment Name
mlflow.set_experiment("House_Price_Prediction")

# Start MLflow Run
with mlflow.start_run() as run:
    # Train the model
    final_model.fit(X_train, y_train)

    # Make predictions
    y_pred = final_model.predict(X_test)

    # Compute Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    # Log Parameters (must match the values used for train_test_split)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("R2 Score", r2)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # Log Model
    model_info = mlflow.sklearn.log_model(final_model, "model")

    print(f"Model logged with R² Score: {r2:.4f}")

# Model Name in MLflow
model_name = "HousePriceModel"

# Fetch the Best Previous Model.
# search_model_versions is used so that every registered version is compared,
# not only the newest version of each stage.
client = mlflow.tracking.MlflowClient()
latest_versions = client.search_model_versions(f"name='{model_name}'")

best_r2 = -float("inf")
best_version = None

for version in latest_versions:
    if version.run_id:  # Ensure run_id is valid
        metrics = client.get_run(version.run_id).data.metrics
        if "R2 Score" in metrics and metrics["R2 Score"] > best_r2:
            best_r2 = metrics["R2 Score"]
            best_version = version.version

print(f"Best previous model R² Score: {best_r2:.4f}")

# Register the new model only if it is strictly better than every earlier one.
if r2 > best_r2:
    print(f"New model is better (R²: {r2:.4f}), updating the registry.")

    registered_model = mlflow.register_model(model_info.model_uri, model_name)

    print(f"Model registered as version {registered_model.version}")

    # Optionally, move the best version to "Production"
    client.transition_model_version_stage(name=model_name, version=registered_model.version, stage="Production")

else:
    print(f"New model is worse (R²: {r2:.4f}), keeping version {best_version}.")

In [35]:
import os

import getpass
import subprocess


def _run_dvc(*args):
    """Run ``dvc <args>`` without a shell and return its exit code.

    Arguments are passed as a list, so paths and credentials are never
    re-parsed by a shell. A failure is reported but not raised, which keeps
    the best-effort behaviour of the ``os.system`` calls this replaces. Only
    the sub-command name is printed so that credentials never reach the
    cell output.
    """
    try:
        completed = subprocess.run(["dvc", *args])
    except FileNotFoundError:
        print("dvc executable not found on PATH")
        return 127
    if completed.returncode != 0:
        print(f"dvc {args[0]} exited with code {completed.returncode}")
    return completed.returncode


# Credentials come from the environment, or are prompted for, instead of the
# literal YOUR_USERNAME / YOUR_DAGSHUB_TOKEN placeholders.
dagshub_user = os.environ.get("DAGSHUB_USERNAME") or input("DagsHub username: ")
dagshub_token = os.environ.get("DAGSHUB_TOKEN") or getpass.getpass("DagsHub access token: ")

_run_dvc("init")

dagshub_repo_url = "https://dagshub.com/kaarthikkvishwa/app.dvc"  # Replace with your repo
_run_dvc("remote", "add", "origin", dagshub_repo_url)
# --local keeps the auth settings out of the committed DVC config.
_run_dvc("remote", "modify", "origin", "--local", "auth", "basic")
_run_dvc("remote", "modify", "origin", "--local", "user", dagshub_user)
_run_dvc("remote", "modify", "origin", "--local", "password", dagshub_token)

# NOTE(review): absolute, machine-specific path - consider making it configurable.
dataset_path = "C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/Awfis_Data.csv"
_run_dvc("add", dataset_path)

_run_dvc("push")

In [36]:
from dagshub.upload import Repo
# Upload the local CSV to the 'kaarthikkvishwa/app' repository with DVC versioning.
# NOTE(review): this cell and the next two upload the same file with three
# different remote_path values - confirm which destination is intended.
repo = Repo('kaarthikkvishwa', 'app')
repo.upload(
    local_path='C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/Awfis_Data.csv',
    remote_path='s3:/app',
    versioning='dvc',
)

In [37]:
from dagshub.upload import Repo
# Upload the local CSV again, this time with a remote_path that names the file.
repo = Repo('kaarthikkvishwa', 'app')
repo.upload(
    local_path='C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/Awfis_Data.csv',
    remote_path='s3:/Awfis_Data.csv',
    versioning='dvc',
)

In [38]:
from dagshub.upload import Repo

# Handle for the 'kaarthikkvishwa/app' repository.
repo = Repo('kaarthikkvishwa', 'app')

# Source file on disk and the destination used for this attempt (a "dataset"
# folder, per the remote_path value).
local_csv_path = 'C:/Users/DS/Desktop/shellkode/MEETING_PRICE_PREDICTION/Awfis_Data.csv'
remote_csv_path = 's3://dataset/Awfis_Data.csv'

# Upload with versioning='dvc'.
repo.upload(
    local_path=local_csv_path,
    remote_path=remote_csv_path,
    versioning='dvc',
)

In [39]:
from dagshub.notebook import save_notebook

# Save this notebook to the 'kaarthikkvishwa/app' repository.
# NOTE(review): the path value looks malformed - "C:Users" has no slash after
# the drive letter, and it reads like a local Windows path. Confirm whether
# save_notebook expects a destination inside the repository (for example
# research/model_training.ipynb) and correct the value accordingly.
save_notebook(
    repo="kaarthikkvishwa/app",
    path="C:Users/DS/Desktop/shellkode/ci-cd-mpp/research/model_training.ipynb",
)