**Creating and Logging an Experiment**

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

**Setting the tracking URI for MLflow to a local server running on port 5000.**

In [6]:
mlflow.set_tracking_uri("http://localhost:5000")

**Simulate a dataset**

In [7]:
X = np.random.rand(100, 1) * 10 # Random data
y = 2.5 * X.flatten() + np.random.randn(100) * 2 # Linear relationship with noise
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Train and track the model**

In [8]:
with mlflow.start_run():
 # Define and train the model
 model = LinearRegression()
 model.fit(X_train, y_train)
 # Log parameters
 mlflow.log_param("fit_intercept", model.fit_intercept)
 
 # Predict and log metrics
 y_pred = model.predict(X_test)
 rmse = mean_squared_error(y_test, y_pred, squared=False)
 mlflow.log_metric("rmse", rmse)
 
 # Log the model
 mlflow.sklearn.log_model(model, "linear_model")



🏃 View run marvelous-ant-542 at: http://localhost:5000/#/experiments/0/runs/71697b257854461c85be74c66d5ecc3b
🧪 View experiment at: http://localhost:5000/#/experiments/0


**Viewing / Comparing Results**

In [9]:
with mlflow.start_run():
 model = LinearRegression(fit_intercept=False)
 model.fit(X_train, y_train)
 y_pred = model.predict(X_test)
 rmse = mean_squared_error(y_test, y_pred, squared=False)
 
 mlflow.log_param("fit_intercept", model.fit_intercept)
 mlflow.log_metric("rmse", rmse)
 mlflow.sklearn.log_model(model, "linear_model_no_intercept")




🏃 View run sassy-dolphin-571 at: http://localhost:5000/#/experiments/0/runs/0c860768a5a84798b8ea5b0b62050df5
🧪 View experiment at: http://localhost:5000/#/experiments/0


In [10]:
trainData = pd.read_csv('data/trainingData.csv')
testData = pd.read_csv('data/validationData.csv')

**Preprocessing the dataset**

In [11]:
# Handle missing values
trainData.fillna(trainData.mean(), inplace=True)
testData.fillna(testData.mean(), inplace=True)

**Separating features and targets**

In [12]:
X_train = trainData.drop(['FLOOR', 'BUILDINGID', 'USERID','PHONEID','TIMESTAMP'], axis=1)
y1_train = trainData['FLOOR']
y2_train = trainData['BUILDINGID']

X_test = testData.drop(['FLOOR', 'BUILDINGID', 'USERID','PHONEID','TIMESTAMP'], axis=1)
y1_test = testData['FLOOR']
y2_test = testData['BUILDINGID']

**Scaling the features**

In [13]:
# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

**Training model and evaluating it**

In [14]:
def train_and_evaluate(X_train, X_test, y_train, y_test, model, run_name):
    with mlflow.start_run(run_name=run_name):
        # Train the model
        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)

        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Log parameters, metrics, and model
        mlflow.log_param("model", "RandomForest")
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())
        mlflow.log_metric("accuracy", accuracy)

        # Save and log confusion matrix as a CSV
        np.savetxt("confusion_matrix.csv", conf_matrix, delimiter=",")
        mlflow.log_artifact("confusion_matrix.csv")

        # Visualize and log confusion matrix as an image
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.tight_layout()
        plt.savefig("confusion_matrix.png")
        plt.close()
        mlflow.log_artifact("confusion_matrix.png")

        # Create an input example
        input_example = pd.DataFrame(X_test[:5], columns=[f"feature_{i}" for i in range(X_test.shape[1])])

        # Log the trained model with input example
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="RandomForest",
            input_example=input_example,
            registered_model_name=None,  # Set a name if needed
        )

        print(f"Run: {run_name} - Accuracy: {accuracy}")
        return accuracy, conf_matrix


**Random Forest Classifier with default parameters**

In [15]:
# Run 1 - Default hyperparameters
rf_model_default = RandomForestClassifier(random_state=42)
accuracy_default, conf_matrix_default = train_and_evaluate(X_train, X_test, y1_train, y1_test, rf_model_default, "RFC Default Hyperparameters")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1115.08it/s]


Run: RFC Default Hyperparameters - Accuracy: 0.7632763276327633
🏃 View run RFC Default Hyperparameters at: http://localhost:5000/#/experiments/0/runs/292ffaa83a2a4ea5b5782373d41bd7e4
🧪 View experiment at: http://localhost:5000/#/experiments/0


**Random Forest Classifier with hyperparameters**

In [16]:
# Run 2 - Tuned hyperparameters
rf_model_tuned = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
accuracy_tuned, conf_matrix_tuned = train_and_evaluate(X_train, X_test, y1_train, y1_test, rf_model_tuned, "RFC Tuned Hyperparameters")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 547.12it/s]


Run: RFC Tuned Hyperparameters - Accuracy: 0.7398739873987399
🏃 View run RFC Tuned Hyperparameters at: http://localhost:5000/#/experiments/0/runs/d09c313fbdf84c938cc415adb5e4c163
🧪 View experiment at: http://localhost:5000/#/experiments/0


**XGBoost Classifier with default parameters**

In [17]:
import xgboost as xgb

# Run 1 - Default hyperparameters
xgb_model_default = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="mlogloss")
accuracy_default, conf_matrix_default = train_and_evaluate(
    X_train, X_test, y2_train, y2_test, xgb_model_default, "XGB Default Hyperparameters"
)


Parameters: { "use_label_encoder" } are not used.

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.30it/s]


Run: XGB Default Hyperparameters - Accuracy: 0.630963096309631
🏃 View run XGB Default Hyperparameters at: http://localhost:5000/#/experiments/0/runs/898359e68c8a452b988a42af1f740bea
🧪 View experiment at: http://localhost:5000/#/experiments/0


**XGBoost Classifier with hyperparameters**

In [18]:
# Run 2 - Tuned hyperparameters
xgb_model_tuned = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss"
)
accuracy_tuned, conf_matrix_tuned = train_and_evaluate(
    X_train, X_test, y2_train, y2_test, xgb_model_tuned, "XGB Tuned Hyperparameters"
)

Parameters: { "use_label_encoder" } are not used.

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1750.86it/s]


Run: XGB Tuned Hyperparameters - Accuracy: 0.630963096309631
🏃 View run XGB Tuned Hyperparameters at: http://localhost:5000/#/experiments/0/runs/af178c2c836e493b9dd7c3c5097e0add
🧪 View experiment at: http://localhost:5000/#/experiments/0


**BONUS**

In [19]:
from scipy.stats import entropy

In [20]:
# Add data drift by introducing noise to WAP signal strengths
def introduce_drift(data, noise_percentage=0.1):
    drifted_data = data.copy()
    num_drifted_rows = int(len(data) * noise_percentage)
    drift_indices = np.random.choice(data.index, num_drifted_rows, replace=False)

    for col in [col for col in data.columns if col.startswith("WAP")]:
        noise = np.random.randint(-5, 6, num_drifted_rows)
        drifted_data.loc[drift_indices, col] += noise

    return drifted_data

In [21]:
# Compute KL divergence between two distributions
def compute_kl_divergence(original, drifted):
    kl_metrics = {}
    for col in original.columns:
        if col.startswith("WAP"):
            p, _ = np.histogram(original[col], bins=50, density=True)
            q, _ = np.histogram(drifted[col], bins=50, density=True)
            p += 1e-10  # To avoid division by zero
            q += 1e-10
            kl_metrics[col] = entropy(p, q)
    return kl_metrics

In [28]:
original_data = pd.read_csv("data/trainingData.csv")
# Introduce drift (adding random noise to 10% of the records)
drifted_data = introduce_drift(original_data, noise_percentage=0.1)
# Compute KL divergence
kl_metrics = compute_kl_divergence(original_data, drifted_data)
# Log results in MLflow
mlflow.set_experiment("data_drift_experiment")
with mlflow.start_run(run_name="data_drift_run"):
    for col, kl_value in kl_metrics.items():
        mlflow.log_metric(col, kl_value)
print("KL divergence metrics logged in MLflow.")


2024/12/13 11:43:09 INFO mlflow.tracking.fluent: Experiment with name 'data_drift_experiment' does not exist. Creating a new experiment.


🏃 View run data_drift_run at: http://localhost:5000/#/experiments/317304398516309058/runs/e0d84ca1ddb14b1b82a05343222227c8
🧪 View experiment at: http://localhost:5000/#/experiments/317304398516309058
KL divergence metrics logged in MLflow.
