## Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset from Azure ML
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name="stroke-prediction-dataset")
df = dataset.to_pandas_dataframe()

# Drop irrelevant columns
if "id" in df.columns:
    df.drop(columns=["id"], inplace=True)

# Replace "N/A" and missing values properly
df.replace("N/A", None, inplace=True)  # Ensure "N/A" is treated as None
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)  # Fill missing numerical values with mean
df.fillna(df.select_dtypes(include=['object']).mode().iloc[0], inplace=True)  # Fill missing categorical values with mode

# Encode categorical features
label_encoders = {}
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
for col in categorical_cols:
    df[col] = df[col].astype(str)  # Convert to string to avoid NAType issues
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize numerical columns
scaler = StandardScaler()
numerical_cols = ["age", "avg_glucose_level", "bmi"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split into train & test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Save preprocessed data
train.to_csv("train_data.csv", index=False)
test.to_csv("test_data.csv", index=False)

print("✅ Preprocessing complete. Train & Test datasets saved.")


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
✅ Preprocessing complete. Train & Test datasets saved.


In [2]:
print(train.head())
print(test.head())


      gender       age  hypertension  heart_disease  ever_married  work_type  \
802        1  1.582163             0              0             1          3   
3927       0  0.830297             0              0             1          2   
2337       0 -0.983025             0              0             0          2   
3910       1 -0.540751             0              0             1          0   
1886       0 -0.540751             0              0             0          2   

      Residence_type  avg_glucose_level       bmi  smoking_status  stroke  
802                0           0.143384 -0.054183               1       0  
3927               1          -0.393728  0.940938               0       0  
2337               0          -1.029783  0.609231               2       0  
3910               1          -0.893296  0.188219               1       0  
1886               0          -1.027354 -1.151367               2       0  
      gender       age  hypertension  heart_disease  ever_marri

## Training the model

In [3]:
!pip install catboost



In [4]:
import sys
!{sys.executable} -m pip install catboost




In [5]:
import catboost
print("CatBoost works.")


CatBoost works.


In [6]:
import sys
!{sys.executable} -m pip install mlflow[azureml]




In [7]:
import sys
!{sys.executable} -m pip show mlflow

Name: mlflow
Version: 2.16.2
Summary: MLflow is an open source platform for the complete machine learning lifecycle
Home-page: https://mlflow.org
Author: 
Author-email: 
License: Copyright 2018 Databricks, Inc.  All rights reserved.
        
                                        Apache License
                                   Version 2.0, January 2004
                                http://www.apache.org/licenses/
        
           TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
        
           1. Definitions.
        
              "License" shall mean the terms and conditions for use, reproduction,
              and distribution as defined by Sections 1 through 9 of this document.
        
              "Licensor" shall mean the copyright owner or entity authorized by
              the copyright owner that is granting the License.
        
              "Legal Entity" shall mean the union of the acting entity and all
              other e

In [11]:
import mlflow
# import mlflow.azureml

print("✅ MLflow is working!")


✅ MLflow is working!


In [17]:
import mlflow
from azureml.core import Workspace

# Connect to Azure ML Workspace
ws = Workspace.from_config()

# Set MLflow Tracking URI (DO NOT use "azureml://mlflow")
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# Set experiment name
mlflow.set_experiment("catboost-thrombosis-prediction")

print("✅ MLflow is now connected to Azure ML!")


✅ MLflow is now connected to Azure ML!


In [18]:
import mlflow

with mlflow.start_run():
    mlflow.log_param("test_param", 123)
    mlflow.log_metric("test_accuracy", 0.95)

print("✅ MLflow test successful! Check Azure ML Studio → Experiments.")


2025/01/31 20:46:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run clever_pillow_rp69zj6j at: https://westeurope.api.azureml.ms/mlflow/v2.0/subscriptions/a9e6b3ce-7394-4a87-9482-c314c87e5743/resourceGroups/mlops-simple-flow/providers/Microsoft.MachineLearningServices/workspaces/mlops-simple-flow/#/experiments/e6dd27e2-d888-4c3f-9d31-c4d93c893e96/runs/fe4dbcdc-962c-456c-9ec1-cf6d0e109e00.
2025/01/31 20:46:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://westeurope.api.azureml.ms/mlflow/v2.0/subscriptions/a9e6b3ce-7394-4a87-9482-c314c87e5743/resourceGroups/mlops-simple-flow/providers/Microsoft.MachineLearningServices/workspaces/mlops-simple-flow/#/experiments/e6dd27e2-d888-4c3f-9d31-c4d93c893e96.


✅ MLflow test successful! Check Azure ML Studio → Experiments.


In [19]:
import pandas as pd
import mlflow
import mlflow.catboost
from azureml.core import Workspace
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ✅ Step 1: Connect to Azure ML Workspace & Set MLflow Tracking
ws = Workspace.from_config()
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("catboost-thrombosis-prediction")  # Experiment name in Azure ML

print("✅ MLflow is now connected to Azure ML!")

# ✅ Step 2: Load Preprocessed Dataset
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")

# Define features & target variable
X_train = train.drop(columns=["stroke"])  # Features
y_train = train["stroke"]                 # Target
X_test = test.drop(columns=["stroke"])
y_test = test["stroke"]

# Identify categorical features
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

# ✅ Step 3: Start MLflow Run & Train CatBoost Model
with mlflow.start_run():
    # Log hyperparameters
    mlflow.log_param("iterations", 500)
    mlflow.log_param("learning_rate", 0.1)
    mlflow.log_param("depth", 6)

    # Train the CatBoost model
    model = CatBoostClassifier(
        iterations=500, 
        learning_rate=0.1, 
        depth=6, 
        cat_features=categorical_cols, 
        verbose=100
    )
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"✅ CatBoost Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Save the trained model locally
    model.save_model("catboost_model.cbm")

    # Log the model to MLflow
    mlflow.catboost.log_model(model, "catboost_model")

print("✅ Model training and logging to MLflow complete!")


✅ MLflow is now connected to Azure ML!
0:	learn: 0.5366213	total: 56.5ms	remaining: 28.2s
100:	learn: 0.1089053	total: 487ms	remaining: 1.92s
200:	learn: 0.0760920	total: 963ms	remaining: 1.43s
300:	learn: 0.0556229	total: 1.44s	remaining: 953ms
400:	learn: 0.0429990	total: 1.93s	remaining: 475ms
499:	learn: 0.0341632	total: 2.4s	remaining: 0us
✅ CatBoost Accuracy: 0.9374
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022

✅ Model training and logging to MLflow complete!


2025/01/31 20:47:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run dreamy_leather_33zmlyxv at: https://westeurope.api.azureml.ms/mlflow/v2.0/subscriptions/a9e6b3ce-7394-4a87-9482-c314c87e5743/resourceGroups/mlops-simple-flow/providers/Microsoft.MachineLearningServices/workspaces/mlops-simple-flow/#/experiments/e6dd27e2-d888-4c3f-9d31-c4d93c893e96/runs/8b7107fa-04ec-49a5-bc6b-ec2fc0e29267.
2025/01/31 20:47:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://westeurope.api.azureml.ms/mlflow/v2.0/subscriptions/a9e6b3ce-7394-4a87-9482-c314c87e5743/resourceGroups/mlops-simple-flow/providers/Microsoft.MachineLearningServices/workspaces/mlops-simple-flow/#/experiments/e6dd27e2-d888-4c3f-9d31-c4d93c893e96.
