## Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset from Azure ML
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name="stroke-prediction-dataset")
df = dataset.to_pandas_dataframe()

# Drop irrelevant columns
if "id" in df.columns:
    df.drop(columns=["id"], inplace=True)

# Replace "N/A" and missing values properly
df.replace("N/A", None, inplace=True)  # Ensure "N/A" is treated as None
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)  # Fill missing numerical values with mean
df.fillna(df.select_dtypes(include=['object']).mode().iloc[0], inplace=True)  # Fill missing categorical values with mode

# Encode categorical features
label_encoders = {}
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
for col in categorical_cols:
    df[col] = df[col].astype(str)  # Convert to string to avoid NAType issues
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize numerical columns
scaler = StandardScaler()
numerical_cols = ["age", "avg_glucose_level", "bmi"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split into train & test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Save preprocessed data
train.to_csv("train_data.csv", index=False)
test.to_csv("test_data.csv", index=False)

print("✅ Preprocessing complete. Train & Test datasets saved.")


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
✅ Preprocessing complete. Train & Test datasets saved.


In [2]:
print(train.head())
print(test.head())


      gender       age  hypertension  heart_disease  ever_married  work_type  \
802        1  1.582163             0              0             1          3   
3927       0  0.830297             0              0             1          2   
2337       0 -0.983025             0              0             0          2   
3910       1 -0.540751             0              0             1          0   
1886       0 -0.540751             0              0             0          2   

      Residence_type  avg_glucose_level       bmi  smoking_status  stroke  
802                0           0.143384 -0.054183               1       0  
3927               1          -0.393728  0.940938               0       0  
2337               0          -1.029783  0.609231               2       0  
3910               1          -0.893296  0.188219               1       0  
1886               0          -1.027354 -1.151367               2       0  
      gender       age  hypertension  heart_disease  ever_marri

## Training the model

In [4]:
!pip install catboost



In [6]:
import sys
!{sys.executable} -m pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-1.24.1-py3-none-any.whl.metadata (10.0 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Downloading plotly-6.0.0-py3-none-any.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading narwhals-1.24.1-py3-none-any.whl (309 kB)
Installing collected packages: narwhals, graphviz, plotly, catboost
Successfull

In [8]:
import catboost
print("CatBoost works.")


CatBoost works.


In [9]:
import pandas as pd
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed dataset
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")

# Define features & target variable
X_train = train.drop(columns=["stroke"])  # Features
y_train = train["stroke"]                 # Target
X_test = test.drop(columns=["stroke"])
y_test = test["stroke"]

# Identify categorical features (before encoding)
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

# Train the CatBoost model
model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, cat_features=categorical_cols, verbose=100)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"CatBoost Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


0:	learn: 0.5366213	total: 55.5ms	remaining: 27.7s
100:	learn: 0.1089053	total: 594ms	remaining: 2.35s
200:	learn: 0.0760920	total: 1.19s	remaining: 1.77s
300:	learn: 0.0556229	total: 2.02s	remaining: 1.34s
400:	learn: 0.0429990	total: 3s	remaining: 742ms
499:	learn: 0.0341632	total: 3.64s	remaining: 0us
CatBoost Accuracy: 0.9374
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



In [11]:
from azureml.core import Run
run = Run.get_context()

# Log accuracy to Azure ML
run.log("CatBoost Accuracy", accuracy)

# Save model
model.save_model("catboost_model.cbm")

# Upload to Azure ML
run.upload_file("outputs/catboost_model.cbm", "catboost_model.cbm")

print("✅ Model saved & logged in Azure ML")


Attempted to log scalar metric CatBoost Accuracy:
0.9373776908023483
Attempted to track file outputs/catboost_model.cbm at catboost_model.cbm
✅ Model saved & logged in Azure ML
