In [1]:
# Import Packages

import os,mlflow
import pandas as pd
import numpy as np
from sklearn import model_selection,preprocessing,datasets,metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# 2. Data Preparation
# Fetch the breast-cancer dataset in pandas form (as_frame=True) and pull out
# the pieces the rest of the notebook works with.
data = datasets.load_breast_cancer(as_frame=True)
features, label = data.data, data.target   # model inputs / target column
label_names = data.target_names            # names for the target classes
df = data.frame                            # inputs and target together, used for EDA

In [3]:
# 3. EDA
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()
# Summary statistics of every numeric column
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [4]:
# Data quality check: missing values per column, then fully duplicated rows
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print(duplicate_row_count)

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64
0


In [5]:
# 4. Data Splitting
# 80 % of the rows go to training, the remainder to testing; the shuffle is
# seeded so the same split is produced on every run.
SEED = 42
split_options = dict(train_size=0.8, shuffle=True, random_state=SEED)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    features,
    label,
    **split_options,
)

In [6]:
# 5. Feature Scaling
# The scaler is fitted on the training split only; the test split is then
# transformed with those same fitted statistics.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(x_train)
x_train = standard_scaler.transform(x_train)
x_test = standard_scaler.transform(x_test)

In [7]:
# 6. Model Development -- tracked with MLflow
# Show where the kernel is currently running from
current_dir = os.getcwd()
print(current_dir)

c:\Users\MSFRotPC04\Desktop\CIAST_ML_Siri 2\ml-project-1\notebooks


In [8]:
# Change the current working directory to the base project folder.
# A bare os.chdir("..") climbs one level every time the cell is executed, so
# re-running it would leave the project. Only step up while we are still
# inside the notebooks folder, which makes the cell safe to re-run.
# NOTE(review): assumes the notebook lives in a folder named "notebooks"
# (as in the path printed by the previous cell) -- adjust if it is renamed.
NOTEBOOKS_DIRNAME = "notebooks"
if os.path.basename(os.getcwd()) == NOTEBOOKS_DIRNAME:
    os.chdir("..")
print(os.getcwd())

c:\Users\MSFRotPC04\Desktop\CIAST_ML_Siri 2\ml-project-1


In [10]:
# Fallback: if the relative chdir did not end up in the project folder, switch to it with an absolute path instead
#os.chdir(r"C:\Users\MSFRotPC07\Desktop\ciast_ml_master\ml-project-1")
#print(os.getcwd())

In [9]:
# Create the MLflow experiment on first use and reuse it afterwards.
# mlflow.create_experiment raises when the name is already taken, which made
# this cell fail on every execution after the first one.
EXPERIMENT_NAME = "breast_cancer_experiment"
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = existing_experiment.experiment_id
# Last expression: echo the experiment id as the cell output
experiment_id


'830186232148615788'

In [11]:
# Make this experiment the target of all subsequent runs
mlflow.set_experiment("breast_cancer_experiment")

# Tag the experiment with the MLflow version actually in use. The key was
# previously misspelled ("mlfow_version") and the value was hard-coded, so it
# could silently disagree with the installed package.
mlflow.set_experiment_tag("mlflow_version", mlflow.__version__)

In [12]:
# Train an RBF-kernel SVC inside one MLflow run, then record its test-set
# scores together with the kernel that produced them.
with mlflow.start_run():
    # Model setup and training (fit returns the fitted estimator)
    svc_kernel = 'rbf'
    model = SVC(kernel=svc_kernel).fit(x_train, y_train)

    # Evaluate on the held-out split
    predictions = model.predict(x_test)
    accuracy, precision, recall, f1 = (
        score_fn(y_test, predictions)
        for score_fn in (
            metrics.accuracy_score,
            metrics.precision_score,
            metrics.recall_score,
            metrics.f1_score,
        )
    )

    # (A) Metrics of this run
    test_scores = dict(zip(
        ('Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1'),
        (accuracy, precision, recall, f1),
    ))
    mlflow.log_metrics(test_scores)
    # (B) Parameter used for this run
    mlflow.log_param('kernel', svc_kernel)

    
    


In [13]:
# Try the other SVC kernels, logging each one as its own MLflow run
kernel_list = ['linear', 'poly', 'sigmoid']

for kernel in kernel_list:
    with mlflow.start_run():
        # Fit an SVC using the kernel of this iteration
        model = SVC(kernel=kernel)
        model.fit(x_train, y_train)

        # Evaluate on the held-out split
        predictions = model.predict(x_test)
        scored = (y_test, predictions)
        accuracy = metrics.accuracy_score(*scored)
        precision = metrics.precision_score(*scored)
        recall = metrics.recall_score(*scored)
        f1 = metrics.f1_score(*scored)

        # (A) Metrics of this run
        run_metrics = {
            'Test Accuracy': accuracy,
            'Test Precision': precision,
            'Test Recall': recall,
            'Test F1': f1,
        }
        mlflow.log_metrics(run_metrics)
        # (B) Parameter used for this run
        mlflow.log_param('kernel', kernel)

In [18]:
# Use autologging instead of manual logging for this run.
# Only the training step happens here; no test metrics are logged by hand.
with mlflow.start_run(run_name="decision_tree_1"):
    # Enable autologging before fit() so the training call is captured
    mlflow.autolog()
    # Train the model. random_state is pinned to the notebook-wide SEED so
    # the fitted tree (and therefore the logged run) is reproducible.
    model = DecisionTreeClassifier(random_state=SEED)
    model.fit(x_train, y_train.values)


2024/08/08 10:30:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [15]:
# Echo the object currently bound to `model` as this cell's output
model

In [16]:
# Decision tree run with autologging for training plus manually logged
# test-set metrics. The labels are passed as the pandas objects returned by
# the split (no .values).
with mlflow.start_run(run_name="decision_tree_2"):
    # Enable autologging before fit() so the training call is captured
    mlflow.autolog()
    # Train the model. random_state is pinned to the notebook-wide SEED so
    # the fitted tree and its scores are reproducible across executions.
    model = DecisionTreeClassifier(random_state=SEED)
    model.fit(x_train, y_train)
    # Model Evaluation
    predictions = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    precision = metrics.precision_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    f1 = metrics.f1_score(y_test, predictions)
    # Logging
    # (A) Log the metrics
    mlflow.log_metrics({
        'Test Accuracy': accuracy,
        'Test Precision': precision,
        'Test Recall': recall,
        'Test F1': f1
    })

2024/08/08 10:14:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [17]:
# Decision tree run with autologging for training plus manually logged
# test-set metrics. Here the labels are passed as plain arrays (.values) to
# both fit() and the metric functions.
with mlflow.start_run(run_name="decision_tree_3"):
    # Enable autologging before fit() so the training call is captured
    mlflow.autolog()
    # Train the model. random_state is pinned to the notebook-wide SEED so
    # the fitted tree and its scores are reproducible across executions.
    model = DecisionTreeClassifier(random_state=SEED)
    model.fit(x_train, y_train.values)
    # Model Evaluation
    predictions = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test.values, predictions)
    precision = metrics.precision_score(y_test.values, predictions)
    recall = metrics.recall_score(y_test.values, predictions)
    f1 = metrics.f1_score(y_test.values, predictions)
    # Logging
    # (A) Log the metrics
    mlflow.log_metrics({
        'Test Accuracy': accuracy,
        'Test Precision': precision,
        'Test Recall': recall,
        'Test F1': f1
    })

2024/08/08 10:23:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
