# ML Pipeline using scikit-learn

In [0]:
# Select the session's current catalog and schema. `spark` is not defined in
# this notebook; it is presumably the SparkSession injected by the notebook
# runtime.
spark.sql("USE CATALOG levkiwi_lakehouse")
spark.sql("USE SCHEMA ml_sandbox")

In [0]:
from pyspark.sql.functions import col

# Read the training CSV (first row is the header) and let Spark infer column types
data_path = "/Volumes/levkiwi_lakehouse/ml_sandbox/data/train.csv"
train_df = spark.read.csv(data_path, header=True, inferSchema=True)

# Target Spark SQL type for each column that is re-cast: the passenger
# identifier is kept as a string, while the flag columns and the label
# (presumably inferred as Boolean from the CSV) become integers.
column_casts = {
    "PassengerId": "string",
    "VIP": "int",
    "CryoSleep": "int",
    "Transported": "int",
}
for column_name, target_type in column_casts.items():
    train_df = train_df.withColumn(column_name, col(column_name).cast(target_type))

display(train_df)

## Pandas & scikit-learn pipeline

In [0]:
import pandas as pd
# Convert the Spark DataFrame into an in-memory pandas DataFrame so it can be
# fed to scikit-learn in the cells below.
train = train_df.toPandas()

# Preview the first rows of the converted data
train.head()

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Step 1: Define transformers for different column types

# Numeric features: replace missing values with the column mean.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Categorical features: one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as an all-zero group instead of raising, so a fitted pipeline that is reused
# for prediction does not fail on new category values. The encoding of the
# data used for fitting is unaffected.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Step 2: Create a ColumnTransformer that applies the transformations to the columns.
# Columns not listed in either group are dropped.
# sparse_threshold=0 forces a dense array as output: OneHotEncoder produces a
# sparse matrix by default, and without this setting the combined output type
# would depend on how dense the data happens to be.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
    sparse_threshold=0,
)

# Step 3: Assemble the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Fit the preprocessing on the full training frame and transform it
X_preprocessed = preprocessing_pipeline.fit_transform(train)

# Display the fitted pipeline in the notebook output
preprocessing_pipeline

In [0]:
# Rebuild a labelled pandas DataFrame from the transformed feature matrix.

# Walk down to the fitted one-hot encoder to recover the names of the columns it generated
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
onehot_encoder_feature_names = list(fitted_encoder.get_feature_names_out())

# Output columns follow the order in which the transformers were declared:
# the numeric columns first, then the one-hot encoded ones.
column_order = [*numerical_cols, *onehot_encoder_feature_names]

# Keep the original row index so features and target stay aligned
X_preprocessed = pd.DataFrame(data=X_preprocessed, index=train.index, columns=column_order)
y = train['Transported']

X_preprocessed.head()

## Hyperparameter tuning of a Decision Tree Classifier 

We use Optuna to tune the hyperparameters of a decision tree classifier.

In [0]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; the hyperparameters being tuned are overwritten on every trial.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)


def objective(trial):
    """Return the mean 5-fold cross-validated accuracy for one Optuna trial.

    The trial proposes `max_depth`, `min_samples_split` and `min_samples_leaf`;
    they are applied to the shared `model`, which is then scored on
    `X_preprocessed` / `y`.
    """
    params = {
        # trial parameters to optimize
        'max_depth': trial.suggest_int('max_depth', 3, 40, log=True),
        # Float values are interpreted by scikit-learn as fractions of the
        # number of samples, not as absolute counts.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed comes from a preprocessor fitted on all rows,
    # so the imputation statistics include each validation fold; the scores may
    # be slightly optimistic.
    return cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy').mean()


# Seed the sampler so the search (and therefore best_params / best_value) is
# reproducible between runs, consistent with the fixed random_state of the
# estimator. TPESampler is the sampler Optuna uses by default.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)


print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

## MLflow

In [0]:
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature

# Track runs in the Databricks workspace and use the "databricks-uc" model registry
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

# MLflow client handle (not used further in this cell)
client = MlflowClient()

# Raw (un-preprocessed) features and target: preprocessing is part of the
# pipeline fitted below, so the logged model accepts the raw columns.
y = train['Transported']
X = train.drop(columns=['Transported'])

with mlflow.start_run():
    # Decision tree configured with the best hyperparameters found by the study
    model = DecisionTreeClassifier(criterion='entropy', random_state=42, **study.best_params)

    # Chain preprocessing and classifier, then fit both on the raw training data
    model_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', model),
        ]
    )
    model_pipeline.fit(X, y)

    # Record the tuned hyperparameters
    mlflow.log_params(study.best_params)

    # Record the best objective value of the study under the name "accuracy"
    mlflow.log_metric("accuracy", study.best_value)

    # Free-text tag describing the purpose of this run
    mlflow.set_tag("Training Info", "Simple Decision Tree Classifier")

    # Derive the input/output schema from the training features and the
    # pipeline's predictions on them
    predictions = model_pipeline.predict(X)
    signature = infer_signature(X, predictions)

    # Log the fitted pipeline and register it in the model registry.
    # NOTE(review): the registered name is single-level; presumably it is
    # resolved against the catalog/schema selected at the top of the
    # notebook — confirm.
    model_info = mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        artifact_path="decision_tree_model",
        signature=signature,
        registered_model_name="decision_tree_model",
    )