Run this notebook on an ML Runtime cluster; it imports `pyspark` and `mlflow` and uses the predefined `spark` session.

In [0]:
# Namespace (catalog + schema) that prefixes every object name below.
catalog = "main"
schema = "default"

# Fully qualified catalog.schema.object names.
# labeled_table is read as the training input; target_table receives the final predictions.
labeled_table = ".".join((catalog, schema, "labeled_pilot_notes"))
# presumably the registered-model name; not referenced by the cells shown here — confirm
model_name = ".".join((catalog, schema, "pilot_notes_model"))
target_table = ".".join((catalog, schema, "pilot_notes_supervised_classification"))

In [0]:
from pyspark.sql.functions import *
from pyspark.ml.feature import *


# ---- Load and normalise the text ----
labeled_df = spark.read.table(labeled_table)

# Lower-case the notes, then replace digit runs wrapped in underscores (e.g. "_123_") with a space.
normalized_df = labeled_df.withColumn(
    "pilot_notes",
    regexp_replace(lower(col("pilot_notes")), "_[0-9]+_", " "),
)

# ---- Tokenise ----
# Can remove stopwords, use synonyms, etc here too
tokenizer = RegexTokenizer(
    inputCol="pilot_notes",
    outputCol="words",
    pattern="\\W",
)
tokenized_df = tokenizer.transform(normalized_df)

# ---- Term counts -> TF-IDF ----
# NOTE(review): the vocabulary and IDF weights are fitted on the full table, i.e. before the
# train/test split further down — confirm that is acceptable for the reported test metric.
cv = CountVectorizer(
    inputCol="words",
    outputCol="raw_features",
    vocabSize=20000,
    minDF=5,
)
idf = IDF(inputCol="raw_features", outputCol="features")

print("Fitting CV...")
cv_model = cv.fit(tokenized_df)
# Cached because both the IDF fit and the IDF transform read this frame.
df_cv = cv_model.transform(tokenized_df).cache()

print("Fitting IDF...")
idf_model = idf.fit(df_cv)
features_df = idf_model.transform(df_cv).cache()

# ---- Encode the string label as a numeric "label" column ----
label_indexer = StringIndexer(inputCol="unsupervised_prediction", outputCol="label")
label_model = label_indexer.fit(features_df)
# From here on labeled_df holds the featurised + indexed frame; later cells score it.
labeled_df = label_model.transform(features_df).cache()

# ---- 80/20 split with a fixed seed; both parts are cached for the training cells ----
train_data, test_data = (
    part.cache() for part in labeled_df.randomSplit([0.8, 0.2], seed=42)
)

In [0]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import DoubleType
from mlflow.types import ColSpec, DataType, Schema
from mlflow.models.signature import ModelSignature
import mlflow

# Enable MLflow autologging before any estimator is fitted. The final cell loads a "model"
# artifact from the best run, and no cell logs a model explicitly — presumably autologging
# is what produces that artifact (confirm against the MLflow version in use).
mlflow.autolog()

def train_model(model_class, params, run_name):
    """Cross-validate one classifier in its own MLflow run and score it on the hold-out split.

    Reads the notebook-level ``train_data`` and ``test_data`` DataFrames.

    Args:
        model_class: Estimator class to train; instantiated as ``model_class(**params)``.
        params: Dict of keyword arguments passed to the estimator constructor.
        run_name: Name given to the MLflow run (also echoed in the progress message).

    Returns:
        The fitted ``CrossValidatorModel``.

    Side effects:
        Starts an MLflow run and logs the hold-out accuracy as ``test_accuracy``.
    """
    # Imported locally so the isinstance check below does not depend on another cell
    # (the one that imports RandomForestClassifier) having been executed first.
    from pyspark.ml.classification import RandomForestClassifier

    with mlflow.start_run(run_name=run_name, nested=True):
        print(f"\nTraining {run_name}...")
        model = model_class(**params)
        pipeline = Pipeline(stages=[model])

        # Minimal tuning for speed: an empty grid evaluates a single configuration;
        # only random forests get a small search over the number of trees.
        param_grid = ParamGridBuilder().build()
        if isinstance(model, RandomForestClassifier):
            param_grid = ParamGridBuilder() \
                .addGrid(model.numTrees, [50, 100]) \
                .build()

        # Model selection inside cross-validation uses F1 ...
        evaluator = MulticlassClassificationEvaluator(metricName="f1")
        cv = CrossValidator(estimator=pipeline,
                            estimatorParamMaps=param_grid,
                            evaluator=evaluator,
                            numFolds=2,
                            parallelism=2)

        cv_model = cv.fit(train_data)
        predictions = cv_model.transform(test_data)

        # ... while the metric logged for comparing runs is hold-out accuracy
        # (same evaluator, metric overridden for this one call).
        accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
        mlflow.log_metric("test_accuracy", accuracy)
        return cv_model

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest candidate. For this estimator type train_model also grid-searches
# numTrees over [50, 100].
rf_model = train_model(
    model_class=RandomForestClassifier,
    params=dict(numTrees=100, maxDepth=10),
    run_name="RandomForest_v1",
)

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression candidate.
# NOTE(review): elasticNetParam is set but regParam is left at its default — confirm the
# intended amount of regularisation.
lr_model = train_model(
    model_class=LogisticRegression,
    params=dict(maxIter=50, elasticNetParam=0.5),
    run_name="LogReg_v1",
)

In [0]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes candidate with smoothing set to 1.0.
nb_model = train_model(
    model_class=NaiveBayes,
    params=dict(smoothing=1.0),
    run_name="NaiveBayes_v1",
)

In [0]:
import mlflow.pyfunc
from pyspark.ml.feature import IndexToString

# Select the run with the highest hold-out accuracy (most recent first on ties).
# The filter restricts the search to runs that actually logged test_accuracy, so a run
# without the metric can never be picked when no scored run exists.
candidate_runs = mlflow.search_runs(
    filter_string="metrics.test_accuracy >= 0",
    order_by=['metrics.test_accuracy DESC', 'start_time DESC'],
    max_results=1
)
if candidate_runs.empty:
    # Fail with an actionable message instead of an IndexError from .iloc[0].
    raise RuntimeError(
        "No MLflow run with a 'test_accuracy' metric was found; run the training cells first."
    )
best_run = candidate_runs.iloc[0]

# Load the model artifact stored under "model" in that run and score the full featurised frame.
model_uri = f"runs:/{best_run.run_id}/model"
loaded_model = mlflow.spark.load_model(model_uri)
predictions_df = loaded_model.transform(labeled_df)

# Map the numeric prediction back to the original string label using the fitted indexer's labels.
label_converter = IndexToString(inputCol="prediction", outputCol="supervised_prediction", labels=label_model.labels)
labels_df = label_converter.transform(predictions_df).drop("prediction")

# Overwrite so the notebook can be re-run end to end; without an explicit mode the write
# fails as soon as the target table already exists.
# NOTE(review): every intermediate column of the scored frame is written as well — confirm
# downstream consumers need them.
labels_df.write.mode("overwrite").saveAsTable(target_table)
spark.read.table(target_table).display()