Load PySpark, Libraries, and Data

In [None]:
# Install a headless Java 8 JDK (JAVA_HOME below is pointed at the java-8-openjdk directory).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download and unpack the Spark 3.2.3 / Hadoop 2.7 distribution; SPARK_HOME below refers to the unpacked folder.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
# findspark is initialised below, before pyspark is imported.
!pip install -q findspark

import os
# Both locations are hard-coded absolute paths (Colab layout: /usr/lib/jvm and /content).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
# Local-mode session ("local[*]"); `sc` is the session's SparkContext.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

from google.colab import drive
# Mount Google Drive and make the notebooks folder the working directory.
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Colab Notebooks/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks


In [None]:
# Imports
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.sql import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import UnivariateFeatureSelector
import pyspark.ml.tuning as tune

In [None]:
import pandas as pd
import numpy as np
# Read the attribute names from the UCI "spambase.names" description file.
# The first 32 lines are skipped and every remaining line is split on ':' into
# (header_name, description); only the header names are kept.
spambase_headers = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names',
                               skiprows=32,
                               sep=':',
                               names=['header_name', 'description']
                               )['header_name'].tolist()

# header file does not include label for target column
spambase_headers.append('spam')

# Load the data file (it has no header row) into pandas using the column names built above.
spambase_pdf = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',
                           header=None,
                           names=spambase_headers
                           )

# Convert to a Spark DataFrame. The pandas frame keeps its own name so that
# `spambase_df` always refers to a Spark DataFrame, which is what later cells expect.
spambase_df = spark.createDataFrame(spambase_pdf)

In [None]:
# Feature columns: every column of the Spark DataFrame except the 'spam' target.
input_cols = [column for column in spambase_df.columns if column != 'spam']

In [None]:
#Test/Train Split (70% train / 30% test)
# A fixed seed keeps the split, and every metric computed from it, reproducible
# across kernel restarts.
(trainData, testData) = spambase_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
#Stats for evaluating the performance of models
def run_models_get_results(pipelines, pipeline_name, train=trainData, test=testData):
  """Fit each pipeline on `train`, score it on `test`, then print and return the metrics.

  For every (pipeline, name) pair this prints the confusion matrix (row counts per
  spam/prediction combination) followed by accuracy, precision, recall and F1.

  Args:
    pipelines: unfitted pipelines; each fitted pipeline must add a 'prediction' column.
    pipeline_name: display names, paired positionally with `pipelines`.
    train: DataFrame the pipelines are fit on.
    test: DataFrame the fitted pipelines are scored on.

  Returns:
    A list with one dict per pipeline holding the keys 'name', 'accuracy',
    'precision', 'recall' and 'f1'.
  """
  def _evaluator(metric, **extra):
    # All metrics compare the 'spam' label column against the 'prediction' column.
    return MulticlassClassificationEvaluator(labelCol='spam', predictionCol='prediction', metricName=metric, **extra)

  # The evaluators do not depend on the pipeline, so they are built once up front.
  accuracy_BCE = _evaluator("accuracy")
  # precisionByLabel / recallByLabel score label 0.0 unless metricLabel is given.
  # Spam (label 1) is the positive class everywhere else in this notebook, so it is
  # requested explicitly here.
  precision_BCE = _evaluator("precisionByLabel", metricLabel=1.0)
  recall_BCE = _evaluator("recallByLabel", metricLabel=1.0)
  f1Measure_BCE = _evaluator("f1")

  results = []
  for p, name in zip(pipelines, pipeline_name):
    print(f'Processing {name}')
    # Use the arguments rather than the notebook globals so callers can pass other splits.
    p_fit = p.fit(train)
    p_prediction = p_fit.transform(test)

    accuracy = accuracy_BCE.evaluate(p_prediction)
    precision = precision_BCE.evaluate(p_prediction)
    recall = recall_BCE.evaluate(p_prediction)
    f1Measure = f1Measure_BCE.evaluate(p_prediction)

    print('Confusion Matrix')
    p_prediction.select("spam", "prediction").groupBy("spam", "prediction").count().show()

    print(f'{name}')
    print(f'accurary: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'f1 Measure: {f1Measure}')

    results.append({
        'name': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1Measure,
    })

  return results



In [None]:
#"Confusion matrix" for each model to see how models perfrom on data, seperate from other statistic block depending on need for results in each form
def run_models_get_results_get_cost(pipelines, pipeline_name, train=trainData, test=testData, fp_cost=10, fn_cost=1):
  """Fit each pipeline on `train`, score it on `test`, then print and return misclassification costs.

  Spam (label 1) is the positive class. Correct predictions cost nothing, each
  false positive costs `fp_cost` and each false negative costs `fn_cost`. The
  average cost is the total cost divided by the number of scored rows.

  Args:
    pipelines: unfitted pipelines; each fitted pipeline must add a 'prediction' column.
    pipeline_name: display names, paired positionally with `pipelines`.
    train: DataFrame the pipelines are fit on.
    test: DataFrame the fitted pipelines are scored on.
    fp_cost: cost of predicting spam for a non-spam row (default 10).
    fn_cost: cost of predicting non-spam for a spam row (default 1).

  Returns:
    A list with one dict per pipeline holding 'name', the four confusion-matrix
    counts, 'total' and 'avg_cost' (NaN when `test` produced no rows).
  """
  results = []
  for p, name in zip(pipelines, pipeline_name):
    print(f'Processing {name}')
    # Use the arguments rather than the notebook globals so callers can pass other splits.
    p_fit = p.fit(train)
    p_prediction = p_fit.transform(test)

    # One aggregation yields every confusion-matrix cell, instead of a separate
    # filtered count per cell plus a total count.
    confusion = p_prediction.select("spam", "prediction").groupBy("spam", "prediction").count()
    cell_counts = {(int(row['spam']), int(row['prediction'])): row['count'] for row in confusion.collect()}

    True_Positive = cell_counts.get((1, 1), 0)
    True_Negative = cell_counts.get((0, 0), 0)
    False_Positive = cell_counts.get((0, 1), 0)
    False_Negative = cell_counts.get((1, 0), 0)
    Total = sum(cell_counts.values())
    # Guard against an empty test set rather than dividing by zero.
    Total_Avg_Cost = (False_Positive*fp_cost + False_Negative*fn_cost)/Total if Total else float('nan')

    print('Confusion Matrix')
    confusion.show()

    print(f'{name}')
    print(f'True Positive Cost: {True_Positive*0}')
    print(f'True Negative Cost: {True_Negative*0}')
    print(f'False Positive Cost: {False_Positive*fp_cost}')
    print(f'False Negative Cost: {False_Negative*fn_cost}')
    print(f'Total Avg Cost: {Total_Avg_Cost}')

    results.append({
        'name': name,
        'true_positive': True_Positive,
        'true_negative': True_Negative,
        'false_positive': False_Positive,
        'false_negative': False_Negative,
        'total': Total,
        'avg_cost': Total_Avg_Cost,
    })

  return results

Base Models

In [None]:
# Baseline pipelines: each classifier with its default settings, fed by one shared
# assembler that packs all input columns into the 'features' vector.
va = VectorAssembler(inputCols=input_cols, outputCol='features')

# Estimators, all predicting the 'spam' label.
clf = DecisionTreeClassifier(labelCol="spam")
nb = NaiveBayes(labelCol="spam")
rf = RandomForestClassifier(labelCol="spam")
gbt = GBTClassifier(labelCol='spam')

# One two-stage pipeline (assembler -> estimator) per model.
clf_pipe, nb_pipe, rf_pipe, gbt_pipe = (
    Pipeline(stages=[va, estimator]) for estimator in (clf, nb, rf, gbt)
)

pipeline_name = ['Decision_Tree', 'Naive_Bayes', 'Random_Forest', 'GBT_Classieifer']
pipelines = [clf_pipe, nb_pipe, rf_pipe, gbt_pipe]

results = run_models_get_results(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   90|
|   0|       1.0|   46|
|   0|       0.0|  832|
|   1|       1.0|  425|
+----+----------+-----+

Decision_Tree
accurary: 0.9023689877961235
precision: 0.9023861171366594
recall: 0.9476082004555809
f1 Measure: 0.9013838761404175
Processing Naive_Bayes
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|  145|
|   0|       1.0|  133|
|   0|       0.0|  745|
|   1|       1.0|  370|
+----+----------+-----+

Naive_Bayes
accurary: 0.8004307250538406
precision: 0.8370786516853933
recall: 0.8485193621867881
f1 Measure: 0.7999317530781043
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   75|
|   0|       0.0|  850|
|   1|       1.0|  440|
|   0|       1.0|   28|
+----+----------+-----+

Random_Forest
accurary: 0.92605886575

In [None]:
# Same base pipelines, this time reporting misclassification costs.
# The helper fits every pipeline itself, so the models are trained again here.
results = run_models_get_results_get_cost(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   90|
|   0|       1.0|   46|
|   0|       0.0|  832|
|   1|       1.0|  425|
+----+----------+-----+

Decision_Tree
True Positive Cost: 0
True Negative Cost: 0
False Positive Cost: 460
False Negative Cost: 90
Total Avg Cost: 0.3948312993539124
Processing Naive_Bayes
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|  145|
|   0|       1.0|  133|
|   0|       0.0|  745|
|   1|       1.0|  370|
+----+----------+-----+

Naive_Bayes
True Positive Cost: 0
True Negative Cost: 0
False Positive Cost: 1330
False Negative Cost: 145
Total Avg Cost: 1.0588657573582196
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   75|
|   0|       0.0|  850|
|   1|       1.0|  440|
|   0|       1.0|   28|
+----+----------+-----+

Random_Forest

Naive Bayes had the lowest accuracy score and performed on average much worse than any other model. For the other three models generally decision tree performed

Scaled

In [None]:
#Models with feature scaling added to pipeline
# StandardScaler standardises every component of a vector column independently, so a
# single assembler followed by a single scaler produces the same 'features' vector as
# one assembler/scaler pair per column, with 3 pipeline stages instead of
# 2 * len(input_cols) + 2.
va = VectorAssembler(inputCols = input_cols, outputCol = 'raw_features')

ss = StandardScaler(withMean = True, withStd = True, inputCol = 'raw_features', outputCol = 'features')

scaling_stages = [va, ss]

clf = DecisionTreeClassifier(labelCol="spam", featuresCol='features')
clf_pipe = Pipeline(stages = scaling_stages + [clf])

# The Naive Bayes pipeline is defined for symmetry but deliberately left out of
# `pipelines` below.
nb = NaiveBayes(labelCol="spam", featuresCol='features')
nb_pipe = Pipeline(stages = scaling_stages + [nb])

rf = RandomForestClassifier(labelCol="spam", featuresCol='features')
rf_pipe = Pipeline(stages = scaling_stages + [rf])

gbt = GBTClassifier(labelCol = 'spam', featuresCol='features')
gbt_pipe = Pipeline(stages = scaling_stages + [gbt])

pipelines = [clf_pipe, rf_pipe, gbt_pipe]

pipeline_name = ['Decision_Tree', 'Random_Forest', 'GBT_Classieifer']

results = run_models_get_results(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   90|
|   0|       1.0|   46|
|   0|       0.0|  832|
|   1|       1.0|  425|
+----+----------+-----+

Decision_Tree
accurary: 0.9023689877961235
precision: 0.9023861171366594
recall: 0.9476082004555809
f1 Measure: 0.9013838761404175
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   75|
|   0|       0.0|  850|
|   1|       1.0|  440|
|   0|       1.0|   28|
+----+----------+-----+

Random_Forest
accurary: 0.9260588657573582
precision: 0.918918918918919
recall: 0.9681093394077449
f1 Measure: 0.9252549351224462
Processing GBT_Classieifer
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   63|
|   0|       1.0|   46|
|   0|       0.0|  832|
|   1|       1.0|  452|
+----+----------+-----+

GBT_Classieifer
accurary: 0.9217

Naive Bayes is not included here because it does not need to be scaled in the same way as the other models.

As scaling did not seem to make a significant difference in the results, it will not be used.

Feature Selection

In [None]:
#Feature selection added to pipeline, feature scaling not included
# Rebind `va` to a single assembler that packs the raw input columns into 'features'.
va = VectorAssembler(inputCols = input_cols, outputCol = 'features')

In [None]:
# Univariate selector: reads 'features', writes the kept subset to 'selectedFeatures',
# scored against the 'spam' label using the "fpr" selection mode. No selection
# threshold is set here, so the library default applies.
sel = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures", labelCol="spam", selectionMode="fpr")

# Declare the features as continuous and the label as categorical.
sel.setFeatureType("continuous").setLabelType("categorical")

UnivariateFeatureSelector_4a666fe3c299

In [None]:
# Decision tree trained on the selector's output column: assembler -> selector -> tree.
clf = DecisionTreeClassifier(labelCol="spam", featuresCol='selectedFeatures')
clf_pipe = Pipeline(stages=[va, sel, clf])

In [None]:
# Naive Bayes trained on the selector's output column: assembler -> selector -> NB.
nb = NaiveBayes(labelCol="spam", featuresCol='selectedFeatures')
nb_pipe = Pipeline(stages=[va, sel, nb])

In [None]:
# Random forest trained on the selector's output column: assembler -> selector -> forest.
rf = RandomForestClassifier(labelCol="spam", featuresCol='selectedFeatures')
rf_pipe = Pipeline(stages=[va, sel, rf])

In [None]:
# Gradient-boosted trees trained on the selector's output column: assembler -> selector -> GBT.
gbt = GBTClassifier(labelCol = 'spam', featuresCol='selectedFeatures')
gbt_pipe = Pipeline(stages=[va, sel, gbt])

In [None]:
# Pipelines to evaluate; the order must match `pipeline_name`.
pipelines = [clf_pipe, nb_pipe, rf_pipe, gbt_pipe]

In [None]:
# Display names, paired positionally with `pipelines`.
pipeline_name = ['Decision_Tree', 'Naive_Bayes', 'Random_Forest', 'GBT_Classieifer']

In [None]:
# Fit and score the feature-selection pipelines on the existing train/test split.
results = run_models_get_results(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   90|
|   0|       1.0|   46|
|   0|       0.0|  832|
|   1|       1.0|  425|
+----+----------+-----+

Decision_Tree
accurary: 0.9023689877961235
precision: 0.9023861171366594
recall: 0.9476082004555809
f1 Measure: 0.9013838761404175
Processing Naive_Bayes
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|  140|
|   0|       1.0|  136|
|   0|       0.0|  742|
|   1|       1.0|  375|
+----+----------+-----+

Naive_Bayes
accurary: 0.8018664752333095
precision: 0.8412698412698413
recall: 0.8451025056947609
f1 Measure: 0.8017054017715219
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   78|
|   0|       0.0|  846|
|   1|       1.0|  437|
|   0|       1.0|   32|
+----+----------+-----+

Random_Forest
accurary: 0.92103374012

Parameter Tuning

In [None]:
#Models run with parameter tuning
# Rebind the assembler and the four estimators to fresh instances that read the default
# 'features' column; the parameter grids built afterwards refer to these objects' params.
va = VectorAssembler(inputCols = input_cols, outputCol = 'features')

clf = DecisionTreeClassifier(labelCol="spam")

nb = NaiveBayes(labelCol="spam")

rf = RandomForestClassifier(labelCol="spam")

gbt = GBTClassifier(labelCol = 'spam')

In [None]:
#Cost evaluator based on f1 score
# Passed to the CrossValidators as the metric used to compare parameter combinations.
CV_eval = MulticlassClassificationEvaluator(labelCol='spam', predictionCol='prediction', metricName="f1")

In [None]:
#Building parameters to be tested
# Every CrossValidator gets a fixed seed so the fold assignment, and therefore the
# selected hyperparameters, are reproducible between runs.
# collectSubModels=True keeps every model trained during tuning on the driver.

# Decision tree: depth x minimum instances per node.
dt_grid = (
    tune.ParamGridBuilder()
    .addGrid(clf.maxDepth, [5, 8, 10, 15, 20, 30])
    .addGrid(clf.minInstancesPerNode, [2, 3, 5, 10])
    .build()
)
dt_cv = tune.CrossValidator(estimator=clf, estimatorParamMaps=dt_grid, evaluator=CV_eval,
                            collectSubModels=True, seed=42)

# Naive Bayes: smoothing only.
nb_grid = (
    tune.ParamGridBuilder()
    .addGrid(nb.smoothing, [0.01, .2, .4, .5, .6, .8, 1, 1.2])
    .build()
)
nb_cv = tune.CrossValidator(estimator=nb, estimatorParamMaps=nb_grid, evaluator=CV_eval,
                            collectSubModels=True, seed=42)

# Random forest: number of trees x depth.
rf_grid = (
    tune.ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50])
    .addGrid(rf.maxDepth, [10, 15, 20, 25, 30])
    .build()
)
rf_cv = tune.CrossValidator(estimator=rf, estimatorParamMaps=rf_grid, evaluator=CV_eval,
                            collectSubModels=True, seed=42)

# Gradient-boosted trees: minimum information gain x depth.
gbt_grid = (
    tune.ParamGridBuilder()
    .addGrid(gbt.minInfoGain, [.001, .005, .01, .05])
    .addGrid(gbt.maxDepth, [10, 20, 25, 30])
    .build()
)
gbt_cv = tune.CrossValidator(estimator=gbt, estimatorParamMaps=gbt_grid, evaluator=CV_eval,
                             collectSubModels=True, seed=42)

In [None]:
# Put each cross-validated estimator behind the shared feature assembler.
clf_pipe, nb_pipe, rf_pipe, gbt_pipe = (
    Pipeline(stages=[va, cv_stage]) for cv_stage in (dt_cv, nb_cv, rf_cv, gbt_cv)
)

pipeline_name = ['Decision_Tree', 'Naive_Bayes', 'Random_Forest', 'GBT_Classieifer']
pipelines = [clf_pipe, nb_pipe, rf_pipe, gbt_pipe]

# Fitting a pipeline runs the full cross-validation for that model.
results = run_models_get_results(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   61|
|   0|       1.0|   48|
|   0|       0.0|  830|
|   1|       1.0|  454|
+----+----------+-----+

Decision_Tree
accurary: 0.9217516152189519
precision: 0.9315375982042648
recall: 0.9453302961275627
f1 Measure: 0.9215390178953249
Processing Naive_Bayes
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|  144|
|   0|       1.0|  132|
|   0|       0.0|  746|
|   1|       1.0|  371|
+----+----------+-----+

Naive_Bayes
accurary: 0.8018664752333095
precision: 0.8382022471910112
recall: 0.8496583143507973
f1 Measure: 0.8013710929840172
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   37|
|   0|       0.0|  852|
|   1|       1.0|  478|
|   0|       1.0|   26|
+----+----------+-----+

Random_Forest
accurary: 0.95477386934

In [None]:
#Output of best hyperparameters chosen by cross-validation
# Each fit below runs the full cross-validation for that model again.
# The tuned values are read through the public Params API (getOrDefault) instead of the
# private `_java_obj` handle.
clf_pipe = Pipeline(stages=[va, dt_cv])

nb_pipe = Pipeline(stages=[va, nb_cv])

rf_pipe = Pipeline(stages=[va, rf_cv])

gbt_pipe = Pipeline(stages=[va, gbt_cv])


clf_fit = clf_pipe.fit(trainData)
clf_prediction = clf_fit.transform(testData)

# The fitted CrossValidator is the last pipeline stage; bestModel is its winning model.
clf_bestModel = clf_fit.stages[-1].bestModel
clf_max_dep = clf_bestModel.getOrDefault('maxDepth')
clf_min_int_node = clf_bestModel.getOrDefault('minInstancesPerNode')

print(f'Decision Tree Tuned Max Depth: {clf_max_dep}')
print(f'Decision Tree Tuned Min Instances Per Node: {clf_min_int_node}')

nb_fit = nb_pipe.fit(trainData)
nb_prediction = nb_fit.transform(testData)

nb_bestModel = nb_fit.stages[-1].bestModel
nb_smooth = nb_bestModel.getOrDefault('smoothing')

print(f'Naive Bayes Tuned Smoothing: {nb_smooth}')

rf_fit = rf_pipe.fit(trainData)
rf_prediction = rf_fit.transform(testData)

rf_bestModel = rf_fit.stages[-1].bestModel
rf_max_dep = rf_bestModel.getOrDefault('maxDepth')
rf_num_tree = rf_bestModel.getOrDefault('numTrees')

print(f'Random Forest Tuned Max Depth: {rf_max_dep}')
print(f'Random Forest Tuned Number Trees: {rf_num_tree}')

gbt_fit = gbt_pipe.fit(trainData)
gbt_prediction = gbt_fit.transform(testData)

gbt_bestModel = gbt_fit.stages[-1].bestModel
gbt_max_dep = gbt_bestModel.getOrDefault('maxDepth')
gbt_min_in_gain = gbt_bestModel.getOrDefault('minInfoGain')

print(f'GBT Tuned Max Depth: {gbt_max_dep}')
print(f'GBT Tuned Min Info Gain: {gbt_min_in_gain}')

Decision Tree Tuned Max Depth: 10
Decision Tree Tuned Min Instances Per Node: 3
Naive Bayes Tuned Smoothing: 0.01
Random Forest Tuned Max Depth: 20
Random Forest Tuned Number Trees: 30
GBT Tuned Max Depth: 10
GBT Tuned Min Info Gain: 0.001


In [None]:
# Evaluator for the next round of parameter tuning, kept separate so the metric can be
# changed independently; here it is weighted recall.
# NOTE(review): weightedRecall weights each class's recall by that class's share of the
# rows, which works out to the same number as overall accuracy. If the goal is recall on
# the spam class, recallByLabel with metricLabel=1.0 may be what is wanted -- confirm intent.
Cost_eval = MulticlassClassificationEvaluator(labelCol='spam', predictionCol='prediction', metricName="weightedRecall")

In [None]:
# Parameter grids and CrossValidators scored with Cost_eval.
# Every CrossValidator gets a fixed seed so the fold assignment, and therefore the
# selected hyperparameters, are reproducible between runs.
# collectSubModels=True keeps every model trained during tuning on the driver.

# Decision tree: depth x minimum instances per node.
dt_grid = (
    tune.ParamGridBuilder()
    .addGrid(clf.maxDepth, [5, 8, 10, 15, 20, 30])
    .addGrid(clf.minInstancesPerNode, [2, 3, 5, 10])
    .build()
)
dt_cv = tune.CrossValidator(estimator=clf, estimatorParamMaps=dt_grid, evaluator=Cost_eval,
                            collectSubModels=True, seed=42)

# Naive Bayes: smoothing only.
nb_grid = (
    tune.ParamGridBuilder()
    .addGrid(nb.smoothing, [.2, .4, .5, .6, .8, 1])
    .build()
)
nb_cv = tune.CrossValidator(estimator=nb, estimatorParamMaps=nb_grid, evaluator=Cost_eval,
                            collectSubModels=True, seed=42)

# Random forest: number of trees x depth.
rf_grid = (
    tune.ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30, 40, 50])
    .addGrid(rf.maxDepth, [10, 15, 20, 25, 30])
    .build()
)
rf_cv = tune.CrossValidator(estimator=rf, estimatorParamMaps=rf_grid, evaluator=Cost_eval,
                            collectSubModels=True, seed=42)

# Gradient-boosted trees: minimum information gain x depth.
gbt_grid = (
    tune.ParamGridBuilder()
    .addGrid(gbt.minInfoGain, [.001, .005, .01, .05])
    .addGrid(gbt.maxDepth, [10, 20, 25, 30])
    .build()
)
gbt_cv = tune.CrossValidator(estimator=gbt, estimatorParamMaps=gbt_grid, evaluator=Cost_eval,
                             collectSubModels=True, seed=42)

In [None]:

# Rebuild the pipelines around the CrossValidators that are scored with Cost_eval.
clf_pipe = Pipeline(stages=[va, dt_cv])

nb_pipe = Pipeline(stages=[va, nb_cv])

rf_pipe = Pipeline(stages=[va, rf_cv])

gbt_pipe = Pipeline(stages=[va, gbt_cv])

# Order must match `pipeline_name`.
pipelines = [clf_pipe, nb_pipe, rf_pipe, gbt_pipe]

pipeline_name = ['Decision_Tree', 'Naive_Bayes', 'Random_Forest', 'GBT_Classieifer']

# Both helpers fit every pipeline themselves, so the cross-validation runs twice in this cell.
results = run_models_get_results(pipelines, pipeline_name, trainData, testData)

results = run_models_get_results_get_cost(pipelines, pipeline_name, trainData, testData)

Processing Decision_Tree
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   61|
|   0|       1.0|   48|
|   0|       0.0|  830|
|   1|       1.0|  454|
+----+----------+-----+

Decision_Tree
accurary: 0.9217516152189519
precision: 0.9315375982042648
recall: 0.9453302961275627
f1 Measure: 0.9215390178953249
Processing Naive_Bayes
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|  145|
|   0|       1.0|  132|
|   0|       0.0|  746|
|   1|       1.0|  370|
+----+----------+-----+

Naive_Bayes
accurary: 0.801148600143575
precision: 0.8372615039281706
recall: 0.8496583143507973
f1 Measure: 0.8006083298807796
Processing Random_Forest
Confusion Matrix
+----+----------+-----+
|spam|prediction|count|
+----+----------+-----+
|   1|       0.0|   37|
|   0|       0.0|  852|
|   1|       1.0|  478|
|   0|       1.0|   26|
+----+----------+-----+

Random_Forest
accurary: 0.954773869346

In [None]:
# Output of the best hyperparameters chosen by this round of cross-validation.
# Each fit below runs the full cross-validation for that model again.
# The tuned values are read through the public Params API (getOrDefault) instead of the
# private `_java_obj` handle.
clf_pipe = Pipeline(stages=[va, dt_cv])

nb_pipe = Pipeline(stages=[va, nb_cv])

rf_pipe = Pipeline(stages=[va, rf_cv])

gbt_pipe = Pipeline(stages=[va, gbt_cv])


clf_fit = clf_pipe.fit(trainData)
clf_prediction = clf_fit.transform(testData)

# The fitted CrossValidator is the last pipeline stage; bestModel is its winning model.
clf_bestModel = clf_fit.stages[-1].bestModel
clf_max_dep = clf_bestModel.getOrDefault('maxDepth')
clf_min_int_node = clf_bestModel.getOrDefault('minInstancesPerNode')

print(f'Decision Tree Tuned Max Depth: {clf_max_dep}')
print(f'Decision Tree Tuned Min Instances Per Node: {clf_min_int_node}')

nb_fit = nb_pipe.fit(trainData)
nb_prediction = nb_fit.transform(testData)

nb_bestModel = nb_fit.stages[-1].bestModel
nb_smooth = nb_bestModel.getOrDefault('smoothing')

print(f'Naive Bayes Tuned Smoothing: {nb_smooth}')

rf_fit = rf_pipe.fit(trainData)
rf_prediction = rf_fit.transform(testData)

rf_bestModel = rf_fit.stages[-1].bestModel
rf_max_dep = rf_bestModel.getOrDefault('maxDepth')
rf_num_tree = rf_bestModel.getOrDefault('numTrees')

print(f'Random Forest Tuned Max Depth: {rf_max_dep}')
print(f'Random Forest Tuned Number Trees: {rf_num_tree}')

gbt_fit = gbt_pipe.fit(trainData)
gbt_prediction = gbt_fit.transform(testData)

gbt_bestModel = gbt_fit.stages[-1].bestModel
gbt_max_dep = gbt_bestModel.getOrDefault('maxDepth')
gbt_min_in_gain = gbt_bestModel.getOrDefault('minInfoGain')

print(f'GBT Tuned Max Depth: {gbt_max_dep}')
print(f'GBT Tuned Min Info Gain: {gbt_min_in_gain}')

Decision Tree Tuned Max Depth: 10
Decision Tree Tuned Min Instances Per Node: 3
Naive Bayes Tuned Smoothing: 0.2
Random Forest Tuned Max Depth: 20
Random Forest Tuned Number Trees: 30
GBT Tuned Max Depth: 10
GBT Tuned Min Info Gain: 0.001
