In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Predicting income above/below 50k adult data").getOrCreate()

rawData = spark.read\
                .format("csv")\
                .option("header", "false")\
                .option("ignoreLeadingWhiteSpace", "true")\
                .load("./data/adult.data")

In [2]:
# Name Columns since they didn't have a header
dataset = rawData.toDF("Age", "WorkClass", "FnlWgt", "Education", "EducationNum", "MaritalStatus", "Occupation",
                        "Relationship", "Race", "Gender", "CapitalGain", "CapitalLoss", "HoursPerWeek",
                        "NativeCountry", "Label")

In [3]:
# Cheating by peaking in Pandas; Traditionally Can't do this if data is too large
print(dataset.count())
dataset.toPandas().head()

32561


Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Cleaning
dataset = dataset.drop("FnlWgt").replace("?", None).dropna(how='any')
print(dataset.count())
dataset.toPandas().head()

30162


Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Get column types (note it's not like pandas .describe() that gives summary statistics)
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [6]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

# Need to cast some cols to float

dataset = dataset.withColumn("Age", dataset["Age"].cast(FloatType()))
dataset = dataset.withColumn("EducationNum", dataset["EducationNum"].cast(FloatType()))
dataset = dataset.withColumn("CapitalGain", dataset["CapitalGain"].cast(FloatType()))
dataset = dataset.withColumn("CapitalLoss", dataset["CapitalLoss"].cast(FloatType()))
dataset = dataset.withColumn("HoursPerWeek", dataset["HoursPerWeek"].cast(FloatType()))

In [7]:
# Note columns like Age are now floats, ex 39 -> 39.0 for Age.
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


# Single Feature Engineering
Just demoing how to convert WorkClass

In [8]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# WorkClass from string to numeric
indexedDF = StringIndexer(inputCol='WorkClass', outputCol='WorkClass_index').fit(dataset).transform(dataset)

# WorkClass_index from numeric to one-hot encoded
encodedDF = OneHotEncoder(inputCol='WorkClass_index', outputCol='WorkClass_encoded').fit(indexedDF).transform(indexedDF)

# View Steps
encodedDF = encodedDF.select("WorkClass", "WorkClass_index", "WorkClass_encoded").toPandas().head()
encodedDF

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_encoded
0,State-gov,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


# Multi Feature Engineering/Preprocessing

In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder


# Specify Categorical Variables
categoricalFeatures = ["WorkClass", "Education", "MaritalStatus", "Occupation",
                       "Relationship", "Race", "Gender", "NativeCountry"]

# Convert Categorical Variables to Numeric Variables using List Comprehension
indexers = [StringIndexer(inputCol=column, # Input Col
                          outputCol=column + "_index", # Name of new output col
                          handleInvalid="keep") # Creates new index if test set has categorical value not seen in training
                            for column in categoricalFeatures]

# One-Hot Encode Categorical Features
encoders = [OneHotEncoder(inputCol=column + "_index", # Note these are names coming from indexers above
                          outputCol=column + "_encoded") for column in categoricalFeatures]

# Convert Target Variable from str to numeric
labelIndexer = [StringIndexer(inputCol="Label", outputCol="Label_index")]


In [10]:
from pyspark.ml import Pipeline

# Pipeline all steps above
pipeline = Pipeline(stages=indexers + encoders + labelIndexer)

# Train/Test Split & ML Model

In [11]:
# Simple train/test split
(train, test) = dataset.randomSplit([.8, .2])

In [16]:
# Pass training data through pipeline
transformedDF = pipeline.fit(train).transform(train)
print("pipeline fitted & transformed")
transformedDF.toPandas().head() # Can take 30+ seconds since added many encoded variables to dataframe

pipeline fitted & transformed


Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [17]:
# Subset Features for ML model
requiredFeatures = ["Age", "EducationNum", "CapitalGain", "CapitalLoss", "HoursPerWeek",
                       "WorkClass_encoded", "Education_encoded", "MaritalStatus_encoded",
                       "Occupation_encoded", "Relationship_encoded", "Race_encoded",
                       "Gender_encoded", "NativeCountry_encoded"]

In [18]:
from pyspark.ml.feature import VectorAssembler

# Convert features into vector required for Spark; a "dense vector."
assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features')

# Transform data to have dense vector added
transformedDF = assembler.transform(transformedDF)
display(transformedDF.toPandas().head()) # Scroll to right to see new last column of dense feature vector
display(transformedDF.select("features").toPandas().head())

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 0.0, ..."
1,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0..."
2,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 1.0, ..."
3,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
4,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0..."


Unnamed: 0,features
0,"(17.0, 7.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 0.0, ..."
1,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0..."
2,"(17.0, 6.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 1.0, ..."
3,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
4,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0..."


In [19]:
from pyspark.ml.classification import RandomForestClassifier

# Initialize Random Forest Model
rf = RandomForestClassifier(labelCol='Label_index', # Numeric Target
                            featuresCol='features', # Dense Feature Vector
                            maxDepth=5)

# Create full pipeline of steps
pipeline = Pipeline(
    stages=indexers + encoders + labelIndexer + [assembler, rf]
)

# Train model
model = pipeline.fit(train)

In [20]:
# Predict on test set which adds last column to DF with predictions
predictions = model.transform(test)
predDF = predictions.toPandas()
predDF.head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
0,17.0,Local-gov,12th,8.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 8.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0...","[19.036049830675598, 0.9639501693244017]","[0.9518024915337799, 0.04819750846622008]",0.0
1,17.0,Local-gov,9th,5.0,Never-married,Other-service,Own-child,Black,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 5.0, 0.0, 0.0, 9.0, 0.0, 0.0, 1.0, 0.0,...","[18.73969390997666, 1.260306090023343]","[0.9369846954988329, 0.06301530450116714]",0.0
2,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[18.74539994418798, 1.2546000558120194]","[0.937269997209399, 0.06273000279060097]",0.0
3,17.0,Private,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 14.0, 1.0, 0.0, 0.0, 0.0...","[19.112977149256277, 0.887022850743722]","[0.9556488574628139, 0.0443511425371861]",0.0
4,17.0,Private,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.112977149256277, 0.887022850743722]","[0.9556488574628139, 0.0443511425371861]",0.0


In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Initialize Evaluators
eval_acc = MulticlassClassificationEvaluator(
            labelCol='Label_index',
            predictionCol='prediction',
            metricName='accuracy'
)

eval_f1 = MulticlassClassificationEvaluator(
            labelCol='Label_index',
            predictionCol='prediction',
            metricName='f1'
)

eval_auc = BinaryClassificationEvaluator(
            labelCol='Label_index',
            rawPredictionCol='prediction',
            metricName='areaUnderROC'
)

# Evaluate Performance
acc = eval_acc.evaluate(predictions)
print(f"accuracy: {acc}")

f1 = eval_f1.evaluate(predictions)
print(f"f1 score: {f1}")

auc = eval_auc.evaluate(predictions)
print(f"auc: {auc}")

accuracy: 0.8262189978066475
f1 score: 0.8097225742058443
auc: 0.6995815647326914


### Hyperparameter Tuning

In [30]:
from pyspark.ml.tuning import ParamGridBuilder

# Parameter Grid of a few random forest parameters
paramGrid = ParamGridBuilder().addGrid(
    rf.maxDepth, [4, 5, 6]).addGrid(
    rf.numTrees, [5, 10, 15]).build()

In [31]:
# Metric we're tuning for
eval_auc = BinaryClassificationEvaluator(
            labelCol='Label_index',
            rawPredictionCol='prediction',
            metricName='areaUnderROC'
)

In [33]:
from pyspark.ml.tuning import CrossValidator

# Use Cross Validation w/ 3 folds optimizing for AUC
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Train all models, will take some time
model = crossval.fit(train)

# Pull out best model
best = model.bestModel.stages[-1] # -1 is the last step in pipeline = random forest

In [34]:
best_depth = best._java_obj.getMaxDepth()
best_num_trees = best._java_obj.getNumTrees()
print(f"Best maxDepth = {best_depth}")
print(f"Best numTrees = {best_num_trees}")

Best maxDepth=6
Best numTrees=5


In [35]:
preds = model.transform(test)

# check auc on tuned model
auc = eval_auc.evaluate(preds)
print(f"auc: {auc}")

auc: 0.7234054357664089


### Great, AUC went from .699 to .723 with simple tuning