In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
# Build the SparkSession first so that the master and application name set on
# the builder are applied when the underlying SparkContext is created. If a
# SparkContext already exists, getOrCreate() reuses it and those two settings
# have no effect.
spark = SparkSession.builder.master('local[*]').appName('CrossValidation').getOrCreate()

# Expose the session's SparkContext under the usual `sc` name
sc = spark.sparkContext

In [3]:
# Load the flights CSV; the literal string 'NA' is read as null.
# NOTE(review): the original remark here said this is not the best choice for
# large data sets -- presumably because inferSchema=True needs an extra pass
# over the file. Confirm, and consider passing an explicit schema instead.
flights = spark.read.csv(
    'flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Hold out 20% of the raw rows (taken before any feature assembly); the fixed
# seed keeps the split repeatable.
flights_train_2, flights_test_2 = flights.randomSplit([0.8, 0.2], seed=17)

In [4]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator 

In [5]:
from pyspark.ml.feature import VectorAssembler

# Assemble the numeric predictor columns into a single 'features' vector.
# 'duration' is deliberately NOT listed: it is the label the regression is
# trained to predict (labelCol='duration'), so including it as a feature would
# leak the target into the model and make the cross-validation scores
# meaningless.
assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'mile', 'depart'],
    outputCol='features',
)

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

In [6]:
# From here on `flights` refers to the assembled frame (it carries the extra
# 'features' column); the earlier flights_train_2 / flights_test_2 split still
# holds rows from the frame as it was read from the CSV.
flights = flights_assembled

In [7]:
# 80:20 train/test split of the assembled frame, seeded so it is repeatable
flights_train, flights_test = flights.randomSplit(weights=[0.8, 0.2], seed=17)

### Cross Validation (direct)

In [8]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml import Pipeline 

# Linear regression predicting flight duration
regression = LinearRegression(labelCol='duration')

# Hyper-parameter grid: 4 regularisation strengths x 3 elastic-net mixing
# values = 12 candidate models. Float literals are used because both params
# are double-valued.
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# Evaluator that scores predictions against the 'duration' label
evaluator = RegressionEvaluator(labelCol='duration')

# 5-fold cross-validation over the grid, fitted on the assembled training data.
# The seed pins the fold assignment so reruns are comparable, matching the
# seeded train/test splits. `cv` is bound directly to the fitted
# CrossValidatorModel.
cv = CrossValidator(
    estimator=regression,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
    seed=17,
).fit(flights_train)

Number of models to be tested:  12


### Cross Validation on a Pipeline

In [9]:
# Index the categorical 'org' field
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode the indexed org field.
# NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 name; Spark 3.0
# renamed it to OneHotEncoder -- confirm the target Spark version before upgrading.
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Assemble the mile and one-hot encoded org fields into the 'features' vector.
# This rebinds `assembler`, replacing the stand-alone assembler created earlier.
assembler = VectorAssembler(inputCols=['mile', 'org_dummy'], outputCol='features')

# Chain feature preparation and the regression into a single estimator
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# 5-fold cross-validation of the whole pipeline over the same parameter grid.
# It is fitted on flights_train_2, the split taken from the frame as read from
# the CSV, which has no pre-existing 'features' column to clash with the
# pipeline's own assembler. The seed pins the fold assignment so reruns are
# comparable. `cv` is bound directly to the fitted CrossValidatorModel.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
    seed=17,
).fit(flights_train_2)

In [10]:
# Get the best model from cross validation. Because the cross-validated
# estimator was the pipeline, this is the fitted pipeline whose stages are
# inspected below.
best_model = cv.bestModel

In [11]:
# Look at the stages in the best model (indexer, one-hot encoder, assembler,
# regression -- in the order they were given to the Pipeline)
print(best_model.stages)

[StringIndexer_d06aa01bdbc4, OneHotEncoderEstimator_27790f61b1e9, VectorAssembler_4e9c6b6d9447, LinearRegression_4b78016cc9ce]


In [12]:
# Show the parameters of the LinearRegression stage of the best model; the
# regression is the final stage of the pipeline.
best_model.stages[-1].extractParamMap()

{Param(parent='LinearRegression_4b78016cc9ce', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LinearRegression_4b78016cc9ce', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0,
 Param(parent='LinearRegression_4b78016cc9ce', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'): 1.35,
 Param(parent='LinearRegression_4b78016cc9ce', name='featuresCol', doc='features column name'): 'features',
 Param(parent='LinearRegression_4b78016cc9ce', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LinearRegression_4b78016cc9ce', name='labelCol', doc='label column name'): 'duration',
 Param(parent='LinearRegression_4b78016cc9ce', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber. (Default squaredError)'): 'squaredError',
 Param

In [13]:
# Score the held-out raw test split with the best pipeline, then report RMSE.
# The metric is named explicitly here; it is also the evaluator's default.
predictions = best_model.transform(flights_test_2)
evaluator.evaluate(predictions, {evaluator.metricName: 'rmse'})

11.013570980274741

In [14]:
# Show the built-in documentation for the fitted cross-validation model
help(cv)

Help on CrossValidatorModel in module pyspark.ml.tuning object:

class CrossValidatorModel(pyspark.ml.base.Model, ValidatorParams, pyspark.ml.util.MLReadable, pyspark.ml.util.MLWritable)
 |  CrossValidatorModel(bestModel, avgMetrics=[], subModels=None)
 |  
 |  CrossValidatorModel contains the model with the highest average cross-validation
 |  metric across folds and uses this model to transform input data. CrossValidatorModel
 |  also tracks the metrics for each param map evaluated.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  Method resolution order:
 |      CrossValidatorModel
 |      pyspark.ml.base.Model
 |      pyspark.ml.base.Transformer
 |      ValidatorParams
 |      pyspark.ml.param.shared.HasSeed
 |      pyspark.ml.param.Params
 |      pyspark.ml.util.Identifiable
 |      pyspark.ml.util.MLReadable
 |      pyspark.ml.util.MLWritable
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, bestModel, avgMetrics=[], subModels=None)
 |      Initialize self. 