# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 _pyspark_ 

### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Load Data, Train-Test Split, Linear Regression

In [2]:
# Start Spark Session
import os
from pyspark.sql import SparkSession
# Create a SparkSession


PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. Load. Train-Test Split. Linear Regression") \
    .config("spark.sql.extensions") \
    .config("spark.sql.catalog.spark_catalog") \
    .getOrCreate()

In [3]:
# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

airbnbDF = spark.read.parquet(filePath)
airbnbDF.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms", 
                "number_of_reviews", "price").show(5)

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



In [4]:
# Train Test Split
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)
print(f"""There are {trainDF.count()} rows in the training set, 
          and {testDF.count()} in the test set""")

There are 5780 rows in the training set, 
          and 1366 in the test set


In [5]:
# Transformers
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["bedrooms"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select("bedrooms", "features", "price").show(10)

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



In [6]:
#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="price")
lrModel = lr.fit(vecTrainDF)

In [7]:
# Regression coefficients
m = round(lrModel.coefficients[0], 2)
b = round(lrModel.intercept, 2)

print(f"""The formula for the linear regression line is 
price = {m}*bedrooms + {b}""")

The formula for the linear regression line is 
price = 123.68*bedrooms + 47.51


### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Pipeline

In [8]:
# Start Spark Session
import os
from pyspark.sql import SparkSession
# Create a SparkSession


PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. Pipeline") \
    .getOrCreate()


# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#train-test split
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)


# Transformers
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["bedrooms"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)

#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="price")

In [9]:
# pipeline Model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

In [10]:
# Predict
predDF = pipelineModel.transform(testDF)
predDF.select("bedrooms", "features", "price", "prediction").show(10)

+--------+--------+------+------------------+
|bedrooms|features| price|        prediction|
+--------+--------+------+------------------+
|     1.0|   [1.0]|  85.0|171.18598011578285|
|     1.0|   [1.0]|  45.0|171.18598011578285|
|     1.0|   [1.0]|  70.0|171.18598011578285|
|     1.0|   [1.0]| 128.0|171.18598011578285|
|     1.0|   [1.0]| 159.0|171.18598011578285|
|     2.0|   [2.0]| 250.0|294.86172649777757|
|     1.0|   [1.0]|  99.0|171.18598011578285|
|     1.0|   [1.0]|  95.0|171.18598011578285|
|     1.0|   [1.0]| 100.0|171.18598011578285|
|     1.0|   [1.0]|2010.0|171.18598011578285|
+--------+--------+------+------------------+
only showing top 10 rows



### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> One-hot encoding

In [11]:
# Start Spark Session
import os
from pyspark.sql import SparkSession
# Create a SparkSession


PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. One hot Encoders") \
    .getOrCreate()


# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#train-test split
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

In [12]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")

oheEncoder = OneHotEncoder(inputCols=indexOutputCols, 
                           outputCols=oheOutputCols)

numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "price"))]

assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, 
                               outputCol="features")

In [13]:
# labels to be included using R programming Language Sintaxis
from pyspark.ml.feature import RFormula

rFormula = RFormula(formula="price ~ .", 
                    featuresCol="features", 
                    labelCol="price", 
                    handleInvalid="skip")

In [14]:

#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="price")

# pipeline Model
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

#predict
predDF = pipelineModel.transform(testDF)
predDF.select("features", "price", "prediction").show(5)

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(98,[0,3,6,22,43,...| 85.0| 55.24365707389188|
|(98,[0,3,6,22,43,...| 45.0|23.357685914717877|
|(98,[0,3,6,22,43,...| 70.0|28.474464479034395|
|(98,[0,3,6,12,42,...|128.0| -91.6079079594947|
|(98,[0,3,6,12,43,...|159.0| 95.05688229945372|
+--------------------+-----+------------------+
only showing top 5 rows



### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Evaluating Models

In [3]:
# Start Spark Session
import os
from pyspark.sql import SparkSession
# Create a SparkSession


PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. One hot Encoders") \
    .getOrCreate()


# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#train-test split
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="price")

#one-hot encoding

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, 
                           outputCols=oheOutputCols)
numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "price"))]
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, 
                               outputCol="features")


# pipeline Model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

#predict
predDF = pipelineModel.transform(testDF)

In [4]:
# Root Mean Squared Error (RMSE)
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator = RegressionEvaluator(
  predictionCol="prediction", 
  labelCol="price", 
  metricName="rmse")
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 220.6


In [5]:
# R squared
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.16043316698848087


### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Linear Regression Baseline RMSE Evaluation

In [5]:
# Start Spark Session
import os
from pyspark.sql import SparkSession

# Create a SparkSession
PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. One hot Encoders") \
    .getOrCreate()

# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#train-test split
airbnbDF = spark.read.parquet(filePath)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

#new constant column
from pyspark.sql.functions import lit
mean_price = trainDF.agg({'price':'mean'}).collect()[0][0]
regDF = trainDF.withColumn('mean_price', lit(mean_price))


#one-hot encoding

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer

categoricalCols = [field for (field, dataType) in regDF.dtypes 
                   if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, 
                           outputCols=oheOutputCols)
numericCols = [field for (field, dataType) in regDF.dtypes 
               if ((dataType == "double") & (field != "mean_price"))]
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, 
                               outputCol="features")


#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="mean_price")

# pipeline Model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])
pipelineModel = pipeline.fit(regDF)

#predict
predDF = pipelineModel.transform(testDF)

#evaluations
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator = RegressionEvaluator(
  predictionCol="prediction", 
  labelCol="price", 
  metricName="rmse")
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 240.8


In [26]:
# Regression coefficients
model = pipelineModel.stages[-1]
m = round(model.coefficients[0], 2)
b = round(model.intercept, 2)

print(f"""The horizontal line 
price = {m}*bedrooms + {b}""")

The horizontal line 
price = 0.0*bedrooms + 214.47


### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Logarithm of Price Column

In [9]:
# Start Spark Session
import os
from pyspark.sql import SparkSession

# Create a SparkSession
PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. One hot Encoders") \
    .getOrCreate()

# Load Data
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#log price

from pyspark.sql.functions import log
airbnbDF = spark.read.parquet(filePath)
airbnbDF = airbnbDF.withColumn('log_price', log('price'))


#train-test split
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)


#one-hot encoding
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, 
                           outputCols=oheOutputCols)
numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "log_price"))]
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, 
                               outputCol="features")


#  Linear Regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="log_price")

# pipeline Model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

#predict
predDF = pipelineModel.transform(testDF)

#evaluations
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator = RegressionEvaluator(
  predictionCol="prediction", 
  labelCol="log_price", 
  metricName="rmse")
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 0.3


In [10]:
# R squared
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.7668615633102807


In [11]:
#Save Model
pipelinePath = "/tmp/lr-pipeline-model"
pipelineModel.write().overwrite().save(pipelinePath)

In [12]:
# Load Model
from pyspark.ml import PipelineModel
savedPipelineModel = PipelineModel.load(pipelinePath)

### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Hyperparameter Tuning: Decision Trees

In [3]:
# Start Spark Session
import os
from pyspark.sql import SparkSession

# Create a SparkSession
PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. Decision Tree") \
    .getOrCreate()

# filepath
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#load and create log price
from pyspark.sql.functions import log
airbnbDF = spark.read.parquet(filePath).withColumn('log_price', log('price'))

#train-test split
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

# Combine output of StringIndexer defined above and numeric columns
from pyspark.ml.feature import VectorAssembler,StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]

numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "log_price"))]

stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")

assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

#decision tree
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(labelCol="log_price")
dt.setMaxBins(40)

# Combine stages into pipeline

from pyspark.ml import Pipeline
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(trainDF) # This line should error


#predict
predDF = pipelineModel.transform(testDF)

In [8]:
# tree in if-else form
dtModel = pipelineModel.stages[-1]
print(dtModel.toDebugString[:500]+'...')

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_a0f85d78f61d, depth=5, numNodes=63, numFeatures=34
  If (feature 23 <= 178.5)
   If (feature 23 <= 90.5)
    If (feature 23 <= 54.5)
     If (feature 23 <= 45.5)
      If (feature 3 in {1.0,3.0,4.0,6.0,24.0})
       Predict: 3.34800174250524
      Else (feature 3 not in {1.0,3.0,4.0,6.0,24.0})
       Predict: 3.6828292005441248
     Else (feature 23 > 45.5)
      If (feature 3 in {0.0,1.0,2.0,3.0,5.0,6.0,8.0,10.0,11.0,13.0,15.0,16.0,17.0,18....


In [12]:
# feature importance
import pandas as pd

featureImp = pd.DataFrame(
  list(zip(vecAssembler.getInputCols(), dtModel.featureImportances)),
  columns=["feature", "importance"])
featureImp.sort_values(by="importance", ascending=False).head(7)

Unnamed: 0,feature,importance
23,price,0.995259
3,neighbourhood_cleansedIndex,0.002481
8,latitude,0.001943
21,review_scores_location,0.000288
1,cancellation_policyIndex,2.8e-05
26,beds_na,0.0
20,review_scores_communication,0.0


### [Chapter 10](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch10.html)
> Random Forest & Hyperparameter Tuning

In [11]:
# Start Spark Session
import os
from pyspark.sql import SparkSession

# Create a SparkSession
PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 10. Random Forest") \
    .getOrCreate()

# filepath
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#load and create log price
from pyspark.sql.functions import log
airbnbDF = spark.read.parquet(filePath).withColumn('log_price', log('price'))

from pyspark.ml import Pipeline
#train-test split
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

# Combine output of StringIndexer defined above and numeric columns
from pyspark.ml.feature import VectorAssembler,StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]

numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "log_price"))]

stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")

assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

#Random Forest
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol="log_price", maxBins=40, seed=42)


# pipeline

from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [stringIndexer, vecAssembler, rf])

#gridbuilder
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth, [2, 4, 6])
            .addGrid(rf.numTrees, [10, 100])
            .build())

#evaluator
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="log_price", 
                                predictionCol="prediction", 
                                metricName="rmse")


In [14]:
%%time
# Cross Validation

from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, 
                    evaluator=evaluator, 
                    estimatorParamMaps=paramGrid, 
                    numFolds=3, 
                    seed=42)
cvModel = cv.fit(trainDF)

CPU times: user 1.66 s, sys: 390 ms, total: 2.05 s
Wall time: 41.5 s


In [15]:
# In Python
list(zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics))

[({Param(parent='RandomForestRegressor_64d6751757f0', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestRegressor_64d6751757f0', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  0.3172766328923306),
 ({Param(parent='RandomForestRegressor_64d6751757f0', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestRegressor_64d6751757f0', name='numTrees', doc='Number of trees to train (>= 1).'): 100},
  0.3358904413789592),
 ({Param(parent='RandomForestRegressor_64d6751757f0', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 4,
   Param(parent='RandomForestRegressor_64d6751757f0', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  0.1813968004970

In [16]:
%%time
# Validate in Parallel
cvModel = cv.setParallelism(4).fit(trainDF)

CPU times: user 2.86 s, sys: 919 ms, total: 3.78 s
Wall time: 32 s


In [17]:
%%time
# put cross-validator inside pipeline (instead of pipeline inside cross-validator)

cv = CrossValidator(estimator=rf, 
                    evaluator=evaluator, 
                    estimatorParamMaps=paramGrid, 
                    numFolds=3, 
                    parallelism=4, 
                    seed=42)

pipeline = Pipeline(stages=[stringIndexer, vecAssembler, cv])
pipelineModel = pipeline.fit(trainDF)

CPU times: user 1e+03 ms, sys: 314 ms, total: 1.31 s
Wall time: 22.9 s
