In [1]:
import os,sys
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.regression import RandomForestRegressor



# Start (or reuse) the Spark session used by the rest of this script.
spark = SparkSession.builder.appName("Spark ML App").getOrCreate()

# Load the advertising CSV; the first row is a header and column types are
# inferred from the data.
data = (
    spark.read.format("csv")
    .options(header="true", inferschema="true")
    .load("resources/advertising.csv")
)
data.printSchema()

# Summary statistics are computed and printed exactly once. The original also
# evaluated the same expression as a bare statement and discarded the result,
# which only cost an extra Spark job.
print(data.describe().toPandas().transpose())

# The target column is spelled "Sales" in the printed schema. Use the exact
# name: withColumnRenamed silently does nothing when the name does not resolve,
# so a case mismatch would only surface later as a missing "label" column.
data = data.withColumnRenamed("Sales", "label")

# 90/10 train/test split. The fixed seed makes the split (and therefore the
# metrics reported further down) reproducible between runs.
(trainingData, testingData) = data.randomSplit([0.9, 0.1], seed=42)



# Stage 1: pack the three advertising-spend columns into a single vector column.
assembler = VectorAssembler(
    inputCols=["TV", "Radio", "Newspaper"],
    outputCol="features",
)

# Stage 2: centre and scale the assembled vector into "std_features".
standardizer = StandardScaler(
    inputCol="features",
    outputCol="std_features",
    withMean=True,
    withStd=True,
)

# Stage 3: random-forest regressor trained on the standardized features.
# Earlier alternative, kept for reference (DecisionTreeRegressor is not
# imported at the top of this script):
#dt = DecisionTreeRegressor(featuresCol="std_features",labelCol = 'label')
rf = RandomForestRegressor(featuresCol="std_features", labelCol="label")

# Chain the stages so they are fitted and applied together.
pipeline = Pipeline(stages=[assembler, standardizer, rf])

# Fit the whole pipeline (assembler -> standardizer -> random forest) on the
# training split.
rfModel = pipeline.fit(trainingData)

# Score the held-out split once and preview a few rows. The original ran the
# identical transform + show twice under two different names.
predictions = rfModel.transform(testingData)
predictions.select("prediction", "label", "std_features").show(5)

# Legacy names kept as aliases so any later cell that still refers to them
# keeps working; both hold the random-forest predictions computed above.
dtPredictions = predictions
lrPredictions = predictions

# Root-mean-squared error of the predictions against the true label.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted regressor is the third pipeline stage (index 2, after the
# assembler and the standardizer); printing it shows a summary only.
randomForestModel = rfModel.stages[2]
print(randomForestModel)

# Search space: every combination of the listed tree depths and forest sizes
# (3 x 3 = 9 candidate parameter maps).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [14, 18, 22])
    .addGrid(rf.numTrees, [24, 28, 30])
    .build()
)

# 3-fold cross-validation over the full pipeline, scoring candidates by R^2.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="r2",
    ),
    numFolds=3,
)

# Fit every candidate on the training split and keep the best one.
cvModel = crossval.fit(trainingData)

# Average metric per candidate, followed by the candidates themselves.
print(cvModel.avgMetrics)
#print( cvModel.bestModel.stages[2].summary.r2)

for candidate in paramGrid:
    print(candidate)

# Apply the selected model to the held-out split and preview a few rows.
cvPrediction = cvModel.transform(testingData)
cvPrediction.select("prediction", "label", "std_features").show(5)



root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)

               0                   1                   2    3      4
summary    count                mean              stddev  min    max
_c0          200               100.5  57.879184513951124    1    200
TV           200            147.0425   85.85423631490805  0.7  296.4
Radio        200  23.264000000000024  14.846809176168728  0.0   49.6
Newspaper    200  30.553999999999995   21.77862083852283  0.3  114.0
Sales        200  14.022500000000003   5.217456565710477  1.6   27.0
+------------------+-----+--------------------+
|        prediction|label|        std_features|
+------------------+-----+--------------------+
|19.815377302649036| 22.1|[0.95147001458083...|
| 12.49505112942613|  9.2|[-1.4302793850911...|
| 10.67393144021154|  9.7|[-0.5794072049853...|
| 20.91351664941338| 24.4|[1.543732468