#### regression

In [None]:
# Fit a cross-validated linear-regression pipeline on the California housing
# data set and report RMSE on the training and test splits.
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from sklearn.datasets import fetch_california_housing

# One fixed seed for both the train/test split and the cross-validation fold
# assignment, so the selected regParam and the reported RMSE values are
# reproducible from run to run.
SEED = 42

spark = SparkSession.builder.appName("CA_Housing_builtin").getOrCreate()

# Load the scikit-learn data set into pandas, attach the target as an extra
# column, then hand the frame over to Spark.
cal = fetch_california_housing()
pdf = pd.DataFrame(cal.data, columns=cal.feature_names)
pdf["medianHouseValue"] = cal.target

housing = spark.createDataFrame(pdf)

# Drop every row that contains at least one null value.
housing = housing.na.drop(how="any")

# "medianHouseValue ~ ." regresses the target on all remaining columns.
# NOTE(review): relies on RFormula's default output column names matching the
# defaults of LinearRegression and RegressionEvaluator -- confirm if any of
# the column parameters are ever customised.
rf = RFormula(formula="medianHouseValue ~ .")
lr = LinearRegression()
pipeline = Pipeline(stages=[rf, lr])

evaluator = RegressionEvaluator(metricName="rmse")

# Candidate regularisation strengths. Written as floats because regParam is a
# float parameter; 0.0 is the unregularised baseline (Spark logs a warning
# for it, as seen in the cell output).
params = ParamGridBuilder().addGrid(lr.regParam, [0.0, 0.1, 0.2]).build()
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=4,
    seed=SEED,
)

# 70 % of the rows for fitting, 30 % held out for the final evaluation.
train, test = housing.randomSplit([0.7, 0.3], seed=SEED)
model = cv.fit(train)

# Training-set RMSE, then test-set RMSE.
print("训练集 RMSE:", evaluator.evaluate(model.transform(train)))
print("测试集 RMSE:", evaluator.evaluate(model.transform(test)))

25/10/07 21:16:24 WARN Instrumentation: [2071894b] regParam is zero, which might cause numerical instability and overfitting.
25/10/07 21:16:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/10/07 21:16:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
25/10/07 21:16:26 WARN Instrumentation: [2fd36b53] regParam is zero, which might cause numerical instability and overfitting.
25/10/07 21:16:27 WARN Instrumentation: [80610d62] regParam is zero, which might cause numerical instability and overfitting.
25/10/07 21:16:28 WARN Instrumentation: [f72c9d9a] regParam is zero, which might cause numerical instability and overfitting.
25/10/07 21:16:29 WARN Instrumentation: [20564c99] regParam is zero, which might cause numerical instability and overfitting.


训练集 RMSE: 0.7249646889209516
测试集 RMSE: 0.7232588285989255


In [4]:
# Show the parameter documentation and values of the second stage (index 1)
# of the best pipeline found by cross-validation.
best_pipeline = model.bestModel
best_lr_model = best_pipeline.stages[1]
print(best_lr_model.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 100)
predic