In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.getOrCreate()
spark

In [0]:
from pyspark.ml.regression import RandomForestRegressor
from sklearn.datasets import load_boston

In [0]:
boston=load_boston()
feature_names = boston.feature_names.tolist()
output_name = 'outcome'
boston_columns = feature_names + [output_name]
X = boston.data.tolist()
y = boston.target.tolist()
Xy = [(i + [j]) for (i, j) in zip(X, y)]
boston_df = spark.createDataFrame(Xy, boston_columns)
print(feature_names)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [0]:
boston_df.show(2)

+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|    B|LSTAT|outcome|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3|396.9| 4.98|   24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8|396.9| 9.14|   21.6|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-------+
only showing top 2 rows



In [0]:
train, test = boston_df.randomSplit([0.7, 0.3], seed=7)

In [0]:
train.show(2)

+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|outcome|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98|   24.0|
|0.01311|90.0| 1.22| 0.0|0.403|7.249|21.9|8.6966|5.0|226.0|   17.9|395.93| 4.81|   35.4|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-------+
only showing top 2 rows



In [0]:
catCols = [x for (x, dataType) in train.dtypes if dataType == "string"]
numCols = [
    x for (x, dataType) in train.dtypes if ((dataType == "double"))
]


In [0]:
print(numCols)
print(catCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'outcome']
[]


In [0]:
train.dtypes

Out[65]: [('CRIM', 'double'),
 ('ZN', 'double'),
 ('INDUS', 'double'),
 ('CHAS', 'double'),
 ('NOX', 'double'),
 ('RM', 'double'),
 ('AGE', 'double'),
 ('DIS', 'double'),
 ('RAD', 'double'),
 ('TAX', 'double'),
 ('PTRATIO', 'double'),
 ('B', 'double'),
 ('LSTAT', 'double'),
 ('outcome', 'double')]

In [0]:
from pyspark.ml.feature import VectorAssembler
assemblerInput = [x for x in numCols]
vector_assembler = VectorAssembler(
    inputCols=assemblerInput, outputCol="VectorAssembler_features"
)

In [0]:
stages = [vector_assembler]

In [0]:
%%time
from pyspark.ml import Pipeline

pipeline = Pipeline().setStages(stages)
model = pipeline.fit(train)
pp_df = model.transform(test)

CPU times: user 3.61 ms, sys: 3.95 ms, total: 7.57 ms
Wall time: 36.4 ms


In [0]:
pp_df

Out[70]: DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, outcome: double, VectorAssembler_features: vector]

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
data = pp_df.select(
    F.col("VectorAssembler_features").alias("features"),
    F.col("outcome").alias("label"),
)

In [0]:
data.show(5, truncate=False)

+-------------------------------------------------------------------------------+-----+
|features                                                                       |label|
+-------------------------------------------------------------------------------+-----+
|[0.0136,75.0,4.0,0.0,0.41,5.888,47.6,7.3197,3.0,469.0,21.1,396.9,14.8,18.9]    |18.9 |
|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7] |34.7 |
|[0.02763,75.0,2.95,0.0,0.428,6.595,21.8,5.4011,3.0,252.0,18.3,395.63,4.32,30.8]|30.8 |
|[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4] |33.4 |
|[0.03359,75.0,2.95,0.0,0.428,7.024,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9]|34.9 |
+-------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [0]:
%%time
model = GBTRegressor().fit(data)

CPU times: user 62.4 ms, sys: 64.8 ms, total: 127 ms
Wall time: 48.3 s


In [0]:
%%time
model = LinearRegression().fit(data)

CPU times: user 14 ms, sys: 6.55 ms, total: 20.5 ms
Wall time: 1.42 s


In [0]:
print(model.summary.meanAbsoluteError)

8.654188476953095e-14


In [0]:
print(model.summary.rootMeanSquaredError)

1.0932223553547008e-13
