In [1]:
import findspark
# Make the local Spark installation importable from this kernel; this has to
# run before the pyspark imports in the following cells.
findspark.init()

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("airbnb_example")
    .master("local")
    .getOrCreate()
)

In [6]:
import os

# Location of the cleaned SF Airbnb parquet dataset. The path used to be
# hard-coded to one user's home directory; it can now be overridden by setting
# the SF_AIRBNB_PARQUET environment variable. The original path remains the
# default, so existing setups keep working unchanged.
file_path = os.environ.get(
    'SF_AIRBNB_PARQUET',
    '/Users/hyunseokjung/data/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet',
)
airbnbDF = spark.read.format('parquet').load(file_path)

# Preview a few of the columns for the first five rows.
airbnbDF.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms",
                "number_of_reviews", "price").show(5)


                                                                                

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



                                                                                

In [8]:
# Seeded 80/20 split of the listings into training and test sets.
trainDF, testDF = airbnbDF.randomSplit([0.8, 0.2], seed=42)

# Count each side once, then report both sizes.
n_train = trainDF.count()
n_test = testDF.count()
print(f"""There are {n_train} rows in the training set,
      and {n_test} in the test set""")

22/12/05 11:33:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
There are 5780 rows in the training set,
      and 1366 in the test set


In [10]:
from pyspark.ml.feature import VectorAssembler

# Pack the single predictor column ('bedrooms') into a vector column named
# 'features', then preview it next to the label.
vecAssembler = VectorAssembler(
    inputCols=['bedrooms'],
    outputCol='features',
)
vecTrainDF = vecAssembler.transform(trainDF)
(vecTrainDF
    .select("bedrooms", "features", "price")
    .show(10))

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



In [11]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator reading the 'features' vector and predicting
# 'price'; all other hyperparameters are left at their defaults.
lr = LinearRegression(
    featuresCol="features",
    labelCol="price",
)

# Fit on the assembled training data.
lrModel = lr.fit(vecTrainDF)

22/12/05 11:44:17 WARN Instrumentation: [e6667f9f] regParam is zero, which might cause numerical instability and overfitting.
22/12/05 11:44:17 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
22/12/05 11:44:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/05 11:44:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/12/05 11:44:17 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [16]:
# Slope (coefficient of the only feature, bedrooms) and intercept of the
# fitted line, rounded to two decimals for display.
m = round(lrModel.coefficients[0], 2)
b = round(lrModel.intercept, 2)

# Render the intercept with an explicit sign so that a negative value prints
# as "- 12.3" instead of "+ -12.3". Output is unchanged when b >= 0.
sign = '+' if b >= 0 else '-'
print(f'''The formula for the linear regression line is
      price = {m}*bedrooms {sign} {abs(b)}''')

The formula for the linear regression line is
      price = 123.68*bedrooms + 47.51


In [17]:
from pyspark.ml import Pipeline

# Chain the feature assembler and the regressor into one pipeline so both
# stages are fitted with a single call on the raw training DataFrame.
pipeline = Pipeline(
    stages=[vecAssembler, lr],
)
pipelineModel = pipeline.fit(trainDF)

22/12/05 11:53:59 WARN Instrumentation: [9cb4e9a0] regParam is zero, which might cause numerical instability and overfitting.


In [18]:
# Apply the fitted pipeline to the held-out rows and show actual price next
# to the predicted price.
predDF = pipelineModel.transform(testDF)
(predDF
    .select("bedrooms", "features", "price", "prediction")
    .show(5))

+--------+--------+-----+------------------+
|bedrooms|features|price|        prediction|
+--------+--------+-----+------------------+
|     1.0|   [1.0]| 85.0|171.18598011578285|
|     1.0|   [1.0]| 45.0|171.18598011578285|
|     1.0|   [1.0]| 70.0|171.18598011578285|
|     1.0|   [1.0]|128.0|171.18598011578285|
|     1.0|   [1.0]|159.0|171.18598011578285|
+--------+--------+-----+------------------+
only showing top 5 rows



In [21]:
# Display (column name, type) pairs for the training set to see which columns
# are strings and which are doubles.
trainDF.dtypes

[('host_is_superhost', 'string'),
 ('cancellation_policy', 'string'),
 ('instant_bookable', 'string'),
 ('host_total_listings_count', 'double'),
 ('neighbourhood_cleansed', 'string'),
 ('latitude', 'double'),
 ('longitude', 'double'),
 ('property_type', 'string'),
 ('room_type', 'string'),
 ('accommodates', 'double'),
 ('bathrooms', 'double'),
 ('bedrooms', 'double'),
 ('beds', 'double'),
 ('bed_type', 'string'),
 ('minimum_nights', 'double'),
 ('number_of_reviews', 'double'),
 ('review_scores_rating', 'double'),
 ('review_scores_accuracy', 'double'),
 ('review_scores_cleanliness', 'double'),
 ('review_scores_checkin', 'double'),
 ('review_scores_communication', 'double'),
 ('review_scores_location', 'double'),
 ('review_scores_value', 'double'),
 ('price', 'double'),
 ('bedrooms_na', 'double'),
 ('bathrooms_na', 'double'),
 ('beds_na', 'double'),
 ('review_scores_rating_na', 'double'),
 ('review_scores_accuracy_na', 'double'),
 ('review_scores_cleanliness_na', 'double'),
 ('review_sco

In [25]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Every string-typed column is treated as a categorical feature; derive the
# intermediate (indexed) and final (one-hot) column names from the originals.
categoricalCols = [field for (field, dataType) in trainDF.dtypes
                   if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# Stage 1: map category strings to numeric indices (handleInvalid='skip').
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid='skip')

# Stage 2: one-hot encode the indices. This stage was missing even though
# OneHotEncoder is imported and the assembler below consumes oheOutputCols;
# without it nothing produces the "*OHE" columns.
oheEncoder = OneHotEncoder(inputCols=indexOutputCols,
                           outputCols=oheOutputCols)

# All double-typed columns except the label are used as numeric features.
# Plain boolean `and` replaces the bitwise `&` that was applied to two bools.
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field != "price"]

# Stage 3: assemble encoded categorical and numeric columns into 'features'.
# NOTE: this rebinds `vecAssembler`, replacing the bedrooms-only assembler.
assembleInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assembleInputs,
                               outputCol="features")

In [29]:
# Echo the assembler object to confirm it was constructed (displays its uid).
vecAssembler

VectorAssembler_7104e1c374cd

In [30]:
from pyspark.ml.feature import RFormula

# R-style formula: 'price' is the label and '.' stands for all other columns
# as predictors; the resulting vector goes into 'features'.
rFormula = RFormula(
    formula="price ~ .",
    featuresCol="features",
    labelCol="price",
    handleInvalid="skip",
)


In [31]:
# Rebind `pipeline`: RFormula now prepares the features ahead of the regressor.
pipeline = Pipeline(
    stages=[rFormula, lr],
)

In [34]:
# Fit the RFormula-based pipeline on the training set, score the test set,
# and preview the feature vector with actual and predicted prices.
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
(predDF
    .select("features", "price", "prediction")
    .show(5))

22/12/05 12:52:10 WARN Instrumentation: [9506692d] regParam is zero, which might cause numerical instability and overfitting.
22/12/05 12:52:10 WARN Instrumentation: [9506692d] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(98,[0,3,6,7,23,4...| 85.0| 55.30094763354373|
|(98,[0,3,6,7,23,4...| 45.0| 22.70940291742818|
|(98,[0,3,6,7,23,4...| 70.0|27.182906571761578|
|(98,[0,3,6,7,13,4...|128.0|-91.90969569190747|
|(98,[0,3,6,7,13,4...|159.0| 94.54162775821169|
+--------------------+-----+------------------+
only showing top 5 rows



In [35]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the 'prediction' column against the 'price' label,
# initially configured to report RMSE.
regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                          labelCol="price",
                                          metricName="rmse")

# Score the test-set predictions and print to one decimal place.
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 220.6


In [37]:
# Switch the existing evaluator to R2. NOTE: setMetricName changes the shared
# evaluator in place, so it keeps reporting R2 after this cell.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.15985154393435386


22/12/05 15:16:16 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 437020 ms exceeds timeout 120000 ms
22/12/05 15:16:16 WARN SparkContext: Killing executors is not supported by current scheduler.
