In [0]:
df = spark.read.format('parquet').table('training_dataset_numeric')
display(df)

review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,bathrooms_na
4.83,4.98,4.95,4.65,4.8,90.0,2.0,25,3,41,3.0,,Entire home/apt,6,,37.73864,-122.50042,Entire residential home,1,f,1.0
4.0,5.0,5.0,5.0,5.0,495.0,30.0,1124,4,2,4.0,,Entire home/apt,6,,37.74331,-122.4561,Entire residential home,1,f,1.0
4.97,4.91,4.96,4.88,4.79,174.0,2.0,1125,1,123,1.0,,Private room,2,,37.77864,-122.42226,Room in boutique hotel,101,f,1.0
,,,,,5.0,5.0,5,t,,,30.0,110,110,30,110,30,30,,,0.0
4.33,5.0,5.0,4.67,4.0,43.0,30.0,1125,1,3,1.0,,Private room,1,,37.72011,-122.46662,Private room in residential home,2,f,1.0
4.95,5.0,5.0,4.89,4.89,180.0,5.0,90,1,63,1.0,,Entire home/apt,2,,37.77135,-122.45189,Entire condominium (condo),2,f,1.0
,,,,,62.0,152.0,2021-08-04,2,t,,1125.0,1125,2.0,28,2,2,1125,2 baths,,0.0
,,,,,57.0,147.0,2021-08-04,0,f,,1125.0,1125,30.0,120,30,30,1125,1 shared bath,,0.0
4.95,5.0,4.95,5.0,4.76,155.0,30.0,365,1,22,1.0,,Entire home/apt,2,,37.76632,-122.45165,Entire rental unit,1,t,1.0
4.75,5.0,5.0,4.75,4.75,110.0,85.0,87,1,4,,,Entire home/apt,2,,37.79111,-122.41457,Entire rental unit,1,t,1.0


In [0]:
# split data
df = df.select(['review_scores_value','price','minimum_nights','bedrooms','bathrooms_na'])

train_df, test_df = df.randomSplit([.8,.2], seed = 42)

In [0]:
display(train_df.select('price'))

price
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0


In [0]:
from pyspark.ml.feature import Imputer

imputer = Imputer(strategy='median', inputCols = ['review_scores_value','price','minimum_nights','bedrooms','bathrooms_na'], outputCols =['review_scores_value','price','minimum_nights','bedrooms','bathrooms_na'])
imputerModel = imputer.fit(df)
df = imputerModel.transform(df)

display(df)

review_scores_value,price,minimum_nights,bedrooms,bathrooms_na
4.8,90.0,2.0,3.0,1.0
5.0,495.0,30.0,4.0,1.0
4.79,174.0,2.0,1.0,1.0
4.75,5.0,5.0,1.0,0.0
4.0,43.0,30.0,1.0,1.0
4.89,180.0,5.0,1.0,1.0
4.75,62.0,152.0,1.0,0.0
4.75,57.0,147.0,1.0,0.0
4.76,155.0,30.0,1.0,1.0
4.75,110.0,85.0,1.0,1.0


In [0]:
import pyspark.sql.functions as F
df.where(F.isnan('bathrooms_na')).count()
df.printSchema()

In [0]:
# linear regression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

vecAssembler = VectorAssembler(inputCols=['review_scores_value','price','minimum_nights','bedrooms','bathrooms_na'], outputCol="features")
vecdf = vecAssembler.transform(df)
splits = vecdf.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

lrModel = LinearRegression(labelCol="price").fit(train_df)
predDF = lrModel.transform(test_df)

regressionEvaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'price', metricName='rmse')
rmse = regressionEvaluator.evaluate(predDF)
r2 = regressionEvaluator.setMetricName('r2').evaluate(predDF)

print(f'RMSE:', {rmse})
print(f'R-squared:', {r2})

In [0]:
lrModel.intercept
lrModel.coefficients

In [0]:
# oneHoTEncoding for string data 
df = spark.read.format('parquet').table('training_dataset_numeric')
df.dtypes

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import StringIndexer, IndexToString

categoricalCols = [field for (field, dataType) in df.dtypes if dataType == 'string']
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOuputCols = [x + "OHE" for x in categoricalCols]

stringIndexer = StringIndexer(inputCols = categoricalCols, outputCols = indexOutputCols, handleInvalid='skip')
oheEncoder = OneHotEncoder(inputCols = indexOutputCols, outputCols = oheOuputCols)

In [0]:
stringDF = stringIndexer.fit(df).transform(df)
display(stringDF)

oheEncoderDF = oheEncoder.fit(stringDF).transform(stringDF)
display(oheEncoderDF)

review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,bathrooms_na,review_scores_checkinIndex,latitudeIndex,longitudeIndex,host_total_listings_countIndex,accommodatesIndex,review_scores_locationIndex,bathroomsIndex,instant_bookableIndex,maximum_nightsIndex,number_of_reviewsIndex,review_scores_cleanlinessIndex,room_typeIndex,neighbourhood_group_cleansedIndex,property_typeIndex,review_scores_communicationIndex,bedsIndex


review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,bathrooms_na,review_scores_checkinIndex,latitudeIndex,longitudeIndex,host_total_listings_countIndex,accommodatesIndex,review_scores_locationIndex,bathroomsIndex,instant_bookableIndex,maximum_nightsIndex,number_of_reviewsIndex,review_scores_cleanlinessIndex,room_typeIndex,neighbourhood_group_cleansedIndex,property_typeIndex,review_scores_communicationIndex,bedsIndex,review_scores_checkinOHE,review_scores_locationOHE,host_total_listings_countOHE,property_typeOHE,instant_bookableOHE,longitudeOHE,review_scores_cleanlinessOHE,bedsOHE,number_of_reviewsOHE,maximum_nightsOHE,room_typeOHE,bathroomsOHE,latitudeOHE,neighbourhood_group_cleansedOHE,review_scores_communicationOHE,accommodatesOHE


In [0]:
from pyspark.ml import Pipeline

splits = df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

stages = [stringIndexer, oheEncoder, vecAssembler, lr]

pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(train_df)

In [0]:
# saving
pipelineModel.write().overwrite().save(pipeline_paths)