In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName("S6").getOrCreate()

In [6]:
file_path = "/content/housing.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.show()

+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+-------------------------------------+--------------------+----------+---------------+-----------+---------+---------+-----------------+--------------+------------------------+-------------------------+-------+
|        id| Date|number of bedrooms|number of bathrooms|living area|lot area|number of floors|waterfront present|number of views|condition of the house|grade of the house|Area of the house(excluding basement)|Area of the basement|Built Year|Renovation Year|Postal Code|Lattitude|Longitude|living_area_renov|lot_area_renov|Number of schools nearby|Distance from the airport|  Price|
+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+-------------------------------------+--------------------+---

In [7]:
#Remove null

df.dropna()

DataFrame[id: bigint, Date: int, number of bedrooms: int, number of bathrooms: double, living area: int, lot area: int, number of floors: double, waterfront present: string, number of views: int, condition of the house: int, grade of the house: int, Area of the house(excluding basement): int, Area of the basement: int, Built Year: int, Renovation Year: int, Postal Code: int, Lattitude: double, Longitude: double, living_area_renov: int, lot_area_renov: int, Number of schools nearby: int, Distance from the airport: int, Price: int]

In [8]:
# One Hot Encode the column 'waterfront present'
indexer = StringIndexer(inputCol='waterfront present', outputCol='WaterfrontIndex')
df = indexer.fit(df).transform(df)
df.show()

+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+-------------------------------------+--------------------+----------+---------------+-----------+---------+---------+-----------------+--------------+------------------------+-------------------------+-------+---------------+
|        id| Date|number of bedrooms|number of bathrooms|living area|lot area|number of floors|waterfront present|number of views|condition of the house|grade of the house|Area of the house(excluding basement)|Area of the basement|Built Year|Renovation Year|Postal Code|Lattitude|Longitude|living_area_renov|lot_area_renov|Number of schools nearby|Distance from the airport|  Price|WaterfrontIndex|
+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+------------------------------

In [14]:
model_features = ['number of bedrooms', 'number of bathrooms', 'living area', 'lot area', 'number of floors','number of views','condition of the house','grade of the house','Area of the house(excluding basement)','Area of the basement','Built Year','Renovation Year','Postal Code','Lattitude','Longitude','living_area_renov','lot_area_renov','Number of schools nearby','Distance from the airport','Price','WaterfrontIndex']

In [15]:
# Create a new column called FeatureVector where each cell contains the entire row as a 1D vector
assembler = VectorAssembler(inputCols=model_features, outputCol="FeatureVector")
df = assembler.transform(df)
df.show()

+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+-------------------------------------+--------------------+----------+---------------+-----------+---------+---------+-----------------+--------------+------------------------+-------------------------+-------+---------------+--------------------+
|        id| Date|number of bedrooms|number of bathrooms|living area|lot area|number of floors|waterfront present|number of views|condition of the house|grade of the house|Area of the house(excluding basement)|Area of the basement|Built Year|Renovation Year|Postal Code|Lattitude|Longitude|living_area_renov|lot_area_renov|Number of schools nearby|Distance from the airport|  Price|WaterfrontIndex|       FeatureVector|
+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+-------

In [17]:
# Scale the FeatureVector using MinMaxScaler

scaler = MinMaxScaler(inputCol='FeatureVector', outputCol='ScaledFeatureVector')
df = scaler.fit(df).transform(df)
df.show()

+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+---------------+----------------------+------------------+-------------------------------------+--------------------+----------+---------------+-----------+---------+---------+-----------------+--------------+------------------------+-------------------------+-------+---------------+--------------------+--------------------+
|        id| Date|number of bedrooms|number of bathrooms|living area|lot area|number of floors|waterfront present|number of views|condition of the house|grade of the house|Area of the house(excluding basement)|Area of the basement|Built Year|Renovation Year|Postal Code|Lattitude|Longitude|living_area_renov|lot_area_renov|Number of schools nearby|Distance from the airport|  Price|WaterfrontIndex|       FeatureVector| ScaledFeatureVector|
+----------+-----+------------------+-------------------+-----------+--------+----------------+------------------+----

In [19]:
# Create new DF with only ScaledFeatureVector and Price

data = df.select(col("ScaledFeatureVector").alias("features"), col("Price").alias("label"))
data.show()

+--------------------+-------+
|            features|  label|
+--------------------+-------+
|[0.09375,0.266666...|1400000|
|[0.125,0.3,0.1928...|1200000|
|[0.09375,0.266666...| 838000|
|[0.0625,0.2,0.177...| 805000|
|[0.0625,0.2666666...| 790000|
|[0.125,0.36666666...| 785000|
|[0.0625,0.1666666...| 750000|
|[0.0625,0.2666666...| 750000|
|[0.09375,0.233333...| 698000|
|[0.125,0.26666666...| 675000|
|[0.09375,0.2,0.11...| 650000|
|[0.09375,0.2,0.08...| 640000|
|[0.09375,0.3,0.17...| 630000|
|[0.0625,0.2333333...| 626000|
|[0.09375,0.266666...| 625000|
|[0.09375,0.366666...| 625000|
|[0.0625,0.1666666...| 615000|
|[0.09375,0.266666...| 612500|
|[0.0625,0.2666666...| 604000|
|[0.0625,0.1666666...| 588500|
+--------------------+-------+
only showing top 20 rows



In [20]:
train_set, test_set = data.randomSplit([0.8, 0.2])

In [21]:
model = LinearRegression(featuresCol='features', labelCol='label')
model = model.fit(train_set)

In [22]:
predictions = model.transform(test_set)
predictions.show()

+--------------------+------+------------------+
|            features| label|        prediction|
+--------------------+------+------------------+
|[0.0,0.0,0.038724...|255000|254999.99546603102|
|[0.0,0.0333333333...|145000|145000.00992750635|
|[0.0,0.0666666666...|192500|192500.01536181298|
|[0.0,0.0666666666...|295000| 294999.9937443325|
|[0.0,0.0666666666...|238000| 238000.0154794529|
|[0.0,0.0666666666...|290000| 289999.9902487095|
|[0.0,0.0666666666...|106000|106000.00452959171|
|[0.0,0.0666666666...|330600|   330599.99062688|
|[0.0,0.0666666666...|202000|   201999.99216368|
|[0.0,0.0666666666...|279200|279200.00456454605|
|[0.0,0.0666666666...|280000|279999.99880696565|
|[0.0,0.0666666666...|315000| 314999.9960885464|
|[0.0,0.0666666666...|385195| 385195.0129160316|
|[0.0,0.0666666666...|335000| 334999.9950748203|
|[0.0,0.0666666666...|270000|269999.99050150235|
|[0.0,0.0666666666...|250000| 250000.0220061415|
|[0.0,0.0666666666...|395000| 394999.9907916668|
|[0.0,0.0666666666..

In [25]:
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(rmse)

0.00812676035030852


In [26]:
coef = model.coefficients
intercept = model.intercept
print(coef)
print(intercept)

[-0.028890069701967905,0.04821826382449324,-0.035097536313111063,-0.03262878311890781,0.02294484990943174,0.0004171814053658231,0.01704518967639278,-0.06805864937778702,-0.030272809952721837,2.063707839179173e-05,0.008648878546063908,0.002489621207720053,0.006633417523983837,0.009179182444621759,0.006584029571822754,0.03219654891380277,0.04638707901804883,0.0009906380700399698,0.0002541361613098851,7622000.09435314,0.0012964059054715621]
77999.98228564487


In [28]:
feature_importance = zip(model_features, coef)
for feature, importance in feature_importance:
  print(f"{feature}: {importance}")

number of bedrooms: -0.028890069701967905
number of bathrooms: 0.04821826382449324
living area: -0.035097536313111063
lot area: -0.03262878311890781
number of floors: 0.02294484990943174
number of views: 0.0004171814053658231
condition of the house: 0.01704518967639278
grade of the house: -0.06805864937778702
Area of the house(excluding basement): -0.030272809952721837
Area of the basement: 2.063707839179173e-05
Built Year: 0.008648878546063908
Renovation Year: 0.002489621207720053
Postal Code: 0.006633417523983837
Lattitude: 0.009179182444621759
Longitude: 0.006584029571822754
living_area_renov: 0.03219654891380277
lot_area_renov: 0.04638707901804883
Number of schools nearby: 0.0009906380700399698
Distance from the airport: 0.0002541361613098851
Price: 7622000.09435314
WaterfrontIndex: 0.0012964059054715621


In [29]:
#Printing as list for fun
feature_importance2 = list(zip(model_features, coef))
print(feature_importance2)

[('number of bedrooms', -0.028890069701967905), ('number of bathrooms', 0.04821826382449324), ('living area', -0.035097536313111063), ('lot area', -0.03262878311890781), ('number of floors', 0.02294484990943174), ('number of views', 0.0004171814053658231), ('condition of the house', 0.01704518967639278), ('grade of the house', -0.06805864937778702), ('Area of the house(excluding basement)', -0.030272809952721837), ('Area of the basement', 2.063707839179173e-05), ('Built Year', 0.008648878546063908), ('Renovation Year', 0.002489621207720053), ('Postal Code', 0.006633417523983837), ('Lattitude', 0.009179182444621759), ('Longitude', 0.006584029571822754), ('living_area_renov', 0.03219654891380277), ('lot_area_renov', 0.04638707901804883), ('Number of schools nearby', 0.0009906380700399698), ('Distance from the airport', 0.0002541361613098851), ('Price', 7622000.09435314), ('WaterfrontIndex', 0.0012964059054715621)]


In [30]:
spark.stop()