In [None]:
# Cài đặt pyspark
!pip install pyspark
!pip install findspark
import findspark
findspark.init()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=814a311dff0324172fd2bc2724df743913029bcd4642e55189709336a32bd53f
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


### Import the required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

### Spark Context and Session
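The Spark context is the entry point of any Spark application. Before we can do anything in Spark, such as loading data, we need to create one; it is much like creating an object of a class. In recent Spark versions this is done through a `SparkSession`, which wraps the Spark context, so that is what we create below.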

from pyspark import SparkContext, SparkConf
# Creating a Spark session (it wraps the Spark context)
sc = SparkSession.builder.master("local[*]").getOrCreate()

    local[*] → Run Spark locally, using all available CPU cores.
    getOrCreate → Return the existing session if one has already been created; otherwise create a new one.


In [None]:
# Start (or reuse) a Spark session running on this machine.
#   master("local[*]") -> run locally, using all available CPU cores.
#   getOrCreate()      -> hand back the session that already exists, or create one if none does.
# Note: despite the name `sc`, this object is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Read the car dataset: the first line of the file holds the column names,
# and Spark infers each column's type from its values.
data = sc.read.csv(
    "/content/car_data.csv",
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
data.show(5)

+----+----------+----+--------------------+---------+----------------+-----------------+----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
|Make|     Model|Year|    Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|   Driven_Wheels|Number of Doors|     Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity| MSRP|
+----+----------+----+--------------------+---------+----------------+-----------------+----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
| BMW|1 Series M|2011|premium unleaded ...|      335|               6|           MANUAL|rear wheel drive|              2|Factory Tuner,Lux...|     Compact|        Coupe|         26|      19|      3916|46135|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL|rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible

In [None]:
# Print every column with the type Spark inferred for it and its nullability.
data.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Engine Fuel Type: string (nullable = true)
 |-- Engine HP: integer (nullable = true)
 |-- Engine Cylinders: integer (nullable = true)
 |-- Transmission Type: string (nullable = true)
 |-- Driven_Wheels: string (nullable = true)
 |-- Number of Doors: integer (nullable = true)
 |-- Market Category: string (nullable = true)
 |-- Vehicle Size: string (nullable = true)
 |-- Vehicle Style: string (nullable = true)
 |-- highway MPG: integer (nullable = true)
 |-- city mpg: integer (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- MSRP: integer (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column, brought
# into pandas and flipped so that each dataset column becomes one row.
(
    data
    .describe()
    .toPandas()
    .T
)

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Make,11914,,,Acura,Volvo
Model,11914,745.5822222222222,1490.8280590623795,1 Series,xD
Year,11914,2010.384337753903,7.5797398875957995,1990,2017
Engine Fuel Type,11911,,,diesel,regular unleaded
Engine HP,11845,249.38607007176023,109.19187025917194,55,1001
Engine Cylinders,11884,5.628828677213059,1.78055934824622,0,16
Transmission Type,11914,,,AUTOMATED_MANUAL,UNKNOWN
Driven_Wheels,11914,,,all wheel drive,rear wheel drive
Number of Doors,11908,3.4360933825999327,0.8813153865835529,2,4


### Data Cleaning

In [None]:
from pyspark.sql.functions import when,lit,count,isnan,col

def replace(column, value):
    """Return a column expression in which `value` is replaced by null.

    Args:
        column: the Spark column expression to clean.
        value: the placeholder value that should be treated as missing.

    Returns:
        A column expression that yields the original entry wherever it differs
        from `value`, and null everywhere else.
    """
    differs_from_placeholder = column != value
    return when(differs_from_placeholder, column).otherwise(lit(None))

# Treat the literal string "N/A" in "Market Category" as a missing value.
data = data.withColumn(
    "Market Category",
    replace(col("Market Category"), "N/A"),
)

# For every column, count the rows whose value is NaN or null and show the
# totals as a single-row table.
data.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in data.columns
    ]
).show()

+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+
|Make|Model|Year|Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|Driven_Wheels|Number of Doors|Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity|MSRP|
+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+
|   0|    0|   0|               3|       69|              30|                0|            0|              6|           3742|           0|            0|          0|       0|         0|   0|
+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+



The Market Category column has by far the most null or NaN values (3,742 of 11,914 rows). Since it is not one of the features we will use in the model, we drop the whole column rather than lose all of those rows.

In [None]:
# Remove the "Market Category" column, then discard every row that still
# contains a null value in any of the remaining columns.
data = data.drop("Market Category").na.drop()

# Report what is left as (number of rows, number of columns).
print((data.count(), len(data.columns)))

(11812, 15)


Feature Vectors in Spark ML-lib

Spark ML-lib accepts our data in the form of Feature Vectors. We convert our regular columns into the Spark feature vectors.

This is done using `VectorAssembler` from Spark MLlib (`pyspark.ml.feature`).

We want to pass the columns "Year", "highway MPG", "Engine Cylinders",
"Number of Doors", "city mpg", "Engine HP" and "Popularity" as input features in our model.

We want to train our model on the above features, to do that we need to convert our selected features into a Vector.

In [None]:
# VectorAssembler from the Spark ML-lib class
from pyspark.ml.feature import VectorAssembler
# Combine the numeric input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        "Year",
        "Engine HP",
        "Engine Cylinders",
        "Number of Doors",
        "Popularity",
        "highway MPG",
        "city mpg",
    ],
    outputCol="features",
)

In [None]:
# Add the assembled "features" vector column to the dataset.
# Any "features" column left over from a previous run of this cell is dropped
# first (a no-op when the column does not exist), so the cell can be re-run
# without failing because the assembler's output column already exists.
data = assembler.transform(data.drop("features"))

# Keep only what the model needs: the feature vector and the target price.
final_data = data.select("features", "MSRP")

In [None]:
# Randomly split the rows into roughly 80% training and 20% test data;
# the fixed seed keeps the split repeatable between runs.
train_data, test_data = final_data.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Configure a linear regression that predicts "MSRP" from the "features" vector
# and writes its output to a "predicted_MSRP" column, then fit it on the
# training split.
lr = LinearRegression(
    featuresCol="features",
    labelCol="MSRP",
    predictionCol="predicted_MSRP",
)
lr_model = lr.fit(train_data)

In [None]:
# Predict prices for the held-out test split and score them against the true prices.
predictions = lr_model.transform(test_data)

# RMSE: typical size of the prediction error, in the same units as MSRP.
evaluator = RegressionEvaluator(labelCol="MSRP", predictionCol="predicted_MSRP", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

# R2 must compare the predictions with the true label column "MSRP".
# Using the prediction column as the label compares the predictions with
# themselves and therefore always reports a meaningless, perfect 1.000.
evaluator_r2 = RegressionEvaluator(labelCol="MSRP", predictionCol="predicted_MSRP", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data: {:.3f}".format(r2))

Root Mean Squared Error (RMSE) on test data: 32476.857
R-squared (R2) on test data: 1.000
