In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import corr

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

### Chuẩn bị và chuẩn hóa dữ liệu, xác định inputs, output

In [4]:
# Use Spark to read in the flights csv file
data = spark.read.csv(["../../Data/flights.csv"], header=True, inferSchema=True)

In [5]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [6]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [7]:
data.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay']

In [8]:
assembler = VectorAssembler(inputCols=["mile"],
                            outputCol="features")

In [9]:
data_pre = assembler.transform(data)

In [10]:
data_pre.select('features').show(2, False)

+--------+
|features|
+--------+
|[2153.0]|
|[316.0] |
+--------+
only showing top 2 rows



In [11]:
final_data = data_pre.select('features', 'duration')

In [12]:
final_data.show(3, truncate=False)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|351     |
|[316.0] |82      |
|[337.0] |82      |
+--------+--------+
only showing top 3 rows



### Chuẩn bị train/test dataset

In [13]:
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [14]:
train_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             40121|
|   mean|151.84180354427855|
| stddev| 87.13775106230814|
|    min|                30|
|    max|               560|
+-------+------------------+



In [15]:
test_data.describe().show()

+-------+-----------------+
|summary|         duration|
+-------+-----------------+
|  count|             9879|
|   mean|151.4572325134123|
| stddev|86.67137754669051|
|    min|               30|
|    max|              560|
+-------+-----------------+



### Xây dựng model với train dataset

In [16]:
# Create a Linear Regression Model object
lr = LinearRegression(featuresCol='features',
                      labelCol='duration',
                      predictionCol='prediction')

In [17]:
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)

In [18]:
# Print the coefficients and intercept for linear regression
print('Coefficients: {}, Intercept: {}'.format(lrModel.coefficients, lrModel.intercept))

Coefficients: [0.12160147121099962], Intercept: 44.50923645469706


### Đánh giá model với test dataset

In [19]:
test_model = lrModel.evaluate(test_data)

In [20]:
# Interesting results...
test_model.residuals.show(5)

+------------------+
|         residuals|
+------------------+
|-8.656535025834032|
|-6.656535025834032|
|-5.656535025834032|
|-5.656535025834032|
|-5.656535025834032|
+------------------+
only showing top 5 rows



In [21]:
print('RMSE: {}'.format(test_model.rootMeanSquaredError))
print('MSE: {}'.format(test_model.meanSquaredError))
print('r2: {}'.format(v.r2))

RMSE: 16.75783537998422
MSE: 280.8250466226508
r2: 0.9626123296561192


In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

In [24]:
RegressionEvaluator(labelCol='duration').evaluate(test_model)

AttributeError: 'LinearRegressionModel' object has no attribute '_jdf'

In [None]:
# Check test dataset
test_model = lrModel.transform(test_data)

In [None]:
# Inspect results
test_model.select('prediction', 'duration').show(5)

### Lưu trữ và tải model

In [None]:
# Save Model
lrModel.save('lrModel_flights')

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Load model from
lrModel2 = LinearRegressionModel.load('lrModel_flights')

### Dự đoán dữ liệu mới

In [None]:
unlabeled_data = test_data.select('features')

In [None]:
predictions = lrModel2.transform(unlabeled_data)

In [None]:
predictions.show(5)