In [1]:
# Let findspark locate the local Spark installation so that pyspark can be
# imported; intended to run before the pyspark imports in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import corr

In [3]:
# Reuse the active context/session when one already exists. Calling
# SparkContext() directly raises "Cannot run multiple SparkContexts at once"
# if this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

### Chuẩn bị và chuẩn hóa dữ liệu, xác định inputs, output

In [4]:
# Load the flights CSV: the first row supplies the column names and the
# column types are inferred from the data.
# NOTE(review): missing delays appear as the literal "NA" in the file, which
# is why `delay` is inferred as a string column; pass nullValue="NA" to the
# reader if `delay` is ever needed as a number.
data = spark.read.csv(
    path=["../../Data/flights.csv"],
    header=True,
    inferSchema=True,
)

In [5]:
# Print the inferred schema of the DataFrame (column names, types, nullability)
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [6]:
# Preview the first three rows to sanity-check the parsed values
data.show(n=3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [7]:
# List the column names available as candidate features / label
data.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay']

In [8]:
# Pack the single predictor column (`mile`) into a vector column named
# `features`, which is the column the regression below reads its inputs from.
assembler = VectorAssembler(
    inputCols=["mile"],
    outputCol="features",
)

In [9]:
# Apply the assembler: keeps the original columns and appends `features`
data_pre = assembler.transform(data)

In [10]:
# Show the first two assembled feature vectors without truncating the cells
data_pre.select('features').show(n=2, truncate=False)

+--------+
|features|
+--------+
|[2153.0]|
|[316.0] |
+--------+
only showing top 2 rows



In [11]:
# Keep only what the model needs: the input vector and the label (`duration`)
final_data = data_pre.select('features', 'duration')

In [12]:
# Preview the modelling table (input vector + label), untruncated
final_data.show(n=3, truncate=False)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|351     |
|[316.0] |82      |
|[337.0] |82      |
+--------+--------+
only showing top 3 rows



### Chuẩn bị train/test dataset

In [13]:
# Randomly split the rows into ~80% training and ~20% test data.
# A fixed seed keeps the split (and therefore every metric reported below)
# stable across "Restart & Run All"; without it each run trains and
# evaluates on different rows.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [14]:
# Summary statistics of the training split; only `duration` is summarized,
# the vector-typed `features` column does not appear in the output.
train_data.describe().show()

+-------+-----------------+
|summary|         duration|
+-------+-----------------+
|  count|            40206|
|   mean|151.6067004924638|
| stddev|86.70382192325576|
|    min|               30|
|    max|              560|
+-------+-----------------+



In [15]:
# Same summary for the test split, to check that both splits have a similar
# `duration` distribution.
test_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|              9794|
|   mean|152.41903206044518|
| stddev|  88.4336473631593|
|    min|                30|
|    max|               560|
+-------+------------------+



### Xây dựng model với train dataset

In [16]:
# Configure (but do not yet fit) a linear regression estimator:
# inputs come from `features`, the target is `duration`, and predictions
# are written to a `prediction` column.
lr = LinearRegression(
    featuresCol='features',
    labelCol='duration',
    predictionCol='prediction',
)

In [17]:
# Fit the estimator on the training split only; the fitted model is kept in
# `lrModel` and reused by the evaluation and save cells below.
lrModel = lr.fit(train_data)

In [18]:
# Report the fitted line: one coefficient for the single `mile` feature,
# plus the intercept.
print(
    'Coefficients: {}, Intercept: {}'.format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [0.12163311780959021], Intercept: 44.44337070409753


### Đánh giá model với test dataset

In [19]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the error metrics inspected in the next cells.
test_results = lrModel.evaluate(test_data)

In [20]:
# Residuals for the first five test rows (label minus prediction, as can be
# cross-checked against the prediction/duration preview further down).
test_results.residuals.show(n=5)

+------------------+
|         residuals|
+------------------+
|-9.592789597340072|
|-9.592789597340072|
|-8.592789597340072|
|-8.592789597340072|
|-6.592789597340072|
+------------------+
only showing top 5 rows



In [21]:
# Test-set error metrics: root mean squared error, mean squared error
# and the coefficient of determination (R^2).
print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('r2: {}'.format(test_results.r2))

RMSE: 17.17209772759433
MSE: 294.8809403660503
r2: 0.9622900485651407


In [22]:
# Score the test split: transform() appends a `prediction` column to it
test_model = lrModel.transform(test_data)

In [23]:
# Compare predicted and actual durations side by side for five test rows
test_model.select('prediction', 'duration').show(n=5)

+-----------------+--------+
|       prediction|duration|
+-----------------+--------+
|52.59278959734007|      43|
|52.59278959734007|      43|
|52.59278959734007|      44|
|52.59278959734007|      44|
|52.59278959734007|      46|
+-----------------+--------+
only showing top 5 rows



In [24]:
from pyspark.ml.evaluation import RegressionEvaluator

In [25]:
# Recompute the test RMSE with the generic evaluator as a cross-check of
# test_results.rootMeanSquaredError. The prediction column and metric are
# spelled out here; they are the evaluator's defaults.
RegressionEvaluator(
    labelCol='duration',
    predictionCol='prediction',
    metricName='rmse',
).evaluate(test_model)

17.17209772759433

### Lưu trữ và tải model

In [26]:
# Persist the fitted model to disk. overwrite() replaces a model left behind
# by a previous run; a plain save() raises an error when the target path
# already exists, which breaks re-running the notebook.
lrModel.write().overwrite().save('lrModel_flights')

In [27]:
from pyspark.ml.regression import LinearRegressionModel
# Load the fitted model back from the path it was saved to above
lrModel2 = LinearRegressionModel.load('lrModel_flights')

### Dự đoán dữ liệu mới

In [28]:
# Simulate new, unlabeled data by dropping the label from the test split
unlabeled_data = test_data.select('features')

In [29]:
# Predict with the reloaded model; adds a `prediction` column to the input
predictions = lrModel2.transform(unlabeled_data)

In [30]:
# Show the first five predictions made by the reloaded model
predictions.show(n=5)

+--------+-----------------+
|features|       prediction|
+--------+-----------------+
|  [67.0]|52.59278959734007|
|  [67.0]|52.59278959734007|
|  [67.0]|52.59278959734007|
|  [67.0]|52.59278959734007|
|  [67.0]|52.59278959734007|
+--------+-----------------+
only showing top 5 rows

