In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv('flights.csv', inferSchema = True, header = True)

In [6]:
assembler = VectorAssembler(
    inputCols = ['mile'],
    outputCol = 'features') # input

In [7]:
data_pre = assembler.transform(data)

In [8]:
data_pre.show(2)

+---+---+---+-------+------+---+----+------+--------+-----+--------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|features|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|[2153.0]|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| [316.0]|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
only showing top 2 rows



In [9]:
final_data = data_pre.select('features', 'duration')

In [10]:
final_data.show(5)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|     351|
| [316.0]|      82|
| [337.0]|      82|
|[1236.0]|     195|
| [258.0]|      65|
+--------+--------+
only showing top 5 rows



In [15]:
# Check Null values
from pyspark.sql.functions import isnan, when, count, col
final_data.select([count(when(isnan(c), c)).alias(c) for c in final_data.columns]).toPandas().T

AnalysisException: "cannot resolve 'isnan(`features`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`features`' is of struct<type:tinyint,size:int,indices:array<int>,values:array<double>> type.;;\n'Aggregate [count(CASE WHEN isnan(features#31) THEN features END) AS features#131, count(CASE WHEN isnan(cast(duration#18 as double)) THEN duration END) AS duration#133L]\n+- Project [features#31, duration#18]\n   +- Project [mon#10, dom#11, dow#12, carrier#13, flight#14, org#15, mile#16, depart#17, duration#18, delay#19, UDF(named_struct(mile_double_VectorAssembler_ec6b7e54fe68, cast(mile#16 as double))) AS features#31]\n      +- Relation[mon#10,dom#11,dow#12,carrier#13,flight#14,org#15,mile#16,depart#17,duration#18,delay#19] csv\n"

In [14]:
final_data.select([count(when(col(c).isNull(), c)).alias(c) for c in final_data.columns]).toPandas().T

Unnamed: 0,0
features,0
duration,0


In [13]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [16]:
from pyspark.ml.regression import LinearRegressionModel

In [20]:
# Create a Linear Regression Model object
lr = LinearRegression(featuresCol='features',
                      labelCol='duration',
                      predictionCol='prediction')

In [18]:
# Fit the model to the data and call this model LrModel
lrModel = lr.fit(train_data,)

In [19]:
# Print the coefficients and intercept for linear regression
print('Coefficients: {} Intercept: {}'.format(lrModel.coefficients, lrModel.intercept))

Coefficients: [0.12164341063857562] Intercept: 44.381052692606644


In [25]:
test_results = lrModel.evaluate(test_data)

In [21]:
# Check test dataset
test_model = lrModel.transform(test_data)

In [22]:
# Inspect results
test_model.select('prediction', 'duration').show(5)

+------------------+--------+
|        prediction|duration|
+------------------+--------+
|52.531161205391214|      43|
|52.531161205391214|      44|
|52.531161205391214|      44|
|52.531161205391214|      46|
|52.531161205391214|      46|
+------------------+--------+
only showing top 5 rows



In [26]:
# Interesting results
test_results.residuals.show(5)

+------------------+
|         residuals|
+------------------+
|-9.531161205391214|
|-8.531161205391214|
|-8.531161205391214|
|-6.531161205391214|
|-6.531161205391214|
+------------------+
only showing top 5 rows



In [27]:
print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('r2: {}'.format(test_results.r2))

RMSE: 17.05041104807629
MSE: 290.71651690836194
r2: 0.9613910162158743


In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

In [29]:
RegressionEvaluator(labelCol='duration').evaluate(test_model)

17.05041104807629

In [30]:
# Save model
lrModel.save('lrModel_Flights_50k')

from pyspark.ml.regression import LinearRegressionModel
# Load model from
lrModel2 = LinearRegressionModel.load('lrModel_Flights_50k')

# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

In [31]:
predictions = lrModel2.transform(unlabeled_data)

In [32]:
predictions.show(5)

+--------+------------------+
|features|        prediction|
+--------+------------------+
|  [67.0]|52.531161205391214|
|  [67.0]|52.531161205391214|
|  [67.0]|52.531161205391214|
|  [67.0]|52.531161205391214|
|  [67.0]|52.531161205391214|
+--------+------------------+
only showing top 5 rows

