In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the notebook's Spark session under the application name "lr_example".
spark = (
    SparkSession.builder
    .appName("lr_example")
    .getOrCreate()
)

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
%sh
# Download the sample linear-regression dataset into the driver's local /tmp.
# Remove any copy left by a previous run first: wget would otherwise save the
# new download under a numbered name (….txt.1) and leave the stale file in place.
# NOTE(review): the URL tracks the `master` branch, so the data is not version-pinned.
rm -f /tmp/sample_linear_regression_data.txt
wget -P /tmp https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_linear_regression_data.txt
# Print the shell's current working directory.
pwd

In [0]:
# List the entries under the DBFS FileStore root.
# NOTE(review): the original "caution!" marker does not say what to be careful
# about — presumably the shared FileStore contents; confirm with the author.
dbutils.fs.ls("dbfs:/FileStore/") # caution!

In [0]:
# Copy the downloaded file from the driver's local /tmp into DBFS.
# The destination names the file explicitly: when only the directory path is
# given as the target, the copy can land as a file called "datasets", and the
# later read of .../datasets/sample_linear_regression_data.txt would not find it.
dbutils.fs.mkdirs("dbfs:/BDA_UM_FLINT/datasets")  # make sure the target directory exists
dbutils.fs.cp(
    "file:/tmp/sample_linear_regression_data.txt",
    "dbfs:/BDA_UM_FLINT/datasets/sample_linear_regression_data.txt",
)

In [0]:
# Read the LIBSVM-formatted sample file from DBFS into the training DataFrame.
training = (
    spark.read
    .format("libsvm")
    .load("dbfs:/BDA_UM_FLINT/datasets/sample_linear_regression_data.txt")
)

In [0]:
# Sanity check: number of rows that were loaded.
training.count()

In [0]:
# Print a tabular preview of the loaded rows.
training.show()

In [0]:
# Configure the estimator: which columns hold the inputs and the target, and
# which column the model should write its predictions to.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)

In [0]:
# Fit the linear regression on the full dataset (no hold-out at this stage).
lrModel = lr.fit(training)

In [0]:
# Fitted coefficients of the model.
lrModel.coefficients

In [0]:
# Fitted intercept of the model.
lrModel.intercept

In [0]:
# Print the fitted parameters, separated by blank lines.
coefficients_text = str(lrModel.coefficients)
intercept_text = str(lrModel.intercept)

print("Coefficients: {}".format(coefficients_text))
print("\n")
print("Intercept: {}".format(intercept_text))

In [0]:
# Summary object of the fit above; the following cells read its metrics.
trainingSummary = lrModel.summary

In [0]:
# R² reported by the training summary.
trainingSummary.r2

In [0]:
# Mean squared error reported by the training summary.
trainingSummary.meanSquaredError

In [0]:
# Root mean squared error reported by the training summary.
trainingSummary.rootMeanSquaredError

In [0]:
# Show the first residuals, then print each training metric on its own line.
trainingSummary.residuals.show(10)

for line_template, metric_value in (
    ("MAE: {}", trainingSummary.meanAbsoluteError),
    ("MSE: {}", trainingSummary.meanSquaredError),
    ("RMSE: {}", trainingSummary.rootMeanSquaredError),
    ("r2: {}", trainingSummary.r2),
):
    print(line_template.format(metric_value))

## Train and Test Split

In [0]:
# Load the same LIBSVM file again as the starting point for a train/test split.
all_data = (
    spark.read
    .format("libsvm")
    .load("dbfs:/BDA_UM_FLINT/datasets/sample_linear_regression_data.txt")
)

In [0]:
# Split roughly 70% / 30% into training and test sets.
# A fixed seed keeps the split — and every metric computed from it below —
# the same across notebook re-runs; without it each run draws a new split.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Print a tabular preview of the training split.
train_data.show()

In [0]:
# Training split: a five-row sample followed by its describe() statistics.
train_data.show(5)
train_stats = train_data.describe()
train_stats.show()

In [0]:
# Test split: a five-row sample followed by its describe() statistics.
test_data.show(5)
test_stats = test_data.describe()
test_stats.show()

In [0]:
# Refit the same estimator, this time on the training split only, so the test
# split stays unseen for evaluation.
correct_model = lr.fit(train_data)

In [0]:
# Evaluate the model fitted on the training split against the test split.
test_results = correct_model.evaluate(test_data)

In [0]:
# Show a few residuals on the test split, then report the test RMSE.
# (A bare `test_results.rootMeanSquaredError` expression used to sit between
# these two lines; mid-cell its value was discarded, so it has been removed.)
test_results.residuals.show(5)
print("RMSE on test data = %g" % test_results.rootMeanSquaredError)

In [0]:
# Keep only the feature column of the test split to mimic unlabeled input,
# then preview it.
unlabeled_data = test_data.select("features")
unlabeled_data.show()

In [0]:
# Apply the fitted model to the feature-only data to produce predictions.
predictions = correct_model.transform(unlabeled_data)

In [0]:
# Print a tabular preview of the model output.
predictions.show()