In [0]:
# Azure Blob Storage location of the input data.
storage_account_name = "databricksdemostorage"
container = "data"

# The account key is read from the Databricks secret scope "Keys" (secret
# name "Storage") so that it never appears in the notebook source.
storage_account_key = dbutils.secrets.get(scope="Keys", key="Storage")

# Register the key on the Spark session under the account-specific config name.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    storage_account_key,
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Explicit schema for the input file: two string identifier columns followed
# by eight double-typed numeric columns. Every field is declared nullable.
schema = StructType(
    [StructField(name, StringType(), True) for name in ("Vendor", "Model")]
    + [
        StructField(name, DoubleType(), True)
        for name in (
            "CycleTime",
            "MinMainMemory",
            "MaxMainMemory",
            "Cache",
            "MinChannels",
            "MaxChannels",
            "PublishedPerf",
            "RelativePerf",
        )
    ]
)

In [0]:
# Load the CSV from Blob Storage using the explicit schema.
# The field-separator option is spelled "delimiter" (an alias of "sep");
# a misspelled option name is not recognized and would never be applied.
# NOTE(review): header=true treats the first line of the file as a header row
# and excludes it from the data — confirm machine.data actually has one.
data = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(schema)
    .csv(f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/machine.data")
)

# Preview the first rows to sanity-check the parse.
data.show()

In [0]:
# Split the rows roughly 80/20 into training and test sets.
# A fixed seed makes the split (and therefore the fitted model and its
# metrics) repeatable across runs; without it every run samples differently.
(train_data, test_data) = data.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Row counts of the training split, then the test split, one per line.
print(train_data.count(), test_data.count(), sep="\n")

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Combine the six numeric input columns into a single vector column named
# "features".
vectors = VectorAssembler(
    outputCol="features",
    inputCols=[
        "CycleTime",
        "MinMainMemory",
        "MaxMainMemory",
        "Cache",
        "MinChannels",
        "MaxChannels",
    ],
)

# Apply the assembler to the training split.
vector_data = vectors.transform(dataset=train_data)

In [0]:
# Preview the assembled training rows (show()'s defaults, spelled out).
vector_data.show(n=20, truncate=True)

In [0]:
# Keep only the assembled feature vector and the label column for training.
features_data = vector_data.select("features", "PublishedPerf")
features_data.show()

In [0]:
# Linear regression estimator: predicts "PublishedPerf" from "features".
lr = LinearRegression(
    featuresCol="features",
    labelCol="PublishedPerf",
)

In [0]:
# Fit the regression on the training features/label frame.
model = lr.fit(dataset=features_data)

In [0]:
# Summary object exposed by the fitted model.
summary = model.summary

# R^2 as reported by the model summary.
print("R^2", summary.r2)

In [0]:
# Evaluator that scores the "prediction" column against the "PublishedPerf"
# label using the R^2 metric.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="PublishedPerf",
    predictionCol="prediction",
)

In [0]:
# Assemble the feature vector for the test split with the same assembler
# that was applied to the training split.
vector_test = vectors.transform(dataset=test_data)

In [0]:
# Keep only the feature vector and the label column of the test split.
features_test = vector_test.select("features", "PublishedPerf")

In [0]:
# Run the fitted model over the test rows, then preview the result.
test_transform = model.transform(dataset=features_test)
test_transform.show()

In [0]:
# R^2 of the model's predictions on the test rows (displayed as the cell output).
evaluator.evaluate(dataset=test_transform)