In [None]:
# Read the source Delta table into a Spark DataFrame and render it.
source_table = 'hive_metastore.jmr_ahk_test.dailyagg_mass_move_gold'
df = spark.read.format('delta').table(source_table)
display(df)

In [None]:
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import col

# Columns whose missing values are filled in; the imputed values are written
# back into the same column names (outputCols == inputCols).
impute_cols = ["weightlb"]

# Median imputation. The median is learned from the full `df` here.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=impute_cols,
    strategy="median",
)
imputer_model = imputer.fit(df)
imputed_df = imputer_model.transform(df)

# Show the imputed rows sorted by dateDay, descending.
by_day_desc = col('dateDay').desc()
display(imputed_df.orderBy(by_day_desc))

In [None]:
# Split the raw rows first, then learn the imputation median from the training
# rows only. Fitting the imputer on the full dataset before splitting lets a
# statistic computed from test rows leak into the `weightlb` feature that the
# model is trained and evaluated on.
raw_train_df, raw_test_df = df.randomSplit([.8, .2], seed=42)

# Reuse the already-configured `imputer` estimator, but fit it on train only and
# apply that same fitted model to both splits.
split_imputer_model = imputer.fit(raw_train_df)
train_df = split_imputer_model.transform(raw_train_df)
test_df = split_imputer_model.transform(raw_test_df)

In [None]:
# Summary statistics for the label and a few feature columns on the training split.
summary_cols = ["restingBpm", "avgBpm", "dailyKcal", "weightlb"]
display(train_df.select(*summary_cols).summary())

In [None]:
from pyspark.ml.feature import VectorAssembler
# Columns packed, in this order, into the single "features" vector column.
input_cols = ["restingBpm", "avgBpm", "maxBpm", "weightlb"]

vec_assembler = VectorAssembler(
    outputCol="features",
    inputCols=input_cols,
)

# Add the "features" column to the training split.
vec_train_df = vec_assembler.transform(train_df)

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression of dailyKcal on the assembled "features" vector,
# fitted on the vectorized training split.
lr = LinearRegression(
    labelCol="dailyKcal",
    featuresCol="features",
)
lr_model = lr.fit(vec_train_df)

In [None]:
# The model is fit on every column in `input_cols`, so it has one slope per
# feature. Printing only the first coefficient as "y = mx + b" would misstate
# the fitted model, so the full linear formula is reported instead.
coefficients = lr_model.coefficients.toArray()
m = coefficients[0]  # slope of the first feature; kept in case later cells use it
b = lr_model.intercept

# Pair each coefficient with its feature name in assembler order.
# NOTE(review): assumes every input column is a scalar, so names and
# coefficients line up one-to-one — confirm if vector-typed inputs are added.
terms = " + ".join(
    f"{coef:.2f}*{name}" for name, coef in zip(input_cols, coefficients)
)

print(f"The formula for the linear regression line is y = {terms} + {b:.2f}")

In [None]:
# Assemble the test split with the same assembler, then score it with the fitted model.
vec_test_df = vec_assembler.transform(test_df)
pred_df = lr_model.transform(vec_test_df)

# Show the label next to the prediction, sorted by dateDay descending.
comparison_cols = ["dateDay", "dailyKcal", "prediction"]
by_day_desc = col('dateDay').desc()
display(pred_df.select(*comparison_cols).orderBy(by_day_desc))

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the "prediction" column against the "dailyKcal" label using RMSE.
regression_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="dailyKcal",
    predictionCol="prediction",
)

# Score the test-split predictions and report the result.
rmse = regression_evaluator.evaluate(pred_df)
print(f"RMSE is {rmse}")