In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

In [None]:
# Pull every column of the nfl_plays_cleaned table into a DataFrame.
plays_query = "SELECT * FROM nfl_plays_cleaned"
plays_raw = spark.sql(plays_query)

In [None]:
# Input columns that get packed into the single "features" vector column.
feature_cols = [
    "offenseFormationIndex", 'num_offense_runningbacks',
    'num_offense_tightends', 'num_offense_widereceivers', 'num_def_defline',
    'num_def_linebackers', 'num_def_defbacks', 'delta_defline_offreceivers'
]
# Column the regression models are trained to predict.
target_col = "netYardsGained"

# Fixed seed so the train/test partition is the same on every
# Restart & Run All; without it the holdout metrics change run to run.
SPLIT_SEED = 42

assembler = (
    VectorAssembler().setInputCols(feature_cols).setOutputCol("features"))

# Append the assembled feature vector, then split 70% / 30% into
# training and test sets.
plays = assembler.transform(plays_raw)
plays_training, plays_test = plays.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Both splits are reused by later cells (fit / evaluate), so mark them cached.
plays_training.cache()
plays_test.cache();

### Simple linear regression

In [None]:
# Fixed seed for the cross-validation fold assignment so that model
# selection is repeatable across reruns.
CV_SEED = 42

# Linear regression on the assembled "features" vector, predicting target_col.
plays_reg_model = (
    LinearRegression().setLabelCol(target_col).setFeaturesCol("features"))

# Candidate elastic-net mixing values to search over.
# NOTE(review): regParam is never set in this cell, so it stays at the
# library default. The elastic-net mixing parameter only has an effect when
# regParam > 0 -- confirm whether regParam should be set or added to the grid,
# otherwise both grid points may describe the same model.
elasticNetParamGrid = [0., 0.5]

paramGrid = (ParamGridBuilder().addGrid(plays_reg_model.elasticNetParam,
                                        elasticNetParamGrid).build())

# TODO: decide whether other stages belong in this pipeline; for now the
# regression estimator is the only stage.
stages = [plays_reg_model]

pipeline = Pipeline().setStages(stages)

# Cross-validate the pipeline over the parameter grid, scoring each candidate
# with a RegressionEvaluator on target_col (evaluator metric left at default).
cv = (CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(paramGrid)
      .setEvaluator(RegressionEvaluator().setLabelCol(target_col))
      .setSeed(CV_SEED))

plays_fitted_model = cv.fit(plays_training)

In [None]:
# Score the test split with the fitted cross-validator model and keep only
# the prediction and the label, both cast to double.
holdout = plays_fitted_model.transform(plays_test).selectExpr(
    "double(prediction) as prediction",
    "double(netYardsGained) as netYardsGained")

# RegressionMetrics takes an RDD of (prediction, observation) pairs, in that
# order, so the select order below matters.
rm = RegressionMetrics(
    holdout.select("prediction",
                   "netYardsGained").rdd.map(lambda x: (x[0], x[1])))

print("MSE: ", rm.meanSquaredError)
print("MAE: ", rm.meanAbsoluteError)
# rootMeanSquaredError is the RMSE itself; the previous "RMSE Squared" label
# was wrong (RMSE squared is the MSE printed above).
print("RMSE: ", rm.rootMeanSquaredError)
print("R Squared: ", rm.r2)
print("Explained Variance: ", rm.explainedVariance, "\n")

In [None]:
# Show the holdout predictions next to the actual netYardsGained values
# for a visual spot check of the fit.
display(holdout)

prediction,netYardsGained
7.731653847931227,10.0
5.976338010339347,33.0
8.737610167176223,0.0
6.162576206974366,4.0
6.53811484133186,0.0
8.04072743708992,0.0
6.162576206974366,10.0
6.53811484133186,23.0
6.285411599498039,7.0
6.285411599498039,0.0
