# Machine Learning with Spark

In [1]:
from pydataset import data
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data('tips'))

train, test = df.randomSplit([0.8, 0.2], seed=123)

In [2]:
def shape(df: pyspark.sql.DataFrame):
    return df.count(), len(df.columns)

In [3]:
shape(train)

(192, 7)

In [4]:
shape(test)


(52, 7)

# Regression
We'll first demonstrate a regression problem: predicting the tip amount.

In [5]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



`pyspark.ml.feature.RFormula`
- `tip ~ total_bill`: predict tip based on total bill
- `tip ~ total_bill + size`: predict tip based on total bill and size
- `tip ~ .`: predict tip based on all the other features in the dataset

In [13]:
pyspark.ml.feature.RFormula()

SyntaxError: invalid syntax (<ipython-input-13-b883ce1a3320>, line 1)

In [7]:
# nb: spark's rformula does encoding
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)

rf

RFormula_88fc3278caa5

In [6]:
rf.transform(train).show(5)

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  2.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]| 3.23|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]| 3.02|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|[15.04,2.0]| 1.96|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|[15.42,2.0]| 1.57|
+----------+----+------+------+---+------+----+-----------+-----+
only showing top 5 rows



`features` and `labels` columns are the shape/name required for `pyspark.ml`

In [14]:
train_input = rf.transform(train).select('features', 'label')
train_input.show(5)

+-----------+-----+
|   features|label|
+-----------+-----+
| [8.77,2.0]|  2.0|
|[14.78,2.0]| 3.23|
|[14.83,2.0]| 3.02|
|[15.04,2.0]| 1.96|
|[15.42,2.0]| 1.57|
+-----------+-----+
only showing top 5 rows



Create, fit, and use the model.

**Note**: unlike sklearn, each step produces a new object!

In [15]:
lr = pyspark.ml.regression.LinearRegression()
lr

LinearRegression_896f4ccaa93f

In [17]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
regParam: regularization parameter (>= 0). (default: 0.0)
solver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)
standardization: whether to standardize the training features before fitting the model.

In [18]:
lr_fit = lr.fit(train_input)
lr_fit.transform(train_input).show(5)

+-----------+-----+------------------+
|   features|label|        prediction|
+-----------+-----+------------------+
| [8.77,2.0]|  2.0|1.8651089944622925|
|[14.78,2.0]| 3.23|2.4179416721366804|
|[14.83,2.0]| 3.02|2.4225409456614253|
|[15.04,2.0]| 1.96|2.4418578944653557|
|[15.42,2.0]| 1.57|2.4768123732534204|
+-----------+-----+------------------+
only showing top 5 rows



Training results:

In [19]:
lr_fit.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x116ba7c50>

In [20]:
lr_fit.summary.r2, lr_fit.summary.rootMeanSquaredError

(0.4837967854062204, 0.9789098245570234)

In [21]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['coefficientStandardErrors',
 'degreesOfFreedom',
 'devianceResiduals',
 'explainedVariance',
 'featuresCol',
 'labelCol',
 'meanAbsoluteError',
 'meanSquaredError',
 'numInstances',
 'objectiveHistory',
 'pValues',
 'predictionCol',
 'predictions',
 'r2',
 'r2adj',
 'residuals',
 'rootMeanSquaredError',
 'tValues',
 'totalIterations']

How do we do on the test data?

In [22]:
test_input = rf.transform(test)
lr_fit.transform(test_input).show(4)

+----------+----+------+------+---+------+----+-----------+-----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|        prediction|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|[10.27,2.0]| 1.71|2.0030872002046523|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|[10.33,3.0]| 1.67|2.1772701176223346|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,3.0]| 1.66|2.1781899723272837|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|[16.29,3.0]| 3.71| 2.725503521771977|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
only showing top 4 rows



In [23]:
evaluator = pyspark.ml.evaluation.RegressionEvaluator()
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

1.1083950180451543

# Classification

predict time of day

In [24]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



Preprocess the training data

In [25]:
rf = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size').fit(train)
train_input = rf.transform(train)
train_input.show(50)

# pyspark encodes our data for us

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]|  0.0|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]|  0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|[15.04,2.0]|  0.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|[15.42,2.0]|  0.0|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|[16.97,3.0]|  0.0|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|[18.43,4.0]|  0.0|
|     20.65|3.35|  Male|    No|Sat|Dinner|   3|[20.65,3.0]|  0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|[21.01,3.0]|  0.0|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|[21.58,2.0]|  0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|[23.68,2.0]|  0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|[25.29,4.0]|  0.0|
|     26.8

Create and fit the model

In [26]:
lr = pyspark.ml.classification.LogisticRegression()

In [27]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size must beequal wi

In [28]:
lr_fit = lr.fit(train_input)

Model Evaluation

In [29]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['accuracy',
 'areaUnderROC',
 'fMeasureByLabel',
 'fMeasureByThreshold',
 'falsePositiveRateByLabel',
 'featuresCol',
 'labelCol',
 'labels',
 'objectiveHistory',
 'pr',
 'precisionByLabel',
 'precisionByThreshold',
 'predictionCol',
 'predictions',
 'probabilityCol',
 'recallByLabel',
 'recallByThreshold',
 'roc',
 'totalIterations',
 'truePositiveRateByLabel',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

Area under ROC Curve

- Produce a curve where each point on the curve is the TPR vs FPR; multiple points are found by adjusting the threshold for the mode [animation]()
- This works for models that predict a probability in addition to a yes/no
- number between 0 and 1, closer to 1 is better.

In [30]:
# area under TPR (recall) vs FPR (FP / (FP + TN)) curve
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
lr_fit.summary.areaUnderROC

0.6625082946250829

In [31]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.4319526627218935

In [32]:
test_input = rf.transform(test)

test_input.show()

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|     10.27|1.71|  Male|    No| Sun|Dinner|   2|[10.27,2.0]|  0.0|
|     10.33|1.67|Female|    No| Sun|Dinner|   3|[10.33,3.0]|  0.0|
|     10.34|1.66|  Male|    No| Sun|Dinner|   3|[10.34,3.0]|  0.0|
|     16.29|3.71|  Male|    No| Sun|Dinner|   3|[16.29,3.0]|  0.0|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|[16.99,2.0]|  0.0|
|     24.59|3.61|Female|    No| Sun|Dinner|   4|[24.59,4.0]|  0.0|
|     16.93|3.07|Female|    No| Sat|Dinner|   3|[16.93,3.0]|  0.0|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|[17.81,4.0]|  0.0|
|     17.92|4.08|  Male|    No| Sat|Dinner|   2|[17.92,2.0]|  0.0|
|      21.7| 4.3|  Male|    No| Sat|Dinner|   2| [21.7,2.0]|  0.0|
|     39.42|7.58|  Male|    No| Sat|Dinner|   4|[39.42,4.0]|  0.0|
|      9.68|1.32|  Male|    No| Sun|Dinner|   2| [9.68,2.0]|  

In [33]:
# confusion matrix for the test data
(lr_fit.transform(test_input)
 .select('time', 'total_bill', 'size', 'label', 'probability', 'prediction')
 .groupby('prediction') # predicted == rows
 .pivot('label') # actual values are columns
 .count()
 .show())

+----------+---+----+
|prediction|0.0| 1.0|
+----------+---+----+
|       0.0| 36|  13|
|       1.0|  3|null|
+----------+---+----+



In [34]:
# Many other preprocessing steps
dir(pyspark.ml.feature)

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'DecisionTreeParams',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasAggregationDepth',
 'HasCheckpointInterval',
 'HasCollectSubModels',
 'HasDistanceMeasure',
 'HasElasticNetParam',
 'HasFeaturesCol',
 'HasFitIntercept',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasLoss',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasParallelism',
 'HasPredictionCol',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasVarianceCol',
 'HasWeightCol',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaParams',
 'JavaTransformer'