In [None]:
# run this cell as is to install PySpark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=9ab6766a76f3dab6fe23bb2ce1e8781e2f7c2d816426a3696b1965f7019b47b3
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
# Start PySpark, or reuse the session if this cell has already been run.
from pyspark.sql import SparkSession

try:
    # Bare name lookup: raises NameError when `spark` is not defined yet.
    spark
except NameError:
    print('starting Spark')
    app_name = 'Week07'
    master = "local[*]"
    spark = (
        SparkSession.builder
        .appName(app_name)
        .config('spark.ui.port', '4050')
        .master(master)
        .getOrCreate()
    )
else:
    print("Spark is already running")
    # Read the context from `spark` rather than from `sc`: `sc` is only bound
    # at the end of this cell, so reading it here could raise NameError and
    # wrongly report both "already running" and "starting".
    print(f"{spark.sparkContext.master} appName: {spark.sparkContext.appName}")

# Always (re)bind `sc` so it matches the active session.
sc = spark.sparkContext


starting Spark


## From last week

In [None]:
# (RUN THIS CELL AS IS)
import re
import ast
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Toy dataset for simple linear regression: each row is one [X, y] pair.
X_y = [
    [0.39050803, -1.20623543],
    [1.72151493, 13.57377242],
    [0.82210701, 5.50818095],
    [0.35906546, -2.19996366],
    [-0.61076161, -3.90958845],
    [1.1671529, 11.12900159],
    [-0.49930231, -3.63685934],
    [3.13418401, 22.71362238],
    [3.70930208, 25.53291143],
]

# Distribute the rows and cache the resulting RDD.
data_rdd = sc.parallelize(X_y).cache()

# The true y = 8x - 2, so the model is stored as [b, m] (bias first, then slope).
W = [-2, 8]  # model
# Broadcast the weights: available in memory as read-only to the executors
# (for mappers and reducers).
wBroadcast = sc.broadcast(W)


def _squared_error(row):
    """Return (Xw - y)**2 for one row; the last element of `row` is y."""
    augmented = np.append(1, row[:-1])  # prepend 1 so that w[0] acts as the bias
    prediction = np.dot(augmented, wBroadcast.value)
    return (prediction - row[-1]) ** 2


# Mean of the per-row squared errors (for reference, the gradient is (Xw - y) X).
MSE = data_rdd.map(_squared_error).mean()
print(f"MSE:{MSE}")

MSE:5.832730881179018


# Using packages and Dataframes

Next week we will properly introduce DataFrames and how to use them. For now, we want to give a preview of how to use them and of how we can use MLlib to train our model (Question 9 of HW04!).

## Dataframes

We can transform any RDD into a DataFrame with a single command. [Click here for the API](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html)

In [None]:
# Convert the RDD into a DataFrame; the schema argument names the columns.
columns = ["X", "y"]
data_df = data_rdd.toDF(schema=columns)
# If the DF is the result of an aggregation, you can transform it to Pandas.
data_df.toPandas()

Unnamed: 0,X,y
0,0.390508,-1.206235
1,1.721515,13.573772
2,0.822107,5.508181
3,0.359065,-2.199964
4,-0.610762,-3.909588
5,1.167153,11.129002
6,-0.499302,-3.636859
7,3.134184,22.713622
8,3.709302,25.532911


## Spark SQL

The DataFrame API is very similar to Spark SQL ([link here](https://spark.apache.org/docs/latest/sql-programming-guide.html)). You can use `select()`, `filter()`, and more.

In [None]:
## Adding a new column
# Import only the name that is used. A wildcard import of
# pyspark.sql.functions would rebind Python builtins such as `sum`, `min`,
# `max`, `abs` and `round` to Spark column functions for the rest of the
# notebook.
from pyspark.sql.functions import col

# Column '2X' holds twice the value of column 'X'.
data_df = data_df.withColumn('2X', 2*col('X'))

## Using MLLib

To use MLlib, we first need to create a Vector Assembler (which creates a vector of features). Then we can use it in a way similar to `scikit-learn`.

### Step 1: Vector Assembler

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the single vector column named "features".
features = ['X', '2X']
assembler = VectorAssembler().setInputCols(features).setOutputCol("features")
# transform() adds the assembled vector column alongside the existing ones.
data_df_VA = assembler.transform(data_df)
data_df_VA.show()

+-----------+-----------+-----------+--------------------+
|          X|          y|         2X|            features|
+-----------+-----------+-----------+--------------------+
| 0.39050803|-1.20623543| 0.78101606|[0.39050803,0.781...|
| 1.72151493|13.57377242| 3.44302986|[1.72151493,3.443...|
| 0.82210701| 5.50818095| 1.64421402|[0.82210701,1.644...|
| 0.35906546|-2.19996366| 0.71813092|[0.35906546,0.718...|
|-0.61076161|-3.90958845|-1.22152322|[-0.61076161,-1.2...|
|  1.1671529|11.12900159|  2.3343058|[1.1671529,2.3343...|
|-0.49930231|-3.63685934|-0.99860462|[-0.49930231,-0.9...|
| 3.13418401|22.71362238| 6.26836802|[3.13418401,6.268...|
| 3.70930208|25.53291143| 7.41860416|[3.70930208,7.418...|
+-----------+-----------+-----------+--------------------+



### Step 2: Scale the data

Similar to `scikit-learn`, MLlib has a scaler to scale the data. Its logic is very similar to that of the Vector Assembler. You can specify whether you want to scale only by the standard deviation (default) or compute a `Z-score` (flag `withMean = True`).

In [None]:
from pyspark.ml.feature import StandardScaler

# withMean=True requests a Z-score; fit() is called on the assembled
# DataFrame, so `scaler` is the fitted model, not the estimator.
scaler = StandardScaler(
    withMean=True,
    inputCol='features',
    outputCol="features_scaled",
).fit(data_df_VA)
# Adds the "features_scaled" column next to the existing columns.
data_df_VA_SC = scaler.transform(data_df_VA)
data_df_VA_SC.show()

+-----------+-----------+-----------+--------------------+--------------------+
|          X|          y|         2X|            features|     features_scaled|
+-----------+-----------+-----------+--------------------+--------------------+
| 0.39050803|-1.20623543| 0.78101606|[0.39050803,0.781...|[-0.4955596577609...|
| 1.72151493|13.57377242| 3.44302986|[1.72151493,3.443...|[0.39322065840177...|
| 0.82210701| 5.50818095| 1.64421402|[0.82210701,1.644...|[-0.2073592901295...|
| 0.35906546|-2.19996366| 0.71813092|[0.35906546,0.718...|[-0.5165554447140...|
|-0.61076161|-3.90958845|-1.22152322|[-0.61076161,-1.2...|[-1.1641578049255...|
|  1.1671529|11.12900159|  2.3343058|[1.1671529,2.3343...|[0.02304522182666...|
|-0.49930231|-3.63685934|-0.99860462|[-0.49930231,-0.9...|[-1.0897308189397...|
| 3.13418401|22.71362238| 6.26836802|[3.13418401,6.268...|[1.33653092340163...|
| 3.70930208|25.53291143| 7.41860416|[3.70930208,7.418...|[1.72056621283963...|
+-----------+-----------+-----------+---

### Step 3: Fit model
The `LinearRegression` estimator covers standard Linear Regression (default) as well as LASSO and Ridge: for the flag `elasticNetParam`, 0 corresponds to Ridge and 1 corresponds to LASSO (regularization is only applied when `regParam` is greater than 0). You can save a summary of the model to extract the coefficients, intercept (bias), root mean squared error, and R2.

In [None]:
## Train a simple LR model
from pyspark.ml.regression import LinearRegression

# Keep only the columns the estimator reads: the scaled feature vector and the label.
data_df_VA_SC = data_df_VA_SC.select(['features_scaled', 'y'])
lr = LinearRegression(featuresCol = 'features_scaled', labelCol='y', maxIter=50)
lr_model = lr.fit(data_df_VA_SC)
# Training summary of the fitted model (error metrics and R2 on the training data).
trainingSummary = lr_model.summary
print("Coefficients: " + str(np.round(lr_model.coefficients, 4)))
print("Intercept: " + str(np.round(lr_model.intercept, 4)))
# Read the MSE directly from the summary instead of squaring the RMSE,
# which avoids a needless sqrt/square round trip.
print("MSE: %f" % trainingSummary.meanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [5.5618 5.5618]
Intercept: 7.5005
MSE: 4.986968
r2: 0.956625
