Example of PySpark ML flow

In [1]:
# Start a Spark session named "ML", or reuse one that is already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ML")
    .getOrCreate()
)

In [23]:
# Load the salary CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
training = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Salary_Data.csv")
)

In [24]:
# Show the column names and the types Spark inferred from the CSV.
training.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [25]:
# List the column names of the loaded DataFrame.
training.columns

['YearsExperience', 'Age', 'Salary']

In [26]:
# Preview the first 3 rows of the raw data.
training.show(3)

+---------------+----+------+
|YearsExperience| Age|Salary|
+---------------+----+------+
|            1.1|21.0| 39343|
|            1.3|21.5| 46205|
|            1.5|21.7| 37731|
+---------------+----+------+
only showing top 3 rows



In [None]:
# In PySpark, the independent features must first be combined into a single vector column, i.e., we need a VectorAssembler.

## Step 1 - Assemble the independent features

In [27]:
from pyspark.ml.feature import VectorAssembler

# Pack the two predictor columns into a single vector column named
# "Independent Features"; the vector keeps the order given here (Age first).
featureassembler = (
    VectorAssembler()
    .setInputCols(["Age", "YearsExperience"])
    .setOutputCol("Independent Features")
)

In [28]:
# Apply the assembler: the result keeps the original columns and adds the feature-vector column.
output = featureassembler.transform(training)

In [30]:
# Preview the first 3 rows, now including the "Independent Features" vector column.
output.show(3)

+---------------+----+------+--------------------+
|YearsExperience| Age|Salary|Independent Features|
+---------------+----+------+--------------------+
|            1.1|21.0| 39343|          [21.0,1.1]|
|            1.3|21.5| 46205|          [21.5,1.3]|
|            1.5|21.7| 37731|          [21.7,1.5]|
+---------------+----+------+--------------------+
only showing top 3 rows



## Step 2 - Select the feature vector and the label

In [32]:
# Only the assembled feature vector and the dependent variable (Salary) are
# needed for modelling, so build the final dataset from those two columns.
model_columns = ["Independent Features", "Salary"]
finalized_data = output.select(model_columns)
finalized_data.show(3)

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [21.0,1.1]| 39343|
|          [21.5,1.3]| 46205|
|          [21.7,1.5]| 37731|
+--------------------+------+
only showing top 3 rows



## Step 3 - Train/test split and model fitting

In [33]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into training and test sets. The seed is fixed so
# that re-running the notebook on the same data repeats the same split;
# without it, the coefficients and predictions below change on every run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` always
# refers to one kind of object: the fitted model used by the cells below.
lr_estimator = LinearRegression(featuresCol="Independent Features", labelCol="Salary")
regressor = lr_estimator.fit(train_data)

# Result

In [36]:
# Fitted coefficients, one per entry of the feature vector
# (assembled in the order Age, YearsExperience).
regressor.coefficients

DenseVector([2538.3705, 5010.513])

In [37]:
# Intercept of the fitted linear model.
regressor.intercept

-19670.906603940253

In [41]:
# Evaluate the fitted model on the held-out test split and preview the first
# 5 rows of its predictions alongside the actual Salary values.
pred_result = regressor.evaluate(test_data)
pred_result.predictions.show(5)

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.7,1.5]| 37731|42927.503273045964|
|          [22.0,2.0]| 43525| 46194.27092418802|
|          [23.3,3.2]| 54445|55506.768191528216|
|          [23.3,3.2]| 64445|55506.768191528216|
|          [28.0,5.9]| 81363| 80965.49472522245|
+--------------------+------+------------------+
only showing top 5 rows

