# PySpark Machine Learning Library (MLlib)

- Predict `Salary` from `age` and `Experience` with a linear regression model


In [27]:
# Start (or reuse) a Spark session, then read the CSV data set.
# The first line of the file is treated as the header and column types are
# inferred from the values.

from pyspark.sql import SparkSession

spark_session = (
    SparkSession.builder
    .appName("Lesson 6 - MLLib")
    .getOrCreate()
)

data_frame = spark_session.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

# Display the rows first, then the inferred schema.
data_frame.show()
data_frame.printSchema()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [28]:
# Pack the predictor columns into a single vector column so they can be
# handed to the model as one features column.
# [age, Experience] -> "Independent Features"
from pyspark.ml.feature import VectorAssembler

target_columns = ['age', 'Experience']
feature_asm = VectorAssembler(
    inputCols=target_columns,
    outputCol="Independent Features",
)

In [29]:
# Apply the assembler: adds the "Independent Features" vector column built
# from the columns listed in `target_columns`, keeping the original columns.
output = feature_asm.transform(data_frame)
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [30]:
# Keep only the two columns used for modelling: the assembled feature vector
# and the Salary column that serves as the label.
final_data = output.select(
    "Independent Features",
    'Salary',
)
final_data.show()


+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [31]:
# Create and train a linear regression model

from pyspark.ml.regression import LinearRegression

# Fixed seed so the random split (and therefore the fitted model and the
# metrics computed from it) is the same on every Restart & Run All.
SPLIT_SEED = 42

# randomSplit assigns each row at random using the weights as probabilities,
# so the 75% / 25% train/test proportions are approximate, not exact. With a
# data set this small one side can even come out empty; pick a different seed
# if that happens.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name; `regressor` is the fitted
# model that the following cells inspect and evaluate.
lr_estimator = LinearRegression(
    featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)


22/11/03 17:37:50 WARN Instrumentation: [c3ce8959] regParam is zero, which might cause numerical instability and overfitting.


In [32]:
# Coefficients of the fitted model, one per input feature
# (same order as the assembler inputs: age, Experience)
regressor.coefficients

DenseVector([-263.7076, 1767.624])

In [33]:
# Intercept (constant term) of the fitted model
regressor.intercept


19919.060052212404

In [34]:
# Evaluate the fitted model on the held-out test rows and show the
# per-row predictions next to the actual Salary values.
results = regressor.evaluate(test_data)
results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [29.0,4.0]| 20000|19342.03655352618|
+--------------------+------+-----------------+



In [35]:
# Error metrics on the test split: (mean absolute error, mean squared error)
results.meanAbsoluteError, results.meanSquaredError

(657.9634464738192, 432915.89689570636)