### Machine learning with PySpark: linear regression on salary data

In [18]:
from pyspark.sql import SparkSession

In [19]:
# Start a Spark session for this notebook, or reuse one that is already running.
# NOTE(review): the app name 'Missing' looks carried over from another notebook — confirm.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [20]:
# Show the working directory so the relative CSV path used below can be resolved.
# os.getcwd() is used instead of the bare `pwd` automagic, which depends on
# IPython's automagic setting and stops working if a variable named `pwd` exists.
import os

os.getcwd()

'/mnt/sda7/projects/ds/data-engineering/learnings/pyspark'

In [27]:
# Load the training table; the first row is the header and column types are
# inferred from the data instead of defaulting to strings.
training = spark.read.csv(
    'test2.csv',
    inferSchema=True,
    header=True,
)

In [28]:
# Preview the loaded rows (explicit defaults: at most 20 rows, long cells truncated).
training.show(n=20, truncate=True)

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|   Jijo| 25|         4| 30000|
|Adharsh| 25|         5| 25000|
| Sajjad| 23|         2| 20000|
|  Allen| 23|         3| 20000|
| Nikhil| 23|         1| 15000|
|Abhinav| 26|         3| 18000|
+-------+---+----------+------+



In [29]:
# Check the inferred column types: the features and label must be numeric.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [31]:
# List the column names available for feature selection.
training.columns

['name', 'age', 'experience', 'salary']

In [34]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictors into a single vector column.
# Later cells refer to the output column by this exact name (it contains a space).
featureassembler = VectorAssembler(
    outputCol="independent feature",
    inputCols=["age", "experience"],
)

In [35]:
# Append the assembled feature vector to every row of the training table.
output = featureassembler.transform(dataset=training)

In [36]:
# Inspect the table with the new vector column (explicit defaults: 20 rows, truncated cells).
output.show(n=20, truncate=True)

+-------+---+----------+------+-------------------+
|   name|age|experience|salary|independent feature|
+-------+---+----------+------+-------------------+
|   Jijo| 25|         4| 30000|         [25.0,4.0]|
|Adharsh| 25|         5| 25000|         [25.0,5.0]|
| Sajjad| 23|         2| 20000|         [23.0,2.0]|
|  Allen| 23|         3| 20000|         [23.0,3.0]|
| Nikhil| 23|         1| 15000|         [23.0,1.0]|
|Abhinav| 26|         3| 18000|         [26.0,3.0]|
+-------+---+----------+------+-------------------+



In [37]:
# Confirm that the assembled vector column was added after the original columns.
output.columns

['name', 'age', 'experience', 'salary', 'independent feature']

In [38]:
# Keep only what the regressor needs: the feature vector and the label.
finalized_data = output.select(["independent feature", "salary"])

In [39]:
# Preview the modelling table (explicit defaults: 20 rows, truncated cells).
finalized_data.show(n=20, truncate=True)

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [25.0,4.0]| 30000|
|         [25.0,5.0]| 25000|
|         [23.0,2.0]| 20000|
|         [23.0,3.0]| 20000|
|         [23.0,1.0]| 15000|
|         [26.0,3.0]| 18000|
+-------------------+------+



In [40]:
from pyspark.ml.regression import LinearRegression

In [42]:
# Hold out roughly 25% of the rows for evaluation. randomSplit assigns each row
# at random, so the weights are approximate proportions, not exact counts.
# A fixed seed makes the split, and every metric computed from it, reproducible
# across kernel restarts.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [43]:
# Configure an (unfitted) linear regression: predict `salary` from the
# assembled feature vector.
regressor = LinearRegression(
    labelCol="salary",
    featuresCol='independent feature',
)

In [None]:
# Fit on the training split. The estimator and the fitted model are kept under
# separate names so this cell can be re-run: calling `.fit` on an already
# fitted model fails, which is what happens when one name is rebound from the
# estimator to the model. `regressor` still ends up holding the fitted model,
# as the cells below expect.
lr_estimator = LinearRegression(featuresCol='independent feature', labelCol="salary")
regressor = lr_estimator.fit(train_data)

In [46]:
# Fitted weights, one per assembled input column (age, experience).
regressor.coefficients

DenseVector([7500.0, -5000.0])

In [48]:
# Fitted intercept term of the linear model.
regressor.intercept

-137500.0000000731

In [49]:
# Score the held-out rows; the result carries both predictions and error metrics.
pred_results = regressor.evaluate(dataset=test_data)

In [50]:
# Compare actual salary with the model's prediction for each held-out row
# (explicit defaults: 20 rows, truncated cells).
pred_results.predictions.show(n=20, truncate=True)

+-------------------+------+------------------+
|independent feature|salary|        prediction|
+-------------------+------+------------------+
|         [23.0,1.0]| 15000|30000.000000006316|
|         [23.0,2.0]| 20000| 25000.00000000262|
|         [26.0,3.0]| 18000| 42500.00000000972|
+-------------------+------+------------------+



In [53]:
# Error on the held-out rows as a (MAE, MSE) pair.
# NOTE(review): with only a handful of test rows these numbers are very noisy.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(14833.333333339551, 283416666.66689736)