In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession used by every later cell; getOrCreate() hands back
# an already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('MLib')
    .getOrCreate()
)

In [4]:
# Display the session object as the cell result to confirm it was created.
spark

In [6]:
# Load the CSV with its first row as the header and let Spark infer column
# types (Age, Experience and Salary come back as integers, Name as string).
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test3.csv')
)

In [7]:
# Print the loaded rows (show() prints at most 20 rows by default).
training.show()

+------------+---+----------+------+
|        Name|Age|Experience|Salary|
+------------+---+----------+------+
|       harsh| 20|         5| 30000|
|       krish| 30|        10| 25000|
|  naya ladka| 10|         2| 20000|
|purana ladka| 40|        15| 15000|
|  paul allen| 35|        10| 20000|
|        saul| 45|        20| 40000|
+------------+---+----------+------+



In [8]:
# Check the inferred column types before building numeric features from them.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [9]:
# List the column names available for feature selection.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [10]:
from pyspark.ml.feature import VectorAssembler

In [11]:
# Pack the numeric predictor columns into the single vector column that the
# regression step reads as its features.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [12]:
# Apply the assembler: returns a new DataFrame with the original columns plus
# the 'Independent Features' vector column; `training` itself is not modified.
output = featureassembler.transform(training)

In [13]:
# Inspect the assembled vectors next to the columns they were built from.
output.show()

+------------+---+----------+------+--------------------+
|        Name|Age|Experience|Salary|Independent Features|
+------------+---+----------+------+--------------------+
|       harsh| 20|         5| 30000|          [20.0,5.0]|
|       krish| 30|        10| 25000|         [30.0,10.0]|
|  naya ladka| 10|         2| 20000|          [10.0,2.0]|
|purana ladka| 40|        15| 15000|         [40.0,15.0]|
|  paul allen| 35|        10| 20000|         [35.0,10.0]|
|        saul| 45|        20| 40000|         [45.0,20.0]|
+------------+---+----------+------+--------------------+



In [14]:
# Confirm the new feature column was appended after the original columns.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [15]:
# Keep only what the model consumes: the feature vector and the label.
finalized_data = output.select(['Independent Features', 'Salary'])

In [18]:
# Print the two-column (features, label) frame that will be split and fitted.
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [20.0,5.0]| 30000|
|         [30.0,10.0]| 25000|
|          [10.0,2.0]| 20000|
|         [40.0,15.0]| 15000|
|         [35.0,10.0]| 20000|
|         [45.0,20.0]| 40000|
+--------------------+------+



In [19]:
from pyspark.ml.regression import LinearRegression

In [74]:
# Split into training and hold-out sets. The weights are per-row sampling
# probabilities, not exact proportions, so the resulting sizes can vary.
# A fixed seed keeps the split - and therefore the fitted coefficients and
# error metrics below - the same on every Restart & Run All.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [75]:
regressor = LinearRegression(featuresCol='Independent Features', labelCol='Salary')

In [76]:
regressor = regressor.fit(train_data)

In [77]:
# Learned weights of the fitted model, one per entry of the feature vector.
# The vector was assembled from ['Age', 'Experience'], so the weights are
# presumably in that order - confirm against the assembler's inputCols.
regressor.coefficients

DenseVector([5250.0, -12500.0])

In [78]:
# Constant term of the fitted linear model.
regressor.intercept

-7500.000000001041

In [79]:
# Score the hold-out split with the fitted model; the result object carries
# the predictions DataFrame and the error metrics read in the cells below.
pred_results = regressor.evaluate(test_data)

In [80]:
# Show each hold-out row's features and actual Salary next to the prediction.
pred_results.predictions.show()

+--------------------+------+-------------------+
|Independent Features|Salary|         prediction|
+--------------------+------+-------------------+
|          [20.0,5.0]| 30000| 35000.000000000546|
|         [35.0,10.0]| 20000| 51250.000000001164|
|         [45.0,20.0]| 40000|-21250.000000001462|
+--------------------+------+-------------------+



In [81]:
# Display (mean absolute error, mean squared error) on the hold-out split.
# NOTE(review): the dataset shown above has only six rows, so these numbers
# are illustrative rather than a meaningful estimate of model quality.
pred_results.meanAbsoluteError, pred_results.meanSquaredError


(32500.00000000106, 1584375000.0000858)