In [3]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by the rest of the notebook; getOrCreate()
# hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("ML example")
    .getOrCreate()
)
spark  # bare last expression: rendered as this cell's output

In [4]:
### Load the dataset
# header=True takes column names from the first CSV row;
# inferSchema=True asks Spark to infer the column types from the data.
df_pyspark = spark.read.csv(
    'Test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Illia| 20|        10|  1000|
| Artem| 21|         8|  8000|
|  Vlad| 30|        20| 20000|
| Lesia| 45|        10| 15000|
|Nastya| 20|         2| 18000|
| Vania| 27|         6| 30000|
|  Dima| 34|         5|  3000|
|  Roma| 54|         4| 15000|
+------+---+----------+------+



In [5]:
# Print column names, types and nullability to check what inferSchema produced.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
df_pyspark.columns

['Name', 'Age', 'Experience', 'Salary']

In [7]:
# Combine the 'Age' and 'Experience' columns into a single vector column,
# which is used as the independent feature for the regression.
from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independ Feature',
)

In [8]:
# Apply the assembler: the result keeps the original columns and adds the
# 'Independ Feature' vector column.
output = feature_assembler.transform(df_pyspark)

In [9]:
# Display the assembled DataFrame to verify the new vector column.
output.show()

+------+---+----------+------+----------------+
|  Name|Age|Experience|Salary|Independ Feature|
+------+---+----------+------+----------------+
| Illia| 20|        10|  1000|     [20.0,10.0]|
| Artem| 21|         8|  8000|      [21.0,8.0]|
|  Vlad| 30|        20| 20000|     [30.0,20.0]|
| Lesia| 45|        10| 15000|     [45.0,10.0]|
|Nastya| 20|         2| 18000|      [20.0,2.0]|
| Vania| 27|         6| 30000|      [27.0,6.0]|
|  Dima| 34|         5|  3000|      [34.0,5.0]|
|  Roma| 54|         4| 15000|      [54.0,4.0]|
+------+---+----------+------+----------------+



In [10]:
# List the column names after the feature vector has been added.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independ Feature']

In [15]:
# Keep only the two columns used for modelling: the feature vector and Salary.
finalised_data = output.select('Independ Feature', 'Salary')

In [16]:
# Display the modelling DataFrame (feature vector + Salary).
finalised_data.show()

+----------------+------+
|Independ Feature|Salary|
+----------------+------+
|     [20.0,10.0]|  1000|
|      [21.0,8.0]|  8000|
|     [30.0,20.0]| 20000|
|     [45.0,10.0]| 15000|
|      [20.0,2.0]| 18000|
|      [27.0,6.0]| 30000|
|      [34.0,5.0]|  3000|
|      [54.0,4.0]| 15000|
+----------------+------+



In [20]:
from pyspark.ml.regression import LinearRegression

# Train/test split with weights 0.75 / 0.25. The split is random, so a fixed
# seed is passed to make it (and every fitted number below) repeatable on
# Restart & Run All. NOTE(review): the dataset has only 8 rows, so the test
# set will be tiny and the actual split sizes may deviate from 75/25.
train_data, test_data = finalised_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so `regressor` always refers
# to one kind of object: the fitted model that the following cells inspect.
lr_estimator = LinearRegression(featuresCol='Independ Feature', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [21]:
### Coefficients
# One fitted coefficient per assembled input column (Age, Experience).
regressor.coefficients

DenseVector([92.5845, 114.4644])

In [22]:
### Intercept
# The fitted model's intercept term.
regressor.intercept

9947.43518616639

In [23]:
### Evaluate on the held-out test split
# evaluate() returns a result object; its `.predictions` DataFrame (shown in
# the next cell) holds the test rows together with a `prediction` column.
pred_results = regressor.evaluate(test_data)

In [24]:
# Show the test rows next to the model's predicted Salary.
pred_results.predictions.show()

+----------------+------+------------------+
|Independ Feature|Salary|        prediction|
+----------------+------+------------------+
|     [45.0,10.0]| 15000|15258.382630702963|
+----------------+------+------------------+

