# Examples of Pyspark ML

In [1]:
import findspark

In [None]:
#pip install pyspark

In [None]:
#pip install -q findspark

In [2]:
findspark.init()

In [4]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Missing').getOrCreate()

In [5]:
spark

In [6]:
training=spark.read.csv("C:\\CSV Files\\csv3.csv",header=True, inferSchema=True)

In [7]:
training.show()

+-------+---+----------+------+
|   Name|Age|Experience|salary|
+-------+---+----------+------+
|  Irfan| 26|         5| 22000|
| Sandra| 32|         8| 12000|
| Mehmad| 25|         6| 15000|
| tamara| 19|         4| 19000|
|  soham| 22|         3| 21000|
|shubham| 24|         4| 24000|
+-------+---+----------+------+



In [8]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [9]:
training.columns

['Name', 'Age', 'Experience', 'salary']

In [None]:
#  [Age, Experience]---> new feature ---> independant feature   #group columns and create new columns called independant feature

In [None]:
## for this we use vector assembler

In [10]:
from pyspark.ml.feature import VectorAssembler

In [12]:
featureassembler =VectorAssembler(inputCols=["Age","Experience"], outputCol="Independant feature") 

In [13]:
output=featureassembler.transform(training)

In [14]:
output.show()

+-------+---+----------+------+-------------------+
|   Name|Age|Experience|salary|Independant feature|
+-------+---+----------+------+-------------------+
|  Irfan| 26|         5| 22000|         [26.0,5.0]|
| Sandra| 32|         8| 12000|         [32.0,8.0]|
| Mehmad| 25|         6| 15000|         [25.0,6.0]|
| tamara| 19|         4| 19000|         [19.0,4.0]|
|  soham| 22|         3| 21000|         [22.0,3.0]|
|shubham| 24|         4| 24000|         [24.0,4.0]|
+-------+---+----------+------+-------------------+



In [15]:
output.columns

['Name', 'Age', 'Experience', 'salary', 'Independant feature']

In [None]:
## now Independant feature is our input feature and salary is our output feature to predict

In [16]:
finalized_data=output.select("Independant feature","salary")

In [17]:
finalized_data.show()

+-------------------+------+
|Independant feature|salary|
+-------------------+------+
|         [26.0,5.0]| 22000|
|         [32.0,8.0]| 12000|
|         [25.0,6.0]| 15000|
|         [19.0,4.0]| 19000|
|         [22.0,3.0]| 21000|
|         [24.0,4.0]| 24000|
+-------------------+------+



In [None]:
# now we do train test split

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
train_data, test_data = finalized_data.randomSplit([0.75,0.25])

In [26]:
train_data.show()

+-------------------+------+
|Independant feature|salary|
+-------------------+------+
|         [19.0,4.0]| 19000|
|         [22.0,3.0]| 21000|
|         [24.0,4.0]| 24000|
|         [32.0,8.0]| 12000|
+-------------------+------+



In [20]:
regressor=LinearRegression(featuresCol='Independant feature', labelCol='salary')
regressor=regressor.fit(train_data)

In [None]:
## coefficients

In [21]:
regressor.coefficients

DenseVector([428.5714, -3000.0])

In [None]:
### intercepts

In [22]:
regressor.intercept

22857.14285714272

In [None]:
### prediction on test data

In [23]:
pred_results = regressor.evaluate(test_data)

In [24]:
pred_results.predictions.show()

+-------------------+------+------------------+
|Independant feature|salary|        prediction|
+-------------------+------+------------------+
|         [25.0,6.0]| 15000| 15571.42857142854|
|         [26.0,5.0]| 22000|19000.000000000015|
+-------------------+------+------------------+



In [27]:
pred_results.meanAbsoluteError

1785.7142857142626

In [28]:
pred_results.meanSquaredError

4663265.306122387