# Introduction to the PySpark MLlib Library for Linear Regression

In [8]:
import pandas as pd

# Quick pandas look at the raw CSV before loading the same file into Spark.
data = pd.read_csv(filepath_or_buffer="salary.csv")
data.head(n=5)

Unnamed: 0,Name,Dept,Age,Exp,Sal
0,A,D1,29,5,10000
1,B,D3,25,3,5000
2,C,D1,22,2,4000
3,C,D2,25,2,4000
4,D,D2,37,7,20000


In [9]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session if there is one,
# otherwise it starts a new session with the app name given here.
spark = (
    SparkSession.builder
    .appName("Missing")
    .getOrCreate()
)

In [11]:
# Read the CSV with its first row as column names and let Spark infer
# column types (otherwise every column would be loaded as a string).
training = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("salary.csv")
)
training.show()

+----+----+---+---+-----+
|Name|Dept|Age|Exp|  Sal|
+----+----+---+---+-----+
|   A|  D1| 29|  5|10000|
|   B|  D3| 25|  3| 5000|
|   C|  D1| 22|  2| 4000|
|   C|  D2| 25|  2| 4000|
|   D|  D2| 37|  7|20000|
|   C|  D1| 22|  1| 3000|
|   B|  D2| 23|  1| 1000|
|   A|  D1| 32|  1| 2000|
|   C|  D3| 24|  4| 7000|
+----+----+---+---+-----+



In [12]:
# Print each column's name, inferred type and nullability to confirm that
# inferSchema produced numeric types for Age, Exp and Sal.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Exp: integer (nullable = true)
 |-- Sal: integer (nullable = true)



In [13]:
# List the column names available for feature selection.
training.columns

['Name', 'Dept', 'Age', 'Exp', 'Sal']

## Data Pre-Processing in PySpark
1. In PySpark, `VectorAssembler` is used to combine the independent features into a single vector column.

2. The result is a new column that holds the grouped independent features for each row.

In [15]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictors (Age, Exp) into one vector column, which is
# the input format the regression estimator is configured to read below.
featureassembler = VectorAssembler(
    inputCols=["Age", "Exp"],
    outputCol="Independent Features",
)
output = featureassembler.transform(training)
output.show()

+----+----+---+---+-----+--------------------+
|Name|Dept|Age|Exp|  Sal|Independent Features|
+----+----+---+---+-----+--------------------+
|   A|  D1| 29|  5|10000|          [29.0,5.0]|
|   B|  D3| 25|  3| 5000|          [25.0,3.0]|
|   C|  D1| 22|  2| 4000|          [22.0,2.0]|
|   C|  D2| 25|  2| 4000|          [25.0,2.0]|
|   D|  D2| 37|  7|20000|          [37.0,7.0]|
|   C|  D1| 22|  1| 3000|          [22.0,1.0]|
|   B|  D2| 23|  1| 1000|          [23.0,1.0]|
|   A|  D1| 32|  1| 2000|          [32.0,1.0]|
|   C|  D3| 24|  4| 7000|          [24.0,4.0]|
+----+----+---+---+-----+--------------------+



In [17]:
# Keep only what the model needs: the assembled feature vector and the
# label column (Sal).
finalized_data = output.select(["Independent Features", "Sal"])
finalized_data.show()

+--------------------+-----+
|Independent Features|  Sal|
+--------------------+-----+
|          [29.0,5.0]|10000|
|          [25.0,3.0]| 5000|
|          [22.0,2.0]| 4000|
|          [25.0,2.0]| 4000|
|          [37.0,7.0]|20000|
|          [22.0,1.0]| 3000|
|          [23.0,1.0]| 1000|
|          [32.0,1.0]| 2000|
|          [24.0,4.0]| 7000|
+--------------------+-----+



In [23]:
from pyspark.ml.regression import LinearRegression

# Seed the split so that a Restart & Run All reproduces the same train/test
# rows (for the same input partitioning) and therefore the same fitted
# coefficients and error metrics. Without a seed every run differs.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.60, 0.40], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names:
# `lin_reg` is the estimator, `reg` is the fitted model that later cells
# query (`reg.coefficients`, `reg.intercept`, `reg.evaluate`). Re-running this
# cell is now idempotent because `reg` is never fed back into itself.
lin_reg = LinearRegression(featuresCol="Independent Features", labelCol="Sal")
reg = lin_reg.fit(train_data)

In [24]:
# Fitted weights of the linear model, one per entry of the assembled
# feature vector.
reg.coefficients

DenseVector([534.4311, 1821.8563])

In [25]:
# Fitted intercept (bias) term of the linear model.
reg.intercept

-12796.407185629623

In [26]:
# Score the held-out rows. The returned summary object carries both the
# per-row predictions and the aggregate error metrics read in the next cell.
pred_results = reg.evaluate(test_data)
pred_results.predictions.show(n=20, truncate=True)

+--------------------+-----+------------------+
|Independent Features|  Sal|        prediction|
+--------------------+-----+------------------+
|          [22.0,1.0]| 3000| 782.9341317365852|
|          [25.0,2.0]| 4000|  4208.08383233542|
|          [29.0,5.0]|10000|11811.377245508937|
|          [32.0,1.0]| 2000| 6127.245508982573|
+--------------------+-----+------------------+



In [27]:
# Display (MAE, MSE) on the test split as a tuple.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(2090.943113772586, 6318480.74061559)