# Machine Learning

MLlib is Spark’s machine learning (ML) library. Its goal is to make practical machine learning scalable and easy.

At a high level, it provides tools such as:

**ML Algorithms:** Common learning algorithms such as classification, regression, clustering, and collaborative filtering.

**Featurization:** Feature extraction, transformation, dimensionality reduction, and selection.

**Pipelines:** Tools for constructing, evaluating, and tuning ML Pipelines.

**Persistence:** Saving and loading algorithms, models, and Pipelines.

**Utilities:** Linear algebra, statistics, data handling, etc.

In [1]:
import pyspark

In [2]:
import findspark

In [3]:
# NOTE(review): `pyspark` was already imported in the first cell, before this
# call. findspark.init() is normally run *before* importing pyspark so that the
# package can be located -- confirm whether this call (or the import order)
# is still needed in this environment.
findspark.init()

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the Spark session used by every cell below, registered under the
# application name 'Session 3'.
spark = (
    SparkSession.builder
    .appName('Session 3')
    .getOrCreate()
)

In [6]:
# Bare expression as the last line of a cell: the notebook renders the
# session object (its output is not recorded in this export).
spark

In [8]:
# Load the sample CSV. The first line is treated as the header row and column
# types are inferred from the data instead of defaulting to strings.
df = spark.read.csv(
    'dataset/test1.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Print the loaded rows as a text table to check the CSV parsed as expected.
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Combine the numeric predictor columns into one vector column, which is the
# input format the regression estimator below is configured to read.
assembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='Independent Feature',
)

In [13]:
# Apply the assembler: the result keeps the original columns and appends the
# 'Independent Feature' vector column.
assembler_df = assembler.transform(df)

In [14]:
# Inspect the assembled rows, including the new vector column.
assembler_df.show()

+---------+---+----------+------+-------------------+
|     Name|age|Experience|Salary|Independent Feature|
+---------+---+----------+------+-------------------+
|    Krish| 31|        10| 30000|        [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|         [30.0,8.0]|
|    Sunny| 29|         4| 20000|         [29.0,4.0]|
|     Paul| 24|         3| 20000|         [24.0,3.0]|
|   Harsha| 21|         1| 15000|         [21.0,1.0]|
|  Shubham| 23|         2| 18000|         [23.0,2.0]|
+---------+---+----------+------+-------------------+



In [15]:
# Confirm the appended column is reported with type `vector`.
assembler_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Independent Feature: vector (nullable = true)



In [16]:
# Keep only what the regression needs: the feature vector and the label.
final_df = assembler_df.select('Independent Feature', 'Salary')

In [17]:
# Inspect the two-column modelling dataset (features, label).
final_df.show()

+-------------------+------+
|Independent Feature|Salary|
+-------------------+------+
|        [31.0,10.0]| 30000|
|         [30.0,8.0]| 25000|
|         [29.0,4.0]| 20000|
|         [24.0,3.0]| 20000|
|         [21.0,1.0]| 15000|
|         [23.0,2.0]| 18000|
+-------------------+------+



In [56]:
# Randomly split the rows into train and test sets using weights 0.70 / 0.30.
# NOTE(review): no `seed` is passed, so the split differs between runs and the
# tables recorded below are not reproducible. With only six rows, one side may
# even end up empty. Consider passing `seed=` to randomSplit.
train_df, test_df = final_df.randomSplit([0.70, 0.30])

In [57]:
# Show which rows landed in the training split.
train_df.show()

+-------------------+------+
|Independent Feature|Salary|
+-------------------+------+
|         [21.0,1.0]| 15000|
|         [23.0,2.0]| 18000|
|         [24.0,3.0]| 20000|
|         [29.0,4.0]| 20000|
|         [30.0,8.0]| 25000|
+-------------------+------+



In [58]:
# Show which rows landed in the held-out test split.
test_df.show()

+-------------------+------+
|Independent Feature|Salary|
+-------------------+------+
|        [31.0,10.0]| 30000|
+-------------------+------+



In [59]:
from pyspark.ml.regression import LinearRegression

In [60]:
# Configure the linear-regression estimator and fit it on the training split
# in a single expression, so `model` only ever names the fitted model.
model = LinearRegression(
    featuresCol='Independent Feature',
    labelCol='Salary',
).fit(train_df)

In [61]:
# Fitted weights; presumably one per assembled input column, in the order
# given to the assembler (age, Experience).
model.coefficients

DenseVector([28.4757, 1271.3568])

In [62]:
# Fitted intercept (constant term) of the regression.
model.intercept

14299.832495812996

In [63]:
# Evaluate the fitted model on the held-out split. The returned summary
# object exposes the predictions and the error metrics read in later cells.
result = model.evaluate(test_df)

In [64]:
# Test rows with the model's `prediction` column next to the true Salary.
result.predictions.show()



+-------------------+------+------------------+
|Independent Feature|Salary|        prediction|
+-------------------+------+------------------+
|        [31.0,10.0]| 30000|27896.147403685147|
+-------------------+------+------------------+



In [65]:
# Mean squared error on the test split.
result.meanSquaredError

4426195.747020748

In [66]:
# Mean absolute error on the test split.
result.meanAbsoluteError

2103.852596314853

In [67]:
# Root mean squared error on the test split.
# NOTE: in the recorded run the test split holds a single row, so RMSE equals
# MAE (both are that one row's absolute error). These metrics say little about
# generalisation on so few samples.
result.rootMeanSquaredError

2103.852596314853

In [69]:
# Predictions on the training split itself (in-sample fit), for comparison
# with the held-out prediction above.
model.evaluate(train_df).predictions.show()



+-------------------+------+------------------+
|Independent Feature|Salary|        prediction|
+-------------------+------+------------------+
|         [21.0,1.0]| 15000|16169.179229480775|
|         [23.0,2.0]| 18000|17497.487437185944|
|         [24.0,3.0]| 20000|18797.319932998347|
|         [29.0,4.0]| 20000| 20211.05527638182|
|         [30.0,8.0]| 25000|25324.958123953118|
+-------------------+------+------------------+

