### Basic Machine Learning with Spark - Clustering

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Obtain the notebook-wide Spark session (reuses an existing one if present).
spark = (
    SparkSession.builder
    .getOrCreate()
)

23/07/15 11:24:50 WARN Utils: Your hostname, bhajji resolves to a loopback address: 127.0.1.1; using 192.168.2.24 instead (on interface wlp2s0)
23/07/15 11:24:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/15 11:24:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/15 11:24:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/15 11:24:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Location of the server-utilization records, relative to the notebook.
json_df2_path = "./Data/utilization.json"

# Read the JSON file into a DataFrame (schema is inferred by Spark).
df_util = spark.read.json(json_df2_path)

                                                                                

In [3]:
# Print a tabular preview of the loaded utilization data.
df_util.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [4]:
# Combine the three numeric columns into a single vector column named
# "features", which is the input column the clustering estimator reads.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(["cpu_utilization", "free_memory", "session_count"])
    .setOutputCol("features")
)

In [5]:
# Append the assembled "features" vector column to the utilization rows.
vcluster_df = vectorAssembler.transform(df_util)

In [6]:
# Preview the rows together with the newly added "features" column.
vcluster_df.show()

+---------------+-------------------+-----------+---------+-------------+----------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|        features|
+---------------+-------------------+-----------+---------+-------------+----------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|[0.57,0.51,47.0]|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|[0.47,0.62,43.0]|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|[0.56,0.57,62.0]|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|[0.57,0.56,50.0]|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|[0.35,0.46,43.0]|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|[0.41,0.58,48.0]|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|[0.57,0.35,58.0]|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58| [0.41,0.4,58.0]|

In [7]:
# K-means estimator with three clusters; the fixed seed makes the centroid
# initialisation repeatable across runs.
# NOTE(review): the features are not scaled, and session_count spans a much
# larger range than the two ratio columns, so it likely dominates the
# distance metric -- consider standardising before clustering.
kmeans = KMeans(k=3, seed=1)

In [8]:
# Fit the k-means estimator on the assembled feature vectors.
kmodel = kmeans.fit(vcluster_df)

                                                                                

In [9]:
# Show the fitted centroids; each array is ordered like the assembler inputs
# (cpu_utilization, free_memory, session_count).
kmodel.clusterCenters()

[array([ 0.62881549,  0.37094643, 70.43030159]),
 array([ 0.52047775,  0.47836303, 51.79927162]),
 array([ 0.71931575,  0.28104316, 88.23965784])]

### Basic Machine Learning with Spark - Linear Regression

In [10]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [11]:
# Re-read the same utilization file used in the clustering section so this
# section starts from an unmodified DataFrame.
json_df2_path = "./Data/utilization.json"
df_util = spark.read.json(json_df2_path)

                                                                                

In [12]:
# The regression uses a single predictor, cpu_utilization, assembled into
# the "features" vector column.
#
# This assembler gets its own name instead of rebinding `vectorAssembler`:
# that global holds the three-column assembler from the clustering section,
# and overwriting it would make re-running the clustering transform cell
# silently produce one-feature vectors.
lr_assembler = VectorAssembler(inputCols=["cpu_utilization"], outputCol="features")
df_vutil = lr_assembler.transform(df_util)

# Preview the rows with the single-element feature vectors.
df_vutil.show()

[Stage 33:>                                                         (0 + 1) / 1]

+---------------+-------------------+-----------+---------+-------------+--------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|features|
+---------------+-------------------+-----------+---------+-------------+--------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|  [0.57]|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|  [0.47]|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|  [0.56]|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|  [0.57]|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|  [0.35]|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|  [0.41]|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|  [0.57]|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|  [0.41]|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|  [0.53]|
|   

                                                                                

In [13]:
# Linear regression estimator: predict session_count from the assembled
# "features" vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="session_count",
)

In [14]:
# Fit the regression on the full assembled DataFrame (no train/test split).
lrModel = lr.fit(df_vutil)

23/07/15 11:26:58 WARN Instrumentation: [e2d6e2f0] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [15]:
# Fitted coefficient vector; it has one entry because "features" holds only
# cpu_utilization.
lrModel.coefficients

DenseVector([47.024])

In [16]:
# Fitted intercept term of the regression line.
lrModel.intercept

40.416951035523695

In [17]:
# Root-mean-squared error reported by the model summary.
# NOTE(review): presumably measured on the same data passed to fit(), so it
# is not an out-of-sample error estimate -- confirm against the summary docs.
lrModel.summary.rootMeanSquaredError

12.83799022593174