In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
## Reuse the active SparkSession if one exists; otherwise start a new one with default settings.
spark = SparkSession.builder.getOrCreate()

In [3]:
## Root directory of the input data. Defaults to the original local path; set the
## SPARK_DATA_PATH environment variable to run the notebook on another machine
## without editing this cell.
data_path = os.environ.get('SPARK_DATA_PATH', 'C:/Users/pcmr/spark/data')

In [4]:
## Full path of the utilization data set (JSON) inside the data directory.
json_df2_path = f'{data_path}/utilization.json'

In [7]:
## Load the utilization records; the JSON reader infers the schema from the data.
df_util = spark.read.json(json_df2_path)

In [8]:
## Preview the first 10 rows to check the loaded columns and values.
df_util.show(10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
+---------------+-------------------+-

In [10]:
## Register the DataFrame as a temporary view so spark.sql() queries can refer to it as `utilization`.
df_util.createOrReplaceTempView('utilization')

In [11]:
## Total number of rows loaded.
df_util.count()

500000

In [12]:
## High-level summary statistics (count, mean, stddev, min, max) for each column.
## event_datetime is a string column, so its mean/stddev are NULL and its
## min/max are string comparisons, not chronological ones.
df_util.describe().show()

+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|       free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+------------------+------------------+------------------+
|  count|             500000|             500000|            500000|            500000|            500000|
|   mean| 0.6205177400000016|               NULL|0.3791281000000014|             124.5|          69.59616|
| stddev|0.15875173872912912|               NULL| 0.158309312783762|14.430884120553037|14.850676696352764|
|    min|               0.22|03/05/2019 08:06:14|               0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|              0.78|               149|               105|
+-------+-------------------+-------------------+------------------+------------------+------------------+



In [13]:
## Pearson correlation between CPU utilization and free memory
## (values near -1 or +1 indicate a strong linear relationship, 0 none).
df_util.stat.corr('cpu_utilization', 'free_memory')

-0.470477157308072

In [14]:
## Pearson correlation between the number of sessions and free memory.
df_util.stat.corr('session_count', 'free_memory')

-0.5008320848876557

In [15]:
## Frequent items per column. freqItems uses an approximate algorithm, so the
## returned lists may contain false positives.
df_util.stat.freqItems(('server_id', 'session_count')).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[137, 146, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [17]:
## Sample roughly half of the rows without replacement. `fraction` is a per-row
## inclusion probability, so the sample size is approximately (not exactly)
## fraction * row count. A fixed seed keeps the sample stable across re-runs;
## the exact count therefore depends on the seed.
df_util_sample = df_util.sample(withReplacement=False, fraction=0.5, seed=42)

In [20]:
## Row count of the sample; expected to be close to, but not exactly, half of the full data set.
df_util_sample.count()

250154

In [21]:
## Global CPU utilization range and spread, computed with SQL over the temp view.
spark.sql(
    'SELECT min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) '
    'FROM utilization'
).show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|    0.15875173872912912|
+--------------------+--------------------+-----------------------+



In [23]:
## CPU utilization range and spread per server.
## A triple-quoted string replaces the backslash line continuations, which break
## as soon as any whitespace follows a backslash.
spark.sql("""
    SELECT server_id,
           min(cpu_utilization),
           max(cpu_utilization),
           stddev(cpu_utilization)
    FROM utilization
    GROUP BY server_id
""").show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      103|                0.56|                0.96|    0.11617507884178278|
|      104|                0.51|                0.91|    0.11521679513850488|
|      100|                0.27|                0.67|     0.1152264191787964|
|      101|                 0.6|                 1.0|    0.11651726263197697|
|      102|                0.56|                0.96|    0.11549678751286807|
|      107|                0.45|                0.85|    0.11597417369783877|
|      106|                0.22|                0.62|    0.11531539914568233|
|      105|                0.29|                0.69|    0.11510721467869486|
|      108|                0.55|                0.95|    0.11563100171171929|
|      112|                0.52|                0.92|    0.11528

In [24]:
## Bucket CPU utilization into tenths: FLOOR(cpu_utilization*100/10) maps a value
## in [k/10, (k+1)/10) to bucket k, e.g. 0.57 -> 5, and exactly 1.0 -> 10.
spark.sql('SELECT server_id, FLOOR(cpu_utilization*100/10) bucket FROM utilization').show()

+---------+------+
|server_id|bucket|
+---------+------+
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     4|
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     6|
|      100|     6|
|      100|     5|
|      100|     2|
|      100|     4|
|      100|     4|
|      100|     6|
|      100|     4|
|      100|     5|
+---------+------+
only showing top 20 rows



In [27]:
## How often does the CPU utilization fall into each bucket?
## GROUP BY / ORDER BY refer to the `bucket` alias defined in the SELECT list.
## A triple-quoted string replaces the fragile backslash line continuations.
spark.sql("""
    SELECT count(*),
           FLOOR(cpu_utilization*100/10) bucket
    FROM utilization
    GROUP BY bucket
    ORDER BY bucket
""").show()

+--------+------+
|count(1)|bucket|
+--------+------+
|    8186|     2|
|   37029|     3|
|   68046|     4|
|  104910|     5|
|  116725|     6|
|   88242|     7|
|   56598|     8|
|   20207|     9|
|      57|    10|
+--------+------+



In [None]:
## Time series analysis with DataFrames (SQL window functions)

In [30]:
## Per-server CPU utilization range and spread, shown again as a baseline for
## the windowed queries in this section.
## A triple-quoted string replaces the fragile backslash line continuations.
spark.sql("""
    SELECT server_id,
           min(cpu_utilization),
           max(cpu_utilization),
           stddev(cpu_utilization)
    FROM utilization
    GROUP BY server_id
""").show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      103|                0.56|                0.96|    0.11617507884178278|
|      104|                0.51|                0.91|    0.11521679513850488|
|      100|                0.27|                0.67|     0.1152264191787964|
|      101|                 0.6|                 1.0|    0.11651726263197697|
|      102|                0.56|                0.96|    0.11549678751286807|
|      107|                0.45|                0.85|    0.11597417369783877|
|      106|                0.22|                0.62|    0.11531539914568233|
|      105|                0.29|                0.69|    0.11510721467869486|
|      108|                0.55|                0.95|    0.11563100171171929|
|      112|                0.52|                0.92|    0.11528

In [31]:
## Attach each server's overall average CPU utilization to every one of its rows.
## The window has no ORDER BY, so the average covers the whole partition.
## A triple-quoted string replaces the fragile backslash line continuations.
sql_window = spark.sql("""
    SELECT event_datetime,
           server_id,
           cpu_utilization,
           avg(cpu_utilization) OVER (PARTITION BY server_id) avg_server_util
    FROM utilization
""")

In [32]:
## Display the first rows; avg_server_util is constant within each server_id.
sql_window.show()

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:31|      110|           0.68|0.5537749999999892|
|03/05/2019 08:11:31|      110|           0.58|0.5537749999999892|
|03/05/2019 08:16:31|      110|           0.55|0.5537749999999892|
|03/05/2019 08:21:31|      110|           0.63|0.5537749999999892|
|03/05/2019 08:26:31|      110|           0.63|0.5537749999999892|
|03/05/2019 08:31:31|      110|           0.71|0.5537749999999892|
|03/05/2019 08:36:31|      110|           0.67|0.5537749999999892|
|03/05/2019 08:41:31|      110|           0.55|0.5537749999999892|
|03/05/2019 08:46:31|      110|           0.37|0.5537749999999892|
|03/05/2019 08:51:31|      110|            0.7|0.5537749999999892|
|03/05/2019 08:56:31|      110|           0.67|0.5537749999999892|
|03/05/2019 09:01:31|      110|           0.56|0.5537749999999

In [36]:
## Per-row deviation from the server's average CPU utilization:
## delta_server_util = cpu_utilization - avg over all rows of the same server.
## A triple-quoted string replaces the fragile backslash line continuations.
sql_window2 = spark.sql("""
    SELECT event_datetime,
           server_id,
           cpu_utilization,
           avg(cpu_utilization) OVER (PARTITION BY server_id) avg_server_util,
           cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id) delta_server_util
    FROM utilization
""")

In [37]:
## Display the first rows, including the deviation from the per-server average.
sql_window2.show()

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:06:31|      110|           0.68|0.5537749999999892|  0.1262250000000108|
|03/05/2019 08:11:31|      110|           0.58|0.5537749999999892|0.026225000000010712|
|03/05/2019 08:16:31|      110|           0.55|0.5537749999999892|-0.00377499999998...|
|03/05/2019 08:21:31|      110|           0.63|0.5537749999999892| 0.07622500000001076|
|03/05/2019 08:26:31|      110|           0.63|0.5537749999999892| 0.07622500000001076|
|03/05/2019 08:31:31|      110|           0.71|0.5537749999999892| 0.15622500000001072|
|03/05/2019 08:36:31|      110|           0.67|0.5537749999999892| 0.11622500000001079|
|03/05/2019 08:41:31|      110|           0.55|0.5537749999999892|-0.00377499999998...|
|03/05/2019 08:46:31|      110| 

In [39]:
## Centered moving average of CPU utilization per server over a 3-row window
## (previous, current and next reading; fewer rows at the partition edges).
## event_datetime is an 'MM/dd/yyyy HH:mm:ss' string, so ordering by the raw
## column is lexicographic and only matches time order while all rows share
## one year. Parsing it with to_timestamp orders the window chronologically.
sql_window3 = spark.sql("""
    SELECT event_datetime,
           server_id,
           cpu_utilization,
           avg(cpu_utilization) OVER (
               PARTITION BY server_id
               ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
           ) avg_server_util
    FROM utilization
""")

In [40]:
## Display the first rows of the per-server moving average.
sql_window3.show()

+-------------------+---------+---------------+-------------------+
|     event_datetime|server_id|cpu_utilization|    avg_server_util|
+-------------------+---------+---------------+-------------------+
|03/05/2019 08:06:31|      110|           0.68|               0.63|
|03/05/2019 08:11:31|      110|           0.58| 0.6033333333333334|
|03/05/2019 08:16:31|      110|           0.55| 0.5866666666666666|
|03/05/2019 08:21:31|      110|           0.63| 0.6033333333333334|
|03/05/2019 08:26:31|      110|           0.63| 0.6566666666666666|
|03/05/2019 08:31:31|      110|           0.71| 0.6699999999999999|
|03/05/2019 08:36:31|      110|           0.67| 0.6433333333333333|
|03/05/2019 08:41:31|      110|           0.55| 0.5300000000000001|
|03/05/2019 08:46:31|      110|           0.37|               0.54|
|03/05/2019 08:51:31|      110|            0.7|               0.58|
|03/05/2019 08:56:31|      110|           0.67| 0.6433333333333334|
|03/05/2019 09:01:31|      110|           0.56| 