In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import os
from pyspark.sql import functions as F

# DATA_DIR_L =  
# DATA_DIR = DATA_DIR_L

DATA_DIR_M = "/home/masa/Downloads/google-cloud-sdk/our_data"
DATA_DIR = DATA_DIR_M

In [2]:
# We use spark session in order to use DataFrames
ss = SparkSession.builder \
    .appName("GoogleClusterAnalysis") \
    .master("local[2]") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/23 09:28:31 WARN Utils: Your hostname, masa-VirtualBox, resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/12/23 09:28:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/23 09:28:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
ss.sparkContext.setLogLevel("ERROR")

### 1. What is the distribution of the machines according to their CPU capacity? Can you explain (motivate) it?

https://www.researchgate.net/publication/261164671_Characterizing_Machines_and_Workloads_on_a_Google_Cluster

TODO: add comments and conclusion

In [4]:
machine_schema = StructType([
    StructField('time', LongType(), True),
    StructField('machine_ID', LongType(), True),
    StructField('event_type', IntegerType(), True),
    StructField('platform_ID', StringType(), False),
    StructField('CPUs', DoubleType(), False),
    StructField('memory_capacity', DoubleType(), False),
    ])

In [5]:
df_machines = ss.read.csv(os.path.join(DATA_DIR,"machine_events/*.csv.gz"), schema=machine_schema)
df_machines.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+----+----------+----------+--------------------+----+---------------+
|time|machine_ID|event_type|         platform_ID|CPUs|memory_capacity|
+----+----------+----------+--------------------+----+---------------+
|   0|         5|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|
|   0|         6|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|
|   0|         7|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|
|   0|        10|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|
|   0|        13|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|
+----+----------+----------+--------------------+----+---------------+
only showing top 5 rows


                                                                                

In [6]:
df_machines.select('machine_ID').distinct().count()

                                                                                

12583

In [7]:
df_machines.select("event_type").distinct().count()

                                                                                

3

In [8]:
df_machines.filter(df_machines["event_type"] == 0).distinct().count()

                                                                                

21443

In [9]:
total_count_t0 = df_machines.filter(df_machines["time"] == 0).distinct().count()
total_count_t0

                                                                                

12477

In [10]:
df_machines.select("machine_ID").filter(df_machines["time"] == 0).distinct().count()

12477

In [11]:
df_machines.select("CPUs").filter(df_machines["time"] == 0).distinct().show()

+----+
|CPUs|
+----+
| 1.0|
| 0.5|
|0.25|
+----+



In [12]:
df_ex1 = df_machines.select(['machine_ID', 'CPUs']).filter(df_machines["time"] == 0).groupby('CPUs').count()
df_ex1.show()

+----+-----+
|CPUs|count|
+----+-----+
| 1.0|  791|
| 0.5|11563|
|0.25|  123|
+----+-----+



In [13]:
df_ex1.withColumn("percentage", F.round((F.col("count") / total_count_t0) * 100)).show()

+----+-----+----------+
|CPUs|count|percentage|
+----+-----+----------+
| 1.0|  791|       6.0|
| 0.5|11563|      93.0|
|0.25|  123|       1.0|
+----+-----+----------+



- at timestamp==0, there are 12477 machines intially started, and for them we want to  find the distribution (of those machines) acoording to their CPU capacity

### 2. What is the percentage of computational power lost due to maintenance (a machine went offline and reconnected later)? The computational power is proportional to both the CPU capacity and the unavailabil- ity period of machines.

- event_type = 1 means that a machine was removed from the cluster. Removals can occur due to
failures or maintenance
- we want to find all the times machien was removed due to maintenance and then reconnected later (event_type=0 in a time step later than for the event_type=1)

In [36]:
from pyspark.sql import Window

In [82]:
window = Window.partitionBy("machine_ID").orderBy("time")

df_next = df_machines.withColumn("next_event", F.lead("event_type").over(window))
df_next = df_next.withColumn("next_time", F.lead("time").over(window))

df_next.show(5)

+------------+----------+----------+--------------------+----+---------------+----------+------------+
|        time|machine_ID|event_type|         platform_ID|CPUs|memory_capacity|next_event|   next_time|
+------------+----------+----------+--------------------+----+---------------+----------+------------+
|           0|         5|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         1|835150655707|
|835150655707|         5|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|836124903464|
|836124903464|         5|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|      NULL|        NULL|
|           0|         6|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|      NULL|        NULL|
|           0|         7|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|      NULL|        NULL|
+------------+----------+----------+--------------------+----+---------------+----------+------------+
only showing top 5 rows


- By partitioning the window by machine_id, we ensure that all events types are processed independently for **each machine**. In taht way, we are sure that event pairings will refer to the same physical machine

- The lead() function returns the following event and time for the same machine. Previously, we ordered events by time, and therefore, "next" refers to the next chronological event in that machine's sequence of events. In that way, we can be sure that we are identifying distinct offline intervals without overlappings

In [156]:
# this is just a check to see if time intervals are not overlapping, and if event types are matching
df_next.filter(df_reloaded["machine_ID"] == 43).show(5)

+------------+----------+----------+--------------------+----+---------------+----------+------------+
|        time|machine_ID|event_type|         platform_ID|CPUs|memory_capacity|next_event|   next_time|
+------------+----------+----------+--------------------+----+---------------+----------+------------+
|           0|        43|         0|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         2|372991930737|
|372991930737|        43|         2|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         2|373198072841|
|373198072841|        43|         2|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         2|727861243083|
|727861243083|        43|         2|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         2|736148630106|
|736148630106|        43|         2|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         2|736768939531|
+------------+----------+----------+--------------------+----+---------------+----------+------------+
only showing top 5 rows


- Now, we want to filter df_next to reflect only situations when machine went offline and was reloaded afterwards. That can be achieved by observing **event_type=1** followed by **next_event=0**

In [160]:
df_reloaded = df_next.filter((df_next["event_type"]==1) & (df_next["next_event"]==0))
df_reloaded.count()

8860

- We notice that this number matches the number of such observed cases of reloaded machines that were mentioned in paper: https://www.researchgate.net/publication/261164671_Characterizing_Machines_and_Workloads_on_a_Google_Cluster  on page 3 (Figure 3) which tells us we're on good trace

In [161]:
# Here we just extract offline time in a column named "dtime"
df_reloaded = df_reloaded.withColumn("dtime", F.col("next_time").cast("long")-F.col("time").cast("long")).drop("time", "next_time")
df_reloaded.show(5)

+----------+----------+--------------------+----+---------------+----------+-----------+
|machine_ID|event_type|         platform_ID|CPUs|memory_capacity|next_event|      dtime|
+----------+----------+--------------------+----+---------------+----------+-----------+
|         5|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|  974247757|
|        10|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|  998726348|
|        13|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|  997280215|
|        23|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|  120851153|
|        26|         1|HofLGzk1Or/8Ildj2...| 0.5|         0.2493|         0|88666880740|
+----------+----------+--------------------+----+---------------+----------+-----------+
only showing top 5 rows


In [162]:
# Now we want to calculate lost resources for each sample as a product of offline time and CPUs, and then we sum over all samples
df_reloaded = df_reloaded.withColumn("lost", df_reloaded["dtime"]*df_reloaded["CPUs"])
total_lost = df_reloaded.agg(F.sum("lost")).collect()[0][0]
total_lost

80273246292457.75

In [163]:
# Now, in order to find percentage of lost resources, we want to find possible total power
total_cpu = df_machines.select("machine_ID", "CPUs").agg(F.sum("CPUs")).collect()[0]["sum(CPUs)"]
trace_bounds = df_machines.agg(
    F.min("time").alias("start"),
    F.max("time").alias("end")
).collect()[0]
total_time = trace_bounds["end"] - trace_bounds["start"]

total_power = total_cpu * total_time

In [164]:
percentage_lost = total_lost/total_power * 100
print(f"Percentage of computational power lost due to maintenance: {percentage_lost:.4f}%")

Percentage of computational power lost due to maintenance: 0.1613%


- Here, we conclude that the total percentage of computational power lost due to mainenance is 0.16%