In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
import os
from pyspark.sql import functions as F
import matplotlib.pyplot as plt

# DATA_DIR_L = ""
# DATA_DIR = DATA_DIR_L

DATA_DIR_M = "/home/masa/Downloads/"
DATA_DIR = DATA_DIR_M

In [6]:
# We use spark session in order to use DataFrames
ss = SparkSession.builder \
    .appName("GoogleClusterAnalysis") \
    .master("local[2]") \
    .getOrCreate()

In [7]:
ss.sparkContext.setLogLevel("ERROR")

For this data exploration we are using [https://github.com/alibaba/clusterdata/tree/master/cluster-trace-v2017](https://github.com/alibaba/clusterdata/tree/master/cluster-trace-v2017) 

### 1. What is the distribution of the machines according to their CPU capacity? Can you explain (motivate) it?

- This question analysis is the same as Google trace analysis question 1.

In [8]:
# Define schema based on Alibaba csv
server_event_schema = StructType([
    StructField('timestamp', LongType(), True),
    StructField('machine_id', LongType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_detail', StringType(), True),
    StructField('cpus', IntegerType(), True),
    StructField('memory', DoubleType(), True),
    StructField('disk', DoubleType(), True),
])

In [9]:
# Load server event data
df_servers = ss.read.csv(
    os.path.join(DATA_DIR, "alibaba-trace-2017/server_event.csv"),
    schema=server_event_schema,
    header=False
)

In [10]:
df_servers.limit(5).toPandas()

                                                                                

Unnamed: 0,timestamp,machine_id,event_type,event_detail,cpus,memory,disk
0,0,1148,add,,64,0.690006,1.0
1,0,1149,add,,64,0.690006,1.0
2,0,1150,add,,64,0.690006,1.0
3,0,1,add,,64,0.68997,1.0
4,0,2,add,,64,0.68997,1.0


In [122]:
# Count distinct machines in df_machines
total_machines = df_servers.select('machine_id').distinct().count()
total_machines

1313

In [12]:
# Count occuring event_types (according to documentation there are 2 (add and remove))
df_servers.select("event_type").distinct().count()

2

In [13]:
# Count the occurance of event_type add
df_servers.filter(df_servers["event_type"] == "add").distinct().count()

                                                                                

1313

In [14]:
# Count the distinct number of event_type add at timestamp 0 (when machines are started)
total_count_t0 = df_servers.filter(df_servers["timestamp"] == 0).distinct().count()
total_count_t0

1313

In [15]:
# Count the distinct machines at timestamp 0 (varifying numbers)
df_servers.select("machine_id").filter(df_servers["timestamp"] == 0).distinct().count()

1313

In [16]:
# Check if some machines have more than one CPU
df_servers.groupBy("machine_id").agg(F.count_distinct("cpus").alias("distinct_cpus")).filter(F.col("distinct_cpus") > 1).show()

+----------+-------------+
|machine_id|distinct_cpus|
+----------+-------------+
|      1075|            2|
|       731|            2|
|       618|            2|
|       689|            2|
|       930|            2|
|       372|            2|
|       401|            2|
+----------+-------------+



In [17]:
# Display the type of CPUs machines are using at timestamp 0
df_servers.select("CPUs").filter(df_servers["timestamp"] == 0).distinct().toPandas()

Unnamed: 0,CPUs
0,64


The CPU capacity distribution of the servers in this dataset is extremely simple and almost homogeneous. 
All 1,313 machines start with exactly 64-core-CPUs, there are no other CPU sizes at cluster initialization. In this data, 7 machines show more than one CPU value across time.

### 2. In general, do tasks from the same job run on the same machine? Comment on the observed locality strategy and its pros and cons.

- This question analysis is the same as Google trace analysis question 7.

In [207]:
# define schema based on alibaba csv
batch_instance_schema = StructType([
    StructField("start_timestamp", LongType(), True),
    StructField("end_timestamp", LongType(), True),
    StructField("job_id", LongType(), True),
    StructField("task_id", LongType(), True),
    StructField("machineID", LongType(), True),
    StructField("status", StringType(), True),
    StructField("seq_no", IntegerType(), True),
    StructField("total_seq_no", IntegerType(), True),
    StructField("real_cpu_max", DoubleType(), True),
    StructField("real_cpu_avg", DoubleType(), True),
    StructField("real_mem_max", DoubleType(), True),
    StructField("real_mem_avg", DoubleType(), True)
])

In [208]:
df_batch_instance = ss.read.csv(
    os.path.join(DATA_DIR, "alibaba-trace-2017/batch_instance.csv"),
    schema=batch_instance_schema,
    header=False
)

In [209]:
# filtering for only scheduled tasks
df_running = df_batch_instance.filter(
    F.col("status").isin("Running", "Terminated", "Failed", "Interrupted")
)
df_running.limit(5).toPandas()

Unnamed: 0,start_timestamp,end_timestamp,job_id,task_id,machineID,status,seq_no,total_seq_no,real_cpu_max,real_cpu_avg,real_mem_max,real_mem_avg
0,41562,41618,120,686,299,Terminated,1,1,1.5,0.29,,
1,41561,41619,120,686,1279,Terminated,1,1,0.89,0.28,,
2,41562,41617,120,686,828,Terminated,1,1,0.94,0.29,,
3,41561,41617,120,686,95,Terminated,1,1,1.0,0.31,,
4,41557,41610,120,686,545,Terminated,1,1,1.37,0.29,,


In [210]:
# for each job count number of unique tasks and number of unique machines used
df_jobs_machines = df_running.groupBy("job_id").agg(
    F.countDistinct("task_id").alias("num_of_tasks"),
    F.countDistinct("machineID").alias("num_of_machines")
)

df_jobs_machines.toPandas()

                                                                                

Unnamed: 0,job_id,num_of_tasks,num_of_machines
0,26.0,12,1210
1,29.0,4,552
2,474.0,1,15
3,964.0,1,1
4,1697.0,12,1155
...,...,...,...
12010,3872.0,1,1
12011,8240.0,1,2
12012,9152.0,1,1
12013,7323.0,1,1


In [211]:
tasks_per_machine = df_jobs_machines.withColumn("tasks_per_machine", F.col("num_of_tasks") / F.col("num_of_machines"))
tasks_per_machine.toPandas()

                                                                                

Unnamed: 0,job_id,num_of_tasks,num_of_machines,tasks_per_machine
0,26.0,12,1210,0.009917
1,29.0,4,552,0.007246
2,474.0,1,15,0.066667
3,964.0,1,1,1.000000
4,1697.0,12,1155,0.010390
...,...,...,...,...
12010,3872.0,1,1,1.000000
12011,8240.0,1,2,0.500000
12012,9152.0,1,1,1.000000
12013,7323.0,1,1,1.000000


As some rows contain single-task jobs (num_of_tasks = 1) they will be dropped for the further analysis.

In [212]:
multi_task_jobs = tasks_per_machine.filter(F.col("num_of_tasks") > 1)
multi_task_jobs.toPandas()

                                                                                

Unnamed: 0,job_id,num_of_tasks,num_of_machines,tasks_per_machine
0,26,12,1210,0.009917
1,29,4,552,0.007246
2,1697,12,1155,0.010390
3,1806,3,491,0.006110
4,1950,20,1085,0.018433
...,...,...,...,...
8954,634,2,1,2.000000
8955,3164,2,1,2.000000
8956,676,2,2,1.000000
8957,3725,2,3,0.666667


In [213]:
# sort list into patterns 
multi_task_jobs = multi_task_jobs.withColumn(
    "category",
    F.when(F.col("tasks_per_machine") > 1, "multiple_tasks_per_machine") # multiple tasks run on a machine
     .when(F.col("tasks_per_machine") == 1, "fully_distributed") # 1 task per machine 
     .when(F.col("tasks_per_machine") < 1, "over_distributed") # more machines than tasks
     .otherwise("?")
)
multi_task_jobs.toPandas()

                                                                                

Unnamed: 0,job_id,num_of_tasks,num_of_machines,tasks_per_machine,category
0,26,12,1210,0.009917,over_distributed
1,29,4,552,0.007246,over_distributed
2,1697,12,1155,0.010390,over_distributed
3,1806,3,491,0.006110,over_distributed
4,1950,20,1085,0.018433,over_distributed
...,...,...,...,...,...
8954,634,2,1,2.000000,multiple_tasks_per_machine
8955,3164,2,1,2.000000,multiple_tasks_per_machine
8956,676,2,2,1.000000,fully_distributed
8957,3725,2,3,0.666667,over_distributed


In [214]:
category_counts = multi_task_jobs.groupBy("category").agg(F.count("*").alias("num_of_tasks"))
category_counts.toPandas()

                                                                                

Unnamed: 0,category,num_of_tasks
0,over_distributed,8639
1,multiple_tasks_per_machine,72
2,fully_distributed,248


In [215]:
# counting the unique jobs 
total_scheduled__multit_jobs = multi_task_jobs.select("job_id").distinct().count()
total_scheduled__multit_jobs

                                                                                

8959

In [216]:
# manual check
8639 + 72 + 248

8959

In [217]:
# calculate percentage
category_counts.withColumn("percentage", F.round((F.col("num_of_tasks") / total_scheduled__multit_jobs) * 100)).toPandas()

                                                                                

Unnamed: 0,category,num_of_tasks,percentage
0,over_distributed,8639,96.0
1,multiple_tasks_per_machine,72,1.0
2,fully_distributed,248,3.0


In comparison to the Google cluster, in the Alibaba cluster, almost all multi-task jobs are over-distributed. 96% of jobs have their tasks spread across more machines than tasks. Only 3% are perfectly distributed (one task per machine), and just 1% show any real colocation of tasks on the same machine. This means Alibaba’s batch scheduler is strongly anti-locality and actively avoids putting multiple tasks of the same job on the same machine.

In the Google cluster, the situation was almost the opposite. The dominant pattern was fully distributed jobs (74% of multi-task jobs run with exactly one task per machine).

### 3. How often does it happen that the resources of a machine are over-committed?

- This question analysis is the same as Google trace analysis question 10.


With the Alibaba data the requested and actually used resources can be checked.

In [136]:
# extract one record per server = machine capactiy
machine_capacity = df_servers.filter(F.col("timestamp") == 0) \
    .select("machine_id", "cpus", "memory") \
    .distinct()

machine_capacity.limit(5).toPandas()

Unnamed: 0,machine_id,cpus,memory
0,155,64,0.68997
1,501,64,0.68997
2,603,64,0.68997
3,805,64,0.68997
4,849,64,0.68997


In [137]:
# server_usage.csv contains actual utilization of CPU and memory
# per machine, sampled every 60 seconds and averaged over 5 minutes
server_usage_schema = StructType([
    StructField("timestamp", LongType(), True),
    StructField("machine_id", LongType(), True),
    StructField("cpu_util", DoubleType(), True),
    StructField("mem_util", DoubleType(), True),
    StructField("disk_util", DoubleType(), True),
    StructField("load1", DoubleType(), True),
    StructField("load5", DoubleType(), True),
    StructField("load15", DoubleType(), True),
])

In [138]:
df_usage = ss.read.csv(
    os.path.join(DATA_DIR, "alibaba-trace-2017/server_usage.csv"),
    schema=server_usage_schema,
    header=False
)

In [139]:
# for each timestamped measurement, compute absolute CPU/memory usage
# by multiplying fraction used by machine capacity
df_machine = df_usage.join(machine_capacity, "machine_id")

df_machine = df_machine.withColumn(
    "cpu_used", F.col("cpu_util") * F.col("cpus")
).withColumn(
    "mem_used", F.col("mem_util") * F.col("memory")
)

df_machine.limit(5).toPandas()

Unnamed: 0,machine_id,timestamp,cpu_util,mem_util,disk_util,load1,load5,load15,cpus,memory,cpu_used,mem_used
0,237,41700,23.38,30.08,42.200001,15.82,13.86,12.64,64,0.690001,1496.319995,20.755224
1,265,39600,26.36,29.54,57.599998,17.46,18.9,16.7,64,0.690001,1687.03999,20.382623
2,770,42600,49.14,60.099999,41.860001,33.2,31.22,30.52,64,0.68997,3144.96001,41.467179
3,776,40800,33.24,47.52,43.599998,21.84,22.1,24.02,64,0.68997,2127.359985,32.787361
4,393,42900,45.72,58.72,42.0,34.1,36.239999,36.92,64,0.68997,2926.080029,40.515022


In [140]:
# detect overconsumption

df_over = df_machine.withColumn(
    "cpu_over",
    (F.col("cpu_used") > F.col("cpus")).cast("int")
).withColumn(
    "mem_over",
    (F.col("mem_used") > F.col("memory")).cast("int")
).withColumn(
    "any_over",
    ((F.col("cpu_used") > F.col("cpus")) | (F.col("mem_used") > F.col("memory"))).cast("int")
).withColumn(
    "both_over",
    ((F.col("cpu_used") > F.col("cpus")) & (F.col("mem_used") > F.col("memory"))).cast("int")
)

df_over.limit(5).toPandas()

Unnamed: 0,machine_id,timestamp,cpu_util,mem_util,disk_util,load1,load5,load15,cpus,memory,cpu_used,mem_used,cpu_over,mem_over,any_over,both_over
0,237,41700,23.38,30.08,42.200001,15.82,13.86,12.64,64,0.690001,1496.319995,20.755224,1,1,1,1
1,265,39600,26.36,29.54,57.599998,17.46,18.9,16.7,64,0.690001,1687.03999,20.382623,1,1,1,1
2,770,42600,49.14,60.099999,41.860001,33.2,31.22,30.52,64,0.68997,3144.96001,41.467179,1,1,1,1
3,776,40800,33.24,47.52,43.599998,21.84,22.1,24.02,64,0.68997,2127.359985,32.787361,1,1,1,1
4,393,42900,45.72,58.72,42.0,34.1,36.239999,36.92,64,0.68997,2926.080029,40.515022,1,1,1,1


In [141]:
total = df_over.count()

cpu_over = df_over.filter("cpu_over = 1").count()
mem_over = df_over.filter("mem_over = 1").count()
any_over = df_over.filter("any_over = 1").count()
both_over = df_over.filter("both_over = 1").count()

In [142]:
print(f"Total samples: {total:,}")
print(f"CPU overcommit: {cpu_over/total*100:.2f}%")
print(f"Memory overcommit: {mem_over/total*100:.2f}%")
print(f"Any overcommit: {any_over/total*100:.2f}%")
print(f"Both overcommit: {both_over/total*100:.2f}%")

Total samples: 187,963
CPU overcommit: 99.67%
Memory overcommit: 100.00%
Any overcommit: 100.00%
Both overcommit: 99.67%


Unlike Google, Alibaba already provides machine-level utilization at timestamped intervals (server_usage.csv). Each row is a "checkpoint" sampled every minute. Therefore, we did not need to reconstruct task execution windows with Spark from start/end times.

The analysis of machine over-commitment reveals a contrast between the Alibaba and Google cluster traces. In Alibaba, nearly all machines were overcommitted throughout the trace period, with 99.67% of measurements exceeding CPU capacity, 100% exceeding memory capacity, and effectively all measurements exceeding at least one resource. Alibaba seems to prioritize maximizing resource utilization even at the cost of persistent overcommitment, while Google prioritizes minimal resource contention, maintaining most machines within their capacity limits.

## Additional analysis

### 4. - Machine Reliability & Failure Analysis

We analyze what errors in machines

In [143]:
# Visual check of df
df_servers.limit(3).toPandas()

Unnamed: 0,timestamp,machine_id,event_type,event_detail,cpus,memory,disk
0,0,1148,add,,64,0.690006,1.0
1,0,1149,add,,64,0.690006,1.0
2,0,1150,add,,64,0.690006,1.0


In [144]:
# We want to compute the same analysis of calculating total power lost due to maintenance, but we conclude something new

from pyspark.sql import Window

window = Window.partitionBy("machine_id").orderBy("timestamp")

df_next = df_servers.withColumn("next_event_type", F.lead("event_type").over(window))
df_next = df_next.withColumn("next_time", F.lead("timestamp").over(window))

df_reloaded = df_next.filter((df_next["event_type"]=='softerror') & (df_next["next_event_type"]=='add'))
df_reloaded.limit(3).toPandas()

Unnamed: 0,timestamp,machine_id,event_type,event_detail,cpus,memory,disk,next_event_type,next_time


We conclude that there are no occurences of adding the same machine after software error. Let's see if there is any addition that is not at timestamp 0!

Let's see if any of the additions occur after the timestamp 0.

In [145]:
df_servers.filter((F.col("event_type") == "add") & (F.col("timestamp") > 0)).show()

+---------+----------+----------+------------+----+------+----+
|timestamp|machine_id|event_type|event_detail|cpus|memory|disk|
+---------+----------+----------+------------+----+------+----+
+---------+----------+----------+------------+----+------+----+



We conclude that machines in this dataset never "reconnected" once they went offline. They stayed offline for the remainder of the trace. Having that in mind, let's analyze softerror events

In [146]:
# First, let's see those events:
df_servers.filter(F.col("event_type") == 'softerror').limit(3).toPandas()

Unnamed: 0,timestamp,machine_id,event_type,event_detail,cpus,memory,disk
0,1351,1075,softerror,agent_check,0,0.0,0.0
1,8611,1075,softerror,agent_check,0,0.0,0.0
2,16052,1075,softerror,agent_check,0,0.0,0.0


We observe that, unlike in Google, there are no removal events in Alibaba trace. Instead, nodes stay in the record even after errors, as we can see above. Since we never see an "add" event to bring them back, that first error is basically the moment the machine "dies". Even if we see more error logs for that same machine later, it’s just the system reporting that it’s still broken. We can see that because resources drop to zero.

Let's analyze number of softerrors per machine

In [147]:
df_err = df_servers.filter(F.col("event_type") == "softerror")
df_err = df_err.groupBy("machine_id").agg(F.count("event_type").alias("number_of_errors"))
df_err.toPandas()

Unnamed: 0,machine_id,number_of_errors
0,1075,12
1,731,1
2,618,2
3,689,3
4,930,12
5,372,5
6,401,4


In [148]:
total_error_machines = df_err.count()
chance_of_machine_error = total_error_machines/total_machines * 100
print(f"Chance of machine error: {chance_of_machine_error:.2f}")

Chance of machine error: 0.53


We observe that out of total of 1313 machines, in only 7 of them errors occur! This means that the reliability of this system is very high! Probability of error happening in a machine is only 0.53%!

### 5. What is the percentage of computational power lost due to errors?

Now, for these 7 machines, we can calculate power lost due to their errors.

In [203]:
max_ts = df_servers.select(F.max("timestamp")).collect()[0][0]

# 1. First, we calculate total possible power of all machines together, 
# by calculating each mahcine's lifetime and multiplying by it's intiial cpus
df_machine_lifetimes = df_servers.groupBy("machine_id").agg(
    F.max("cpus").alias("cpus"),
    (F.max("timestamp") - F.min("timestamp")).alias("lifetime"))

df_machine_lifetimes.limit(5).toPandas()

Unnamed: 0,machine_id,cpus,lifetime
0,26,64,0
1,29,64,0
2,474,64,0
3,964,64,0
4,65,64,0


In [204]:
# Many machines show a lifetime of zero because they only have one record (at the very beginning) and were never removed. 
# This means they stayed healthy until the end of the trace, so their lifetime is equal to total time.
total_time = df_servers.select(F.max("timestamp")).collect()[0][0]
df_machine_lifetimes = df_machine_lifetimes.withColumn("lifetime", F.when(F.col("lifetime") == 0, F.lit(total_time)).otherwise(F.col("lifetime")))

# # Then, we continue calculating total power
df_machine_lifetimes = df_machine_lifetimes.withColumn("power", F.col("lifetime") * F.col("cpus"))
df_machine_lifetimes.limit(3).toPandas()

total_potential_power = df_machine_lifetimes.select(F.sum('power')).collect()[0][0]
total_potential_power

6941982208

In [218]:
# 2. Next, we calculate power lost
df_err_machines = df_servers.filter(F.col("event_type") == 'softerror')
df_err_machines = df_err_machines.groupBy("machine_id").agg(F.min("timestamp").alias("first_error_time"))

# for calculating lost power, we need initial cpus from beginning, that's why we join df_all_machines
df_all_machines = df_servers.filter(F.col("event_type") == 'add').select("machine_id", "cpus").distinct()
df_lost_calc = df_err_machines.join(df_all_machines, "machine_id")
df_lost_calc = df_lost_calc.withColumn("power_lost", (max_ts - F.col("first_error_time")) * F.col("cpus"))
total_power_lost = df_lost_calc.select(F.sum("power_lost")).collect()[0][0]

# 3. Final Percentage
percentage_lost = (total_power_lost / total_potential_power) * 100
print(f"Percentage of total cluster power lost: {percentage_lost:.4f}%")

Percentage of total cluster power lost: 0.3386%


The total power lost percentage is 0.3386%. This confirms that the cluster's overall capacity is robust against the rare 'softerror' events observed in the dataset.