In [91]:
import s3fs
import pandas as pd
import numpy as np

In [3]:
# Open S3 anonymously (the equivalent of --no-sign-request), so the bucket can be browsed for free
fs = s3fs.S3FileSystem(
    anon=True,
)

In [7]:
# Load the main slurm log straight from the S3 bucket.
# The read is explicitly anonymous (unsigned): the `fs` handle created earlier is not
# consulted by read_csv, so the anonymous setting has to be passed here as well.
df_slurm = pd.read_csv(
    's3://mit-supercloud-dataset/datacenter-challenge/202201/slurm-log.csv',
    storage_options={'anon': True},
)
print(df_slurm.shape)

(395914, 29)


In [95]:
# job IDs in the slurm log
scheduler_log_job_ids = df_slurm.id_job.unique()

# Boolean mask of GPU jobs: rows whose requested TRES list contains id 1001 or 1002.
# The id is matched as a whole key (start of string or right after a comma, followed by '=')
# so that it is found at position 0 and is never confused with digits inside another
# value such as '2=10010'.
# NOTE(review): assumes tres_req uses Slurm's comma-separated 'id=count' format - confirm.
gpu_idx = df_slurm.tres_req.astype(str).str.contains(r'(?:^|,)\s*(?:1001|1002)=', regex=True)
scheduler_log_job_ids_gpu = np.unique(df_slurm[gpu_idx].id_job.values)

print('There are {} jobs in the scheduler log, of which {} requested GPUs.'.format(
    scheduler_log_job_ids.shape[0], scheduler_log_job_ids_gpu.shape[0]))

There are 395914 jobs in the scheduler log, of which 98176 requested GPUs.


In [11]:
# List every column available in the scheduler log
print(df_slurm.keys())

Index(['id_job', 'id_array_job', 'id_array_task', 'id_user', 'kill_requid',
       'nodes_alloc', 'nodelist', 'cpus_req', 'derived_ec', 'exit_code',
       'gres_used', 'array_max_tasks', 'array_task_pending', 'constraints',
       'flags', 'mem_req', 'partition', 'priority', 'state', 'timelimit',
       'time_submit', 'time_eligible', 'time_start', 'time_end',
       'time_suspended', 'track_steps', 'tres_alloc', 'tres_req', 'job_type'],
      dtype='object')


In [13]:
# Number of jobs recorded under each numeric state code, most frequent first
df_slurm.loc[:, 'state'].value_counts()

state
3       309037
4        53980
5        21349
6         7089
11        4414
7           44
1024         1
Name: count, dtype: int64

In [25]:
# The ten most common requested CPU counts across the whole log
df_slurm.cpus_req.value_counts().iloc[:10]

cpus_req
1     170323
4      66540
8      41792
20     37808
10     14571
2      12822
40     11390
16      5538
5       5209
6       4787
Name: count, dtype: int64

In [23]:
# The ten most common allocated node counts across the whole log
df_slurm.loc[:, 'nodes_alloc'].value_counts().head(n=10)

nodes_alloc
1     389353
2       2870
4        982
3        904
8        623
6        279
5        251
7        221
16       114
9         48
Name: count, dtype: int64

In [27]:
# Create new columns for job duration (time_end - time_start, in seconds and in hours).
# A duration is only kept when the job has a recorded start and its end is not before
# that start; otherwise the subtraction would produce a huge or negative value that
# leaks into every "longer than X hours" subset. Such rows get NaN, which compares as
# False in the later `duration_hours > X` filters.
# NOTE(review): assumes time_start == 0 marks a job that never started and
# time_end == 0 a job without a recorded end - confirm against the dataset docs.
has_valid_times = (df_slurm['time_start'] > 0) & (df_slurm['time_end'] >= df_slurm['time_start'])
df_slurm['duration_seconds'] = (df_slurm['time_end'] - df_slurm['time_start']).where(has_valid_times)
df_slurm['duration_hours'] = df_slurm['duration_seconds'] / 3600  # 3600 seconds per hour
print(df_slurm[['time_start', 'time_end', 'duration_seconds', 'duration_hours']].head(3))

   time_start    time_end  duration_seconds  duration_hours
0  1609806297  1609806605               308        0.085556
1  1609806607  1609807004               397        0.110278
2  1609807004  1609807331               327        0.090833
3  1609807695  1609811546              3851        1.069722
4  1609808470  1609808777               307        0.085278


In [69]:
# Share of all jobs whose duration exceeds each threshold (1, 2, 4 and 8 hours)
job_hours = df_slurm['duration_hours']

percentage1 = 100 * job_hours.gt(1).mean()
print(f"{percentage1:.2f}% of jobs run for more than 1 hour.")

# The remaining thresholds share the plural wording, so they are handled in one loop
for hour_limit in (2, 4, 8):
    percentage = 100 * job_hours.gt(hour_limit).mean()
    print(f"{percentage:.2f}% of jobs run for more than {hour_limit} hours.")

43.96% of jobs run for more than 1 hour.
32.65% of jobs run for more than 2 hours.
23.99% of jobs run for more than 4 hours.
14.50% of jobs run for more than 8 hours.


In [39]:
# Tally the state codes of the jobs that ran for more than 1 hour
ran_over_1hr = df_slurm['duration_hours'] > 1
jobs_1hr = df_slurm.loc[ran_over_1hr, 'state'].value_counts()
jobs_1hr

state
3       128712
4        27044
5         8610
6         7032
11        2602
7           38
1024         1
Name: count, dtype: int64

In [57]:
# Outcome breakdown for jobs that ran longer than 1 hour.
# Counts are looked up by state code instead of by position, so the result no longer
# depends on how value_counts() happens to order the states.
STATE_COMPLETED = 3  # state reported below as "completed successfully"
STATE_CANCELLED = 4  # state reported below as "cancelled by user"

success = jobs_1hr.get(STATE_COMPLETED, 0)
cancelled = jobs_1hr.get(STATE_CANCELLED, 0)
total = jobs_1hr.sum()
# Every other state is counted as a failure.
# NOTE(review): this also includes the rare state code 1024 - confirm it is a failure.
failure = total - success - cancelled

fail_percent = (failure / total) * 100
cancelled_percent = (cancelled / total) * 100
success_percent = (success / total) * 100

print(f"{fail_percent:.2f}% of jobs that run for more than 1 hour fail. {failure} total failures")
print(f"{cancelled_percent:.2f}% of jobs that run for more than 1 hour are cancelled by user.")
print(f"{success_percent:.2f}% of jobs that run for more than 1 hour are completed successfully.")

10.51% of jobs that run for more than 1 hour fail. 18283 total failures
15.54% of jobs that run for more than 1 hour are cancelled by user.
73.96% of jobs that run for more than 1 hour are completed successfully.


In [41]:
# Tally the state codes of the jobs that ran for more than 2 hours
ran_over_2hr = df_slurm['duration_hours'].gt(2)
jobs_2hr = df_slurm.loc[ran_over_2hr, 'state'].value_counts()
jobs_2hr

state
3       91055
4       22644
6        7021
5        6253
11       2261
7          38
1024        1
Name: count, dtype: int64

In [55]:
# Outcome breakdown for jobs that ran longer than 2 hours.
# Counts are looked up by state code instead of by position, so the result no longer
# depends on how value_counts() happens to order the states.
STATE_COMPLETED = 3  # state reported below as "completed successfully"
STATE_CANCELLED = 4  # state reported below as "cancelled by user"

success = jobs_2hr.get(STATE_COMPLETED, 0)
cancelled = jobs_2hr.get(STATE_CANCELLED, 0)
total = jobs_2hr.sum()
# Every other state is counted as a failure.
# NOTE(review): this also includes the rare state code 1024 - confirm it is a failure.
failure = total - success - cancelled

fail_percent = (failure / total) * 100
cancelled_percent = (cancelled / total) * 100
success_percent = (success / total) * 100

print(f"{fail_percent:.2f}% of jobs that run for more than 2 hours fail. {failure} total failures")
print(f"{cancelled_percent:.2f}% of jobs that run for more than 2 hours are cancelled by user.")
print(f"{success_percent:.2f}% of jobs that run for more than 2 hours are completed successfully.")

12.05% of jobs that run for more than 2 hours fail. 15574 total failures
17.52% of jobs that run for more than 2 hours are cancelled by user.
70.44% of jobs that run for more than 2 hours are completed successfully.


In [61]:
# Tally the state codes of the jobs that ran for more than 4 hours
ran_over_4hr = df_slurm['duration_hours'] > 4
jobs_4hr = df_slurm.loc[ran_over_4hr, 'state'].value_counts()
jobs_4hr

state
3       62700
4       18894
6        7016
5        4809
11       1514
7          37
1024        1
Name: count, dtype: int64

In [63]:
# Outcome breakdown for jobs that ran longer than 4 hours.
# Counts are looked up by state code instead of by position, so the result no longer
# depends on how value_counts() happens to order the states.
STATE_COMPLETED = 3  # state reported below as "completed successfully"
STATE_CANCELLED = 4  # state reported below as "cancelled by user"

success = jobs_4hr.get(STATE_COMPLETED, 0)
cancelled = jobs_4hr.get(STATE_CANCELLED, 0)
total = jobs_4hr.sum()
# Every other state is counted as a failure.
# NOTE(review): this also includes the rare state code 1024 - confirm it is a failure.
failure = total - success - cancelled

fail_percent = (failure / total) * 100
cancelled_percent = (cancelled / total) * 100
success_percent = (success / total) * 100

print(f"{fail_percent:.2f}% of jobs that run for more than 4 hours fail. {failure} total failures")
print(f"{cancelled_percent:.2f}% of jobs that run for more than 4 hours are cancelled by user.")
print(f"{success_percent:.2f}% of jobs that run for more than 4 hours are completed successfully.")

14.09% of jobs that run for more than 4 hours fail. 13377 total failures
19.89% of jobs that run for more than 4 hours are cancelled by user.
66.02% of jobs that run for more than 4 hours are completed successfully.


In [65]:
# Tally the state codes of the jobs that ran for more than 8 hours
ran_over_8hr = df_slurm['duration_hours'].gt(8)
jobs_8hr = df_slurm.loc[ran_over_8hr, 'state'].value_counts()
jobs_8hr

state
3       32601
4       14439
6        6083
5        3259
11        996
7          37
1024        1
Name: count, dtype: int64

In [67]:
# Outcome breakdown for jobs that ran longer than 8 hours.
# Counts are looked up by state code instead of by position, so the result no longer
# depends on how value_counts() happens to order the states.
STATE_COMPLETED = 3  # state reported below as "completed successfully"
STATE_CANCELLED = 4  # state reported below as "cancelled by user"

success = jobs_8hr.get(STATE_COMPLETED, 0)
cancelled = jobs_8hr.get(STATE_CANCELLED, 0)
total = jobs_8hr.sum()
# Every other state is counted as a failure.
# NOTE(review): this also includes the rare state code 1024 - confirm it is a failure.
failure = total - success - cancelled

fail_percent = (failure / total) * 100
cancelled_percent = (cancelled / total) * 100
success_percent = (success / total) * 100

print(f"{fail_percent:.2f}% of jobs that run for more than 8 hours fail. {failure} total failures")
print(f"{cancelled_percent:.2f}% of jobs that run for more than 8 hours are cancelled by user.")
print(f"{success_percent:.2f}% of jobs that run for more than 8 hours are completed successfully.")

18.07% of jobs that run for more than 8 hours fail. 10376 total failures
25.15% of jobs that run for more than 8 hours are cancelled by user.
56.78% of jobs that run for more than 8 hours are completed successfully.


In [73]:
# Let's go with the >2 hour subset: keep only the rows whose duration exceeds 2 hours
longer_than_2hr = df_slurm['duration_hours'].gt(2)
jobs_2hr_df = df_slurm.loc[longer_than_2hr]

In [75]:
# The ten most common requested CPU counts within the >2 hour subset
jobs_2hr_df.loc[:, 'cpus_req'].value_counts().iloc[:10]

cpus_req
1     55354
4     19455
20    14820
8      7947
40     5350
10     5008
2      4489
21     3372
16     2540
5      2476
Name: count, dtype: int64

In [77]:
# The ten most common allocated node counts within the >2 hour subset
jobs_2hr_df.nodes_alloc.value_counts().head(n=10)

nodes_alloc
1     126532
2       1327
4        375
3        331
8        319
6        100
7         76
5         69
16        51
9         18
Name: count, dtype: int64

In [103]:
# Number of >2 hour jobs per job_type label
jobs_2hr_df.job_type.value_counts()

job_type
OTHER                96468
LLSUB:BATCH          20631
LLMAPREDUCE:MAP       8463
LLSUB:INTERACTIVE     3711
Name: count, dtype: int64

In [115]:
# Number of >2 hour jobs per value of the constraints column
jobs_2hr_df.loc[:, 'constraints'].value_counts()

constraints
xeon-g6         65829
\N              55870
xeon-g6&6248     7521
xeon-e5            34
opteron            18
opteron&6274        1
Name: count, dtype: int64

In [97]:
# job IDs in the 2hr subset
scheduler_log_job_ids = jobs_2hr_df.id_job.unique()

# Boolean mask of GPU jobs: rows whose requested TRES list contains id 1001 or 1002.
# The id is matched as a whole key (start of string or right after a comma, followed by '=')
# so that it is found at position 0 and is never confused with digits inside another
# value such as '2=10010'.
# NOTE(review): assumes tres_req uses Slurm's comma-separated 'id=count' format - confirm.
gpu_idx = jobs_2hr_df.tres_req.astype(str).str.contains(r'(?:^|,)\s*(?:1001|1002)=', regex=True)
scheduler_log_job_ids_gpu = np.unique(jobs_2hr_df[gpu_idx].id_job.values)

print('There are {} jobs in the 2 hr subset, of which {} requested GPUs.'.format(
    scheduler_log_job_ids.shape[0], scheduler_log_job_ids_gpu.shape[0]))

There are 129273 jobs in the 2 hr subset, of which 52025 requested GPUs.


In [99]:
# For comparison, build the >4 hour subset: rows whose duration exceeds 4 hours
longer_than_4hr = df_slurm['duration_hours'].gt(4)
jobs_4hr_df = df_slurm.loc[longer_than_4hr]

In [101]:
# job IDs in the 4hr subset
scheduler_log_job_ids = jobs_4hr_df.id_job.unique()

# Boolean mask of GPU jobs: rows whose requested TRES list contains id 1001 or 1002.
# The id is matched as a whole key (start of string or right after a comma, followed by '=')
# so that it is found at position 0 and is never confused with digits inside another
# value such as '2=10010'.
# NOTE(review): assumes tres_req uses Slurm's comma-separated 'id=count' format - confirm.
gpu_idx = jobs_4hr_df.tres_req.astype(str).str.contains(r'(?:^|,)\s*(?:1001|1002)=', regex=True)
scheduler_log_job_ids_gpu = np.unique(jobs_4hr_df[gpu_idx].id_job.values)

print('There are {} jobs in the 4 hr subset, of which {} requested GPUs.'.format(
    scheduler_log_job_ids.shape[0], scheduler_log_job_ids_gpu.shape[0]))

There are 94971 jobs in the 4 hr subset, of which 36763 requested GPUs.


In [None]:
# About 40.2% of jobs over 2 hours in duration requested GPUs (52025 of 129273), versus about 38.7% of jobs over 4 hours (36763 of 94971).
# Another reason to go with the 2 hr subset.