In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# The CSV contains some irregular lines; on_bad_lines='warn' emits a warning for each
# one and skips it instead of aborting the read.
accre_gpu = pd.read_csv(
    '../data/accre-gpu-jobs-2022-v2.csv',
    on_bad_lines='warn',
)

In [None]:
# Per our stakeholder, jobs with a state of cancelled, pending, or running are not
# part of the analysis, so those rows (1258 in this data set) are dropped.
# .copy() makes the filtered frame an independent object, so the column assignments
# below do not operate on a slice of the raw frame (avoids SettingWithCopyWarning).
accre_gpu = accre_gpu.loc[(~accre_gpu['STATE'].isin(['PENDING', 'RUNNING']))
                          &
                          (~accre_gpu['STATE'].str.contains('CANCELLED'))].copy()

# stripping the 'M' from USEDMEM column and converting it to a numeric data type
accre_gpu['USEDMEM'] = accre_gpu['USEDMEM'].str.strip(to_strip='M').astype(float)

# converting REQTIME and USEDTIME columns to timedelta data types; the '-' in the raw
# strings is replaced by ' days ' so that pd.to_timedelta can parse them
accre_gpu['REQTIME'] = pd.to_timedelta(accre_gpu['REQTIME'].str.replace('-', ' days ', regex=False))
accre_gpu['USEDTIME'] = pd.to_timedelta(accre_gpu['USEDTIME'].str.replace('-', ' days ', regex=False))

# adding in a column that gives the difference in the time requested and time used
accre_gpu['time_dif'] = accre_gpu['REQTIME'] - accre_gpu['USEDTIME']

# adding in columns that provide the time columns as plain floats in SECONDS.
# .dt.total_seconds() is used instead of .astype('timedelta64[s]') because the latter
# returns a timedelta (not a float) on pandas >= 2.0, which breaks later arithmetic.
accre_gpu['REQTIME_s'] = accre_gpu['REQTIME'].dt.total_seconds()
accre_gpu['USEDTIME_s'] = accre_gpu['USEDTIME'].dt.total_seconds()
accre_gpu['time_dif_s'] = accre_gpu['REQTIME_s'] - accre_gpu['USEDTIME_s']

# renumber the rows 0..n-1 after the filtering above
accre_gpu = accre_gpu.reset_index(drop=True)

accre_gpu

#### 4. What is the distribution of different groups and users accessing each partition? In each partition, who are the top users, and do they represent a majority of the runtime-weighted jobs on the partition?   
From the stakeholder: For determining top users, I think the calculation would be the sum of nGPUs * usedtime over all their jobs in the partition. It would also be interesting to see what users/groups seem to be using more memory.

In [None]:
# creating the RUNTIME weighted GPU value: GPUs * used time in seconds / 3600.
# The seconds are taken from the USEDTIME timedelta via .dt.total_seconds() so the
# result is always a plain float, whatever dtype the USEDTIME_s helper column has
# (on pandas >= 2.0, .astype('timedelta64[s]') yields a timedelta, not a float).
accre_gpu['GPUS_RUNTIME_h'] = accre_gpu['GPUS'] * accre_gpu['USEDTIME'].dt.total_seconds() / 3600

# we want to analyze each user within the context of a specific account, creating a
# combined "<ACCOUNT>_<USER>" column
accre_gpu['ACCOUNT_USER'] = accre_gpu['ACCOUNT'] + '_' + accre_gpu['USER']

accre_gpu

In [None]:
# Job count per (PARTITION, ACCOUNT_USER) pair, largest first. Only completed, failed,
# or out-of-memory style states remain after the filtering done when cleaning.
(
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT_USER'])['JOBID']
    .count()
    .rename('job_count')
    .reset_index()
    .sort_values('job_count', ascending=False)
)

Almost 40% of all jobs in this data set came from one user in one partition.

In [None]:
# Runtime-weighted GPU usage summed per (PARTITION, ACCOUNT), sorted largest first,
# then drawn as bars: x = GPUS_RUNTIME_h, y = ACCOUNT, one hue per PARTITION.
sns.set(rc={"figure.figsize": (10, 8)})
gpu_runtime_by_account = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT'])['GPUS_RUNTIME_h']
    .sum()
    .reset_index()
    .sort_values('GPUS_RUNTIME_h', ascending=False)
)
sns.barplot(data=gpu_runtime_by_account, x='GPUS_RUNTIME_h', y='ACCOUNT', hue='PARTITION');

In [None]:
# Total memory usage (USEDMEM) summed per (PARTITION, ACCOUNT), sorted largest first,
# then drawn as bars: x = USEDMEM, y = ACCOUNT, one hue per PARTITION.
sns.set(rc={"figure.figsize": (10, 8)})
usedmem_by_account = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT'])['USEDMEM']
    .sum()
    .reset_index()
    .sort_values('USEDMEM', ascending=False)
)
sns.barplot(data=usedmem_by_account, x='USEDMEM', y='ACCOUNT', hue='PARTITION');

In [None]:
# creating a dataframe of aggregated values for each user: one row per
# (PARTITION, ACCOUNT, USER, ACCOUNT_USER) combination
user_details = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT', 'USER', 'ACCOUNT_USER'])
    .agg(
        sum_MEM = ('USEDMEM','sum'),
        sum_UTIME = ('USEDTIME', 'sum'),
        sum_RTIME = ('REQTIME', 'sum'),
        sum_time_dif = ('time_dif', 'sum'),
        sum_GPUS_RUNTIME_h = ('GPUS_RUNTIME_h', 'sum'),
        job_count = ('JOBID', 'count')
    )
    .reset_index()
)

# average used time per job and average (requested - used) time per job, both as
# plain floats in MINUTES. .dt.total_seconds() is used rather than
# .astype('timedelta64[s]') because the latter returns a timedelta on pandas >= 2.0,
# which would make these columns timedeltas and break numeric comparisons on them.
user_details['time_per_job_m'] = (user_details['sum_UTIME'].dt.total_seconds() / 60) / user_details['job_count']
user_details['time_dif_per_job_m'] = (user_details['sum_time_dif'].dt.total_seconds() / 60) / user_details['job_count']

user_details

#### Looking at the top 5 users for Maxwell and the top 10 for the other two partitions, by different measures

In [None]:
# number of user rows in user_details for each partition, printed in the order
# maxwell, pascal, turing
for partition_name in ('maxwell', 'pascal', 'turing'):
    print(len(user_details[user_details['PARTITION'] == partition_name]))

In [None]:
# the 5 maxwell rows of user_details with the largest total memory (sum_MEM)
m_m = (
    user_details[user_details['PARTITION'].eq('maxwell')]
    .nlargest(n=5, columns='sum_MEM')
    .reset_index(drop=True)
)

In [None]:
# the 5 maxwell rows of user_details with the largest runtime-weighted GPU usage
m_gr = (
    user_details[user_details['PARTITION'].eq('maxwell')]
    .nlargest(n=5, columns='sum_GPUS_RUNTIME_h')
    .reset_index(drop=True)
)

In [None]:
# the 5 maxwell rows of user_details with the largest job counts
m_jc = (
    user_details[user_details['PARTITION'].eq('maxwell')]
    .nlargest(n=5, columns='job_count')
    .reset_index(drop=True)
)

In [None]:
# the 10 pascal rows of user_details with the largest total memory (sum_MEM)
p_m = (
    user_details[user_details['PARTITION'].eq('pascal')]
    .nlargest(n=10, columns='sum_MEM')
    .reset_index(drop=True)
)

In [None]:
# the 10 turing rows of user_details with the largest total memory (sum_MEM)
t_m = (
    user_details[user_details['PARTITION'].eq('turing')]
    .nlargest(n=10, columns='sum_MEM')
    .reset_index(drop=True)
)

In [None]:
# the 10 pascal rows of user_details with the largest runtime-weighted GPU usage
p_gr = (
    user_details[user_details['PARTITION'].eq('pascal')]
    .nlargest(n=10, columns='sum_GPUS_RUNTIME_h')
    .reset_index(drop=True)
)

In [None]:
# the 10 turing rows of user_details with the largest runtime-weighted GPU usage
t_gr = (
    user_details[user_details['PARTITION'].eq('turing')]
    .nlargest(n=10, columns='sum_GPUS_RUNTIME_h')
    .reset_index(drop=True)
)

In [None]:
# the 10 pascal rows of user_details with the largest job counts
p_jc = (
    user_details[user_details['PARTITION'].eq('pascal')]
    .nlargest(n=10, columns='job_count')
    .reset_index(drop=True)
)

In [None]:
# the 10 turing rows of user_details with the largest job counts
t_jc = (
    user_details[user_details['PARTITION'].eq('turing')]
    .nlargest(n=10, columns='job_count')
    .reset_index(drop=True)
)

In [None]:
# stack the per-partition top-memory tables (maxwell, pascal, turing) into a single
# frame, remove any fully duplicated rows, and renumber the rows
dfs = [m_m, p_m, t_m]
tops_m = (
    pd.concat(dfs)
    .drop_duplicates()
    .reset_index(drop=True)
)
tops_m

In [None]:
# stack the per-partition top GPU-runtime tables (maxwell, pascal, turing) into a
# single frame, remove any fully duplicated rows, and renumber the rows
dfs = [m_gr, p_gr, t_gr]
tops_gr = (
    pd.concat(dfs)
    .drop_duplicates()
    .reset_index(drop=True)
)
tops_gr

#### Portion of the variable of interest accounted for by the top users in that partition

In [None]:
# fraction of maxwell's total GPUS_RUNTIME_h that comes from the users in m_gr
(
    m_gr['sum_GPUS_RUNTIME_h'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('maxwell'), 'GPUS_RUNTIME_h'].sum()
)

In [None]:
# fraction of pascal's total GPUS_RUNTIME_h that comes from the users in p_gr
(
    p_gr['sum_GPUS_RUNTIME_h'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('pascal'), 'GPUS_RUNTIME_h'].sum()
)

In [None]:
# fraction of turing's total GPUS_RUNTIME_h that comes from the users in t_gr
(
    t_gr['sum_GPUS_RUNTIME_h'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('turing'), 'GPUS_RUNTIME_h'].sum()
)

In [None]:
# fraction of maxwell's total USEDMEM that comes from the users in m_m
(
    m_m['sum_MEM'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('maxwell'), 'USEDMEM'].sum()
)

In [None]:
# fraction of pascal's total USEDMEM that comes from the users in p_m
(
    p_m['sum_MEM'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('pascal'), 'USEDMEM'].sum()
)

In [None]:
# fraction of turing's total USEDMEM that comes from the users in t_m
(
    t_m['sum_MEM'].sum()
    / accre_gpu.loc[accre_gpu['PARTITION'].eq('turing'), 'USEDMEM'].sum()
)

Looking at distribution over several variables by partition

In [None]:
# USEDTIME summary statistics per partition, restricted to jobs whose EXITCODE is
# '0:0'; the std and min columns are removed from the resulting table
(
    accre_gpu[accre_gpu['EXITCODE'].eq('0:0')]
    .groupby('PARTITION')['USEDTIME']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    .drop(columns=['std', 'min'])
)

In [None]:
# USEDMEM summary statistics per partition, restricted to jobs whose EXITCODE is '0:0'
(
    accre_gpu[accre_gpu['EXITCODE'].eq('0:0')]
    .groupby('PARTITION')['USEDMEM']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)

In [None]:
# GPUS summary statistics per partition, restricted to jobs whose EXITCODE is '0:0'
(
    accre_gpu[accre_gpu['EXITCODE'].eq('0:0')]
    .groupby('PARTITION')['GPUS']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)

In [None]:
# time_dif (requested minus used time) summary statistics per partition, restricted
# to jobs whose EXITCODE is '0:0'
(
    accre_gpu[accre_gpu['EXITCODE'].eq('0:0')]
    .groupby('PARTITION')['time_dif']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)

In [None]:
import plotly.express as px
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
# Bar chart of the top memory users (tops_m): total memory on x, user on y,
# bar colour = partition, bar fill pattern = account.
df = tops_m.sort_values(by=['ACCOUNT'], ascending=False)
figtm = px.bar(
    df,
    x='sum_MEM',
    y='USER',
    color='PARTITION',
    pattern_shape="ACCOUNT",
    pattern_shape_map={
        "virginia": "x", "harovin": "/", "glasshouse": ".",
        "prince": "\\", "grisette": "x", "antares": "",
        "plumcot": "+", "orange": "/"
    },
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=800,
    labels={
        "sum_GPUS_RUNTIME_h": "Total Runtime over all GPUs(h)",
        "sum_MEM": "Total Memory Used (MB)",
        "job_count": "Total Jobs",
        "sum_UTIME": "Total Time Used",
        "PARTITION": "Partition",
        "USER": "User name",
        "ACCOUNT": "Account",
        # key matches the actual user_details column name (was "time_per_job")
        "time_per_job_m": "Average job time"
    },
    # One entry per column. The original listed sum_GPUS_RUNTIME_h twice, and the
    # first value (True) was silently overwritten by the format string.
    hover_data={
        "sum_GPUS_RUNTIME_h": ":.6s",
        "job_count": True,
        "sum_MEM": ":,.2f"
    },
    color_discrete_map={
        "maxwell": "#1CA71C",
        "pascal": "#E15F99",
        "turing": "#2E91E5"
    },
)
figtm.update_layout(
    title_text="Top Users by Total Memory Usage",
    title_xanchor="auto",
    title_font_size=18,
    # order the user axis by category name, descending
    yaxis={'categoryorder': 'category descending'},
)

figtm.show()

In [None]:
#figtm.write_html("top_mem.html")

In [None]:
# Bar chart of the top GPU-runtime users (tops_gr): runtime-weighted GPU usage on x,
# user on y, bar colour = partition, bar fill pattern = account.
df = tops_gr.sort_values(by=['ACCOUNT'], ascending=False)
figtgr = px.bar(
    df,
    x='sum_GPUS_RUNTIME_h',
    y='USER',
    color='PARTITION',
    pattern_shape="ACCOUNT",
    pattern_shape_map={
        "virginia": "x", "harovin": "/", "glasshouse": ".",
        "prince": "\\", "grisette": "x", "antares": "",
        "plumcot": "+", "orange": "/"
    },
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=800,
    labels={
        "sum_GPUS_RUNTIME_h": "Total Runtime over all GPUs(h)",
        "sum_MEM": "Total Memory Used (MB)",
        "job_count": "Total Jobs",
        "sum_UTIME": "Total Time Used",
        "PARTITION": "Partition",
        "USER": "User name",
        "ACCOUNT": "Account",
        # key matches the actual user_details column name (was "time_per_job")
        "time_per_job_m": "Average job time"
    },
    # One entry per column. The original listed sum_MEM twice, and the first value
    # (True) was silently overwritten by the format string.
    hover_data={
        "job_count": True,
        "sum_MEM": ":,.2f"
    },
    color_discrete_map={
        "maxwell": "#1CA71C",
        "pascal": "#E15F99",
        "turing": "#2E91E5"
    },
)
figtgr.update_layout(
    title_text="Top Users by Total Runtime over all GPUs",
    title_xanchor="auto",
    title_font_size=18,
    # order the user axis by category name, descending
    yaxis={'categoryorder': 'category descending'},
)

figtgr.show()

In [None]:
#figtgr.write_html("top_gpuruntime.html")

In [None]:
# Scatter of every row in user_details: runtime-weighted GPU usage on x against
# total memory on y; marker colour = partition, marker symbol = account,
# marker size = number of jobs.
df = user_details

fig = px.scatter(
    df,
    x="sum_GPUS_RUNTIME_h",
    y="sum_MEM",
    color="PARTITION",
    size="job_count",
    symbol='ACCOUNT',
    height=800,
    width=800,
    # the label for the non-existent column "sum_GPUS" was dropped
    labels={
        "sum_GPUS_RUNTIME_h": "Total Runtime over all GPUs(h)",
        "sum_MEM": "Total Memory Used (MB)",
        "job_count": "Total Jobs",
        "sum_UTIME": "Total Time Used",
        "PARTITION": "Partition",
        "USER": "User name",
        "ACCOUNT": "Account"
    },
    hover_name="ACCOUNT_USER",
    # One entry per column. The original listed sum_MEM twice, and the first value
    # (True) was silently overwritten by the format string.
    hover_data={
        "job_count": True,
        "sum_MEM": ":,.2f"
    },
)

# a descriptive title replaces the placeholder text "Title"
fig.update_layout(
    title_text="Total Memory Used vs. Total Runtime over all GPUs, by User",
    title_xanchor="auto",
    title_font_size=14,
)

fig.update_xaxes(title_font_size=12)

fig.update_yaxes(title_font_size=12)

fig.show()

In [None]:
# Dropping the users (28 in this data set) that meet ALL of the following: fewer
# than 6 jobs, less than 30 minutes of total used time, and sum_MEM below 1000.
# The time threshold is an explicit pd.Timedelta rather than the bare string
# '00:30:00', so the comparison does not rely on implicit string parsing.
df = user_details.loc[~((user_details['job_count'] < 6)
                        &
                        (user_details['sum_UTIME'] < pd.Timedelta(minutes=30))
                        &
                        (user_details['sum_MEM'] < 1000)
                        )]

# One scatter panel per partition (facet rows): runtime-weighted GPU usage on x
# against total memory on y; marker colour = account, marker size = number of jobs.
fig1 = px.scatter(
    df,
    x="sum_GPUS_RUNTIME_h",
    y="sum_MEM",
    color='ACCOUNT',
    size='job_count',
    facet_row="PARTITION",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    size_max=50,
    height=1200,
    width=800,
    labels={
        "sum_GPUS_RUNTIME_h": "Total Runtime over all GPUs(h)",
        "sum_MEM": "Total Memory Used (MB)",
        "job_count": "Total Jobs",
        "sum_UTIME": "Total Time Used",
        "PARTITION": "Partition",
        "ACCOUNT": "Account",
        "time_per_job_m": "Average time per job (minutes)"
    },
    hover_name="USER",
    # One entry per column. The original listed sum_GPUS_RUNTIME_h and
    # time_per_job_m twice each, and the first value (True) was silently
    # overwritten by the format string.
    hover_data={
        "sum_GPUS_RUNTIME_h": ":.4s",
        "sum_MEM": ":,.1f",
        "time_per_job_m": ":.1f"
    },
)

fig1.update_layout(
    title_text="User Activity Across Partitions",
    title_xanchor="auto",
    title_font_size=18,
)

fig1.update_xaxes(title_font_size=12)

fig1.update_yaxes(title_font_size=12)

fig1.show()


In [None]:
#fig1.write_html("mem_rtgpu.html")

#### Looking at worst offenders for short run time

In [None]:
# users who submitted more than one job and whose average time per job is under
# 3 minutes (21 users in this data set)
user_details[
    user_details['time_per_job_m'].lt(3)
    & user_details['job_count'].gt(1)
]

In [None]:
# total number of jobs submitted by those short-runtime users (114657 jobs in this
# data set, the massive proportion of which are from glasshouse marnie)
user_details.loc[
    user_details['time_per_job_m'].lt(3) & user_details['job_count'].gt(1),
    'job_count',
].sum()

#### Looking at worst offenders for overestimating time needed

In [None]:
# the 10 users (with more than one job) who have the largest average gap between
# requested and used time per job; all of them overestimate by over 4 days on average
(
    user_details[user_details['job_count'].gt(1)]
    .nlargest(n=10, columns='time_dif_per_job_m')
)