In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#a few lines in the raw file are irregular; on_bad_lines='warn' reports each one and leaves it out of the
#resulting frame instead of aborting the read
accre_gpu = pd.read_csv(
    '../data/accre-gpu-jobs-2022-v2.csv',
    on_bad_lines='warn',
)

In [None]:
#per our stakeholder, no need to include jobs with state of cancelled, pending, or running in our analysis, so dropping those 1258 rows
#na=False keeps the cancelled mask strictly boolean even if a STATE value is missing, and .copy() makes the
#filtered result an independent frame so the column assignments below do not raise SettingWithCopyWarning
not_pending_or_running = ~accre_gpu['STATE'].isin(['PENDING', 'RUNNING'])
not_cancelled = ~accre_gpu['STATE'].str.contains('CANCELLED', na=False)
accre_gpu = accre_gpu.loc[not_pending_or_running & not_cancelled].copy()

#stripping the 'M' from USEDMEM column and converting it to a numeric data type
accre_gpu['USEDMEM'] = accre_gpu['USEDMEM'].str.strip(to_strip='M').astype(float)

#converting REQTIME and USEDTIME columns to timedelta data types
#any '-' in the raw text is rewritten to ' days ' before parsing (presumably the raw format is D-HH:MM:SS -- confirm)
accre_gpu['REQTIME'] = pd.to_timedelta(accre_gpu['REQTIME'].str.replace('-', ' days ', regex=False))
accre_gpu['USEDTIME'] = pd.to_timedelta(accre_gpu['USEDTIME'].str.replace('-', ' days ', regex=False))

#adding in a column that gives the difference in the time requested and time used
accre_gpu['time_dif'] = accre_gpu['REQTIME'] - accre_gpu['USEDTIME']

#adding in columns that provide versions of the other time columns but in seconds
#.dt.total_seconds() always yields plain floats; .astype('timedelta64[s]') only did so before pandas 2.0 and
#now keeps a timedelta dtype, which breaks the numeric arithmetic and plotting done on these columns later
accre_gpu['REQTIME_s'] = accre_gpu['REQTIME'].dt.total_seconds()
accre_gpu['USEDTIME_s'] = accre_gpu['USEDTIME'].dt.total_seconds()
accre_gpu['time_dif_s'] = accre_gpu['REQTIME_s'] - accre_gpu['USEDTIME_s']

#renumbering the rows 0..n-1 now that the excluded states are gone
accre_gpu = accre_gpu.reset_index(drop=True)

accre_gpu

#### 4. What is the distribution of different groups and users accessing each partition? In each partition, who are the top users, and do they represent a majority of the runtime-weighted jobs on the partition?   
From the stakeholder: For determining top users, I think the calculation would be the sum of nGPUs * usedtime over all their jobs in the partition. It would also be interesting to see what users/groups seem to be using more memory.

In [None]:
#counting the distinct accounts each user appears under, largest counts first (top 25 shown)
#still waiting to hear back on whether a user name under several accounts is one individual,
#or whether every account/user combo should be treated as unique
accounts_per_user = accre_gpu.groupby('USER')['ACCOUNT'].nunique()
accounts_per_user.sort_values(ascending=False).head(25)

In [None]:
#runtime weighted GPU value: number of GPUs multiplied by the used time in seconds
accre_gpu['GPUS_RUNTIME'] = accre_gpu['GPUS'].mul(accre_gpu['USEDTIME_s'])

#working assumption: each user is analyzed within the context of a specific account,
#so the two identifiers are joined into a single account_user key
account, user = accre_gpu['ACCOUNT'], accre_gpu['USER']
accre_gpu['ACCOUNT_USER'] = account + '_' + user

accre_gpu

In [None]:
#job count (non-null JOBID values) for every partition/account/user combination, busiest first
#only jobs that survived the state filter above are counted
user_counts = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT', 'USER'])['JOBID']
    .count()
    .reset_index(name='job_count')
    .sort_values('job_count', ascending=False)
)
user_counts

Almost 40% of all jobs in this data set came from one user in one partition.

In [None]:
#distribution of users by total runtime weighted GPUs
#one row per partition/account_user with that user's summed GPUS_RUNTIME
gpu_runtime_per_user = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT_USER'])['GPUS_RUNTIME']
    .sum()
    .reset_index()
    .sort_values('GPUS_RUNTIME', ascending=False)
)

sns.set(rc={"figure.figsize":(14, 10)})
sns.histplot(data=gpu_runtime_per_user, x='GPUS_RUNTIME', hue='PARTITION', bins=27);

In [None]:
#distribution of users by total memory
#one row per partition/account_user with that user's summed USEDMEM
used_mem_per_user = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT_USER'])['USEDMEM']
    .sum()
    .reset_index()
    .sort_values('USEDMEM', ascending=False)
)

sns.set(rc={"figure.figsize":(14, 10)})
sns.histplot(data=used_mem_per_user, x='USEDMEM', hue='PARTITION', bins=41);

In [None]:
#plotting the runtime weighted GPUs by partition and account
#summed GPUS_RUNTIME per partition/account, largest totals first so the bars are drawn in that order
gpu_runtime_per_account = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT'])['GPUS_RUNTIME']
    .sum()
    .reset_index()
    .sort_values('GPUS_RUNTIME', ascending=False)
)

sns.set(rc={"figure.figsize":(10, 8)})
sns.barplot(data=gpu_runtime_per_account, x='GPUS_RUNTIME', y='ACCOUNT', hue='PARTITION');

In [None]:
#plotting runtime weighted GPUS by partition and user
#not yet sure if users can be under multiple accounts, so the combined account_user key is used
#only the 60 largest partition/account_user totals are drawn
top_gpu_runtime_users = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT_USER'])['GPUS_RUNTIME']
    .sum()
    .reset_index()
    .sort_values('GPUS_RUNTIME', ascending=False)
    .head(60)
)

sns.set(rc={"figure.figsize":(10,20)})
sns.barplot(data=top_gpu_runtime_users, x='GPUS_RUNTIME', y='ACCOUNT_USER', hue='PARTITION');

In [None]:
#plotting total memory usage by partition and account
#summed USEDMEM per partition/account, largest totals first
used_mem_per_account = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT'])['USEDMEM']
    .sum()
    .reset_index()
    .sort_values('USEDMEM', ascending=False)
)

sns.set(rc={"figure.figsize":(10, 8)})
sns.barplot(data=used_mem_per_account, x='USEDMEM', y='ACCOUNT', hue='PARTITION');

In [None]:
#plotting total memory usage by partition and account_user
#only the 60 largest partition/account_user totals are drawn
top_mem_users = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT_USER'])['USEDMEM']
    .sum()
    .reset_index()
    .sort_values('USEDMEM', ascending=False)
    .head(60)
)

sns.set(rc={"figure.figsize":(10, 16)})
sns.barplot(data=top_mem_users, x='USEDMEM', y='ACCOUNT_USER', hue='PARTITION');

In [None]:
import plotly.express as px
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
#one row per partition/account/account_user holding that user's totals across every row of accre_gpu
user_grouping = accre_gpu.groupby(['PARTITION', 'ACCOUNT', 'ACCOUNT_USER'])

user_details = user_grouping.agg(
    sum_GPUS=pd.NamedAgg(column='GPUS', aggfunc='sum'),
    sum_MEM=pd.NamedAgg(column='USEDMEM', aggfunc='sum'),
    sum_UTIME=pd.NamedAgg(column='USEDTIME', aggfunc='sum'),
    sum_RTIME=pd.NamedAgg(column='REQTIME', aggfunc='sum'),
    sum_time_dif=pd.NamedAgg(column='time_dif', aggfunc='sum'),
    sum_GPUS_RUNTIME=pd.NamedAgg(column='GPUS_RUNTIME', aggfunc='sum'),
    #count of non-null JOBID values, i.e. number of rows for the user
    job_count=pd.NamedAgg(column='JOBID', aggfunc='count'),
).reset_index()

user_details

In [None]:
#summary statistics of USEDTIME per partition, restricted to rows whose EXITCODE is '0:0'
#(presumably the jobs that finished without error -- confirm with stakeholder)
exit_zero_rows = accre_gpu.loc[accre_gpu['EXITCODE'] == '0:0']
exit_zero_rows.groupby('PARTITION')['USEDTIME'].describe()

In [None]:
#rows whose JOBID contains an underscore (presumably array-job tasks of the form <job>_<index> -- confirm)
jobid_has_underscore = accre_gpu['JOBID'].str.contains('_')
accre_gpu.loc[jobid_has_underscore]

In [None]:
#splitting JOBID at its first underscore: the part before it becomes job_id, the remainder becomes array_id
#(array_id is missing for JOBIDs without an underscore)
jobid_parts = accre_gpu['JOBID'].str.split("_", n=1, expand=True)
accre_gpu['job_id'] = jobid_parts[0]
accre_gpu['array_id'] = jobid_parts[1]

#first 60 underscore-containing rows, ordered by JOBID, to eyeball the split
has_underscore = accre_gpu['JOBID'].str.contains('_')
accre_gpu.loc[has_underscore].sort_values('JOBID').head(60)

In [None]:
#broadcasting per-job totals back onto every row that shares a job_id
#new column -> (source column, aggregation applied within each job_id)
job_level_totals = {
    'total_job_time': ('USEDTIME', 'sum'),
    'total_array_parts': ('array_id', 'count'),   #counts non-null array_id values only
    'total_job_mem': ('USEDMEM', 'sum'),
}
for total_col, (source_col, how) in job_level_totals.items():
    accre_gpu[total_col] = accre_gpu.groupby(['job_id'])[source_col].transform(how)

#first 60 underscore-containing rows, ordered by JOBID, to check the new totals
underscore_rows = accre_gpu.loc[accre_gpu['JOBID'].str.contains('_')]
underscore_rows.sort_values('JOBID').head(60)

In [None]:
#number of rows sharing each job_id, showing the 60 most frequent ids
rows_per_job_id = accre_gpu['job_id'].value_counts()
rows_per_job_id.head(60)

In [None]:
#collapsing to one row per job_id (the first occurrence is the one kept) and removing the
#row-level columns; the job-level total_* columns computed earlier remain
row_level_columns = [
    'time_dif',
    'USEDMEM',
    'USEDTIME',
    'USEDTIME_s',
    'time_dif_s',
    'GPUS_RUNTIME',
]
accre_by_job = (
    accre_gpu
    .drop_duplicates(subset=['job_id'])
    .drop(columns=row_level_columns)
)

In [None]:
#adding in a column that gives the difference in the time requested and time used
#NOTE(review): REQTIME here is the value from the single row kept per job_id, while total_job_time sums
#USEDTIME over every row of that job_id -- confirm this is the intended comparison for multi-row jobs
accre_by_job['job_time_dif'] = accre_by_job['REQTIME'] - accre_by_job['total_job_time']

#adding in columns that provide versions of the other time columns but in seconds
#.dt.total_seconds() always yields plain floats; .astype('timedelta64[s]') only did so before pandas 2.0 and
#now keeps a timedelta dtype, which breaks the numeric arithmetic and plotting done on these columns later
accre_by_job['total_job_time_s'] = accre_by_job['total_job_time'].dt.total_seconds()
accre_by_job['time_dif_s'] = accre_by_job['REQTIME_s'] - accre_by_job['total_job_time_s']

#adding stakeholder requested gpu runtime (number of GPUs multiplied by used seconds)
accre_by_job['GPUS_RUNTIME'] = accre_by_job['GPUS'] * accre_by_job['total_job_time_s']

accre_by_job

In [None]:
#summary statistics for job-level rows on the maxwell partition with EXITCODE '0:0'
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'maxwell')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the maxwell partition with EXITCODE '0:0', excluding the glasshouse account
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'maxwell')
                 &
                 (accre_by_job['ACCOUNT'] != 'glasshouse')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the maxwell partition with EXITCODE '0:0', glasshouse account only
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'maxwell')
                 &
                 (accre_by_job['ACCOUNT'] == 'glasshouse')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the pascal partition with EXITCODE '0:0'
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'pascal')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the pascal partition with EXITCODE '0:0', excluding the glasshouse account
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'pascal')
                 &
                 (accre_by_job['ACCOUNT'] != 'glasshouse')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the turing partition with EXITCODE '0:0'
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'turing')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#summary statistics for job-level rows on the turing partition with EXITCODE '0:0', excluding the glasshouse account
#the mask is built from accre_by_job itself so it matches the filtered frame row-for-row
#instead of relying on index alignment with the larger accre_gpu frame
accre_by_job.loc[(accre_by_job['PARTITION'] == 'turing')
                 &
                 (accre_by_job['ACCOUNT'] != 'glasshouse')
                 &
                 (accre_by_job['EXITCODE'] == '0:0')].describe()

In [None]:
#per-user totals built from the one-row-per-job frame
#output column -> (source column in accre_by_job, aggregation)
job_level_aggregations = {
    'total_GPUS': ('GPUS', 'sum'),
    'total_MEM': ('total_job_mem', 'sum'),
    'total_UTIME': ('total_job_time', 'sum'),
    'total_usedtime_s': ('total_job_time_s', 'sum'),
    'total_RTIME': ('REQTIME', 'sum'),
    'total_reqtime_s': ('REQTIME_s', 'sum'),
    'total_time_dif': ('job_time_dif', 'sum'),
    'total_GPUS_RUNTIME': ('GPUS_RUNTIME', 'sum'),
    'total_jobs': ('job_id', 'count'),
}

user_det_by_job = (
    accre_by_job
    .groupby(['PARTITION', 'ACCOUNT', 'ACCOUNT_USER'])
    .agg(**job_level_aggregations)
    .reset_index()
    #minute versions of the two second totals, rounded to 2 decimals
    .assign(
        total_usedtime_m=lambda totals: (totals['total_usedtime_s'] / 60).round(2),
        total_reqtime_m=lambda totals: (totals['total_reqtime_s'] / 60).round(2),
    )
)
user_det_by_job

In [None]:
#total memory against total runtime weighted GPUs for each partition/account/user row of user_details;
#marker size follows the job count and both axes get a marginal histogram
hover_columns = ["ACCOUNT", "ACCOUNT_USER", "sum_GPUS", "sum_UTIME", "job_count"]

fig = px.scatter(
    data_frame=user_details,
    x="sum_GPUS_RUNTIME",
    y="sum_MEM",
    color="PARTITION",
    size="job_count",
    hover_data=hover_columns,
    marginal_x="histogram",
    marginal_y="histogram",
    template="simple_white",
    height=700,
)

fig.show()

In [None]:
#histogram of each user's total memory (row-level aggregation in user_details), coloured by partition
mem_hist_labels = {
    "sum_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
    "sum_MEM" : "Total Memory Used (MB)",
    "PARTITION" : "Partition",
}

fig = px.histogram(
    data_frame=user_details,
    x="sum_MEM",
    color="PARTITION",
    labels=mem_hist_labels,
    height=750,
)

fig.show()

In [None]:
#histogram of each user's total memory from the job-level aggregation, coloured by partition
#user_det_by_job names its columns total_* (not sum_* like user_details), so both x and the
#label keys must use the total_* names; the previous x="sum_MEM" is not a column of this frame
fig =  px.histogram(
    user_det_by_job,
    x="total_MEM",
    color="PARTITION",
    labels={
                      "total_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
                      "total_MEM" : "Total Memory Used (MB)",
                      "PARTITION" : "Partition"},
    height=750
)

fig.show()

In [None]:
#histogram of each user's total runtime weighted GPUs (row-level aggregation in user_details), coloured by partition
gpu_hist_labels = {
    "sum_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
    "sum_GPUS" : "Total GPUs Requested",
    "PARTITION" : "Partition",
}

fig = px.histogram(
    data_frame=user_details,
    x="sum_GPUS_RUNTIME",
    color="PARTITION",
    labels=gpu_hist_labels,
    height=750,
)

fig.show()

In [None]:
#histogram of each user's total runtime weighted GPUs from the job-level aggregation, coloured by partition
#user_det_by_job names its columns total_* (not sum_* like user_details), so both x and the
#label keys must use the total_* names; the previous x="sum_GPUS_RUNTIME" is not a column of this frame
fig =  px.histogram(
    user_det_by_job,
    x="total_GPUS_RUNTIME",
    color="PARTITION",
    labels={
                      "total_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
                      "total_GPUS" : "Total GPUs Requested",
                      "PARTITION" : "Partition"
    },
    height=750
)

fig.show()

In [None]:
#scatter of total memory against total runtime weighted GPUs per user (row-level aggregation)
#an ipywidgets @interact wrapper keyed on partition was sketched here but is not wired up yet
df = user_details

#display names for the columns that can show up on axes, legend or hover
scatter_labels = {
    "sum_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
    "sum_MEM" : "Total Memory Used (MB)",
    "job_count" : "Total Jobs",
    "sum_UTIME" : "Total Time Used",
    "sum_GPUS" : "Total GPUs Requested",
    "PARTITION" : "Partition",
}

fig = px.scatter(
    data_frame=df,
    x="sum_GPUS_RUNTIME",
    y="sum_MEM",
    color="PARTITION",
    symbol='ACCOUNT',
    size="job_count",
    hover_name="ACCOUNT_USER",
    labels=scatter_labels,
    width=800,
    height=800,
)

#TODO: "Title" is still a placeholder; replace it before sharing the figure
(
    fig
    .update_layout(title_text="Title", title_xanchor="auto", title_font_size=14)
    .update_xaxes(title_font_size=12)
    .update_yaxes(title_font_size=12)
)

fig.show()

In [None]:
#dropping 27 users based on the following parameters (all four must hold for a user to be dropped):
#fewer than 6 jobs, under 30 minutes of total used time, under 1000 MB of total memory,
#and fewer than 5 GPUs per job on average
#the time threshold is an explicit pd.Timedelta rather than a bare string, so the comparison
#does not depend on pandas parsing '00:30:00' implicitly
low_activity_users = (
    (user_det_by_job['total_jobs'] < 6)
    &
    (user_det_by_job['total_UTIME'] < pd.Timedelta(minutes=30))
    &
    (user_det_by_job['total_MEM'] < 1000)
    &
    ((user_det_by_job['total_GPUS'])/(user_det_by_job['total_jobs']) < 5)
)
df = user_det_by_job.loc[~low_activity_users]

#one scatter panel per partition: total memory against total runtime weighted GPUs,
#coloured by account and sized by the number of jobs
fig1 =  px.scatter(df,
                  x="total_GPUS_RUNTIME",
                  y="total_MEM",
                  color= 'ACCOUNT',
                  size='total_jobs',
                  facet_row = "PARTITION",
                  color_discrete_sequence=px.colors.qualitative.Dark24,
                  height=1200,
                  width=800,
                  labels={
                      "total_GPUS_RUNTIME" : "Total Runtime Weighted GPUs",
                      "total_MEM" : "Total Memory Used (MB)",
                      "total_jobs" : "Total Jobs",
                      "total_UTIME" : "Total Time Used",
                      "total_GPUS" : "Total GPUs Requested",
                      "PARTITION" : "Partition"
                  },
                  hover_name="ACCOUNT_USER",
                 )

#descriptive title in place of the former "Title" placeholder, since this figure is exported to HTML
fig1.update_layout(title_text="Total Memory Used vs. Total Runtime Weighted GPUs by Account User and Partition",
                  title_xanchor="auto",
                  title_font_size=14
                 )

fig1.update_xaxes(title_font_size=12)

fig1.update_yaxes(title_font_size=12)

fig1.show()


In [None]:
#saving the faceted memory vs. runtime weighted GPU scatter (fig1) as an HTML file;
#the relative path resolves against the notebook's working directory
fig1.write_html("mem_rtgpu.html")