In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#the raw export contains a few malformed lines; on_bad_lines='warn' reports each one and leaves it out of the frame
accre_gpu = pd.read_csv(
    filepath_or_buffer='../data/accre-gpu-jobs-2022-v2.csv',
    on_bad_lines='warn',
)

In [None]:
#per our stakeholder, jobs whose state is pending, running, or any form of cancelled are out of scope for this
#analysis, so those rows are dropped (1258 rows per the original note)
#NOTE: this cell rewrites accre_gpu in place and expects the raw string columns, so re-run the load cell before
#re-running it
job_state = accre_gpu['STATE']
#na=False keeps rows whose STATE is missing instead of letting a NaN break the boolean mask
out_of_scope = job_state.isin(['PENDING', 'RUNNING']) | job_state.str.contains('CANCELLED', na=False)
#.copy() gives an independent frame so the column assignments below do not raise SettingWithCopyWarning
accre_gpu = accre_gpu.loc[~out_of_scope].copy()

#stripping the 'M' from USEDMEM column and converting it to a numeric data type
accre_gpu['USEDMEM'] = accre_gpu['USEDMEM'].str.strip(to_strip='M').astype(float)

#converting REQTIME and USEDTIME columns to timedelta data types
#the '-' separating the day count from the clock time is swapped for ' days ' so pd.to_timedelta can parse it
accre_gpu['REQTIME'] = pd.to_timedelta(accre_gpu['REQTIME'].str.replace('-', ' days ', regex=False))
accre_gpu['USEDTIME'] = pd.to_timedelta(accre_gpu['USEDTIME'].str.replace('-', ' days ', regex=False))

#adding in a column that gives the difference in the time requested and time used
accre_gpu['time_dif'] = accre_gpu['REQTIME'] - accre_gpu['USEDTIME']

#adding in columns that provide versions of the other time columns but in seconds (plain floats)
#.dt.total_seconds() is used rather than .astype('timedelta64[s]') because the latter only yields numbers on
#older pandas; newer pandas keeps a timedelta dtype, which breaks the numeric comparisons and products below
accre_gpu['REQTIME_s'] = accre_gpu['REQTIME'].dt.total_seconds()
accre_gpu['USEDTIME_s'] = accre_gpu['USEDTIME'].dt.total_seconds()
accre_gpu['time_dif_s'] = accre_gpu['REQTIME_s'] - accre_gpu['USEDTIME_s']

#creating the RUNTIME weighted GPU value (GPUs multiplied by seconds actually used)
accre_gpu['GPUS_RUNTIME'] = accre_gpu['GPUS'] * accre_gpu['USEDTIME_s']

#under the assumption that we want to analyze each user within the context of a specific account, creating a combined
#account_user column
accre_gpu['ACCOUNT_USER'] = accre_gpu['ACCOUNT'] + '_' + accre_gpu['USER']

accre_gpu = accre_gpu.reset_index(drop=True)

accre_gpu

#### 5. Currently there is a 5 day limit on runtime for GPU jobs, although some users have been asking for extensions. What is the distribution of requested runtime and actual runtime on jobs on each partition? Do users really need more time, or are they simply always requesting the maximum?

In [None]:
#one row per (partition, account, account_user) holding that user's totals across all of their jobs
user_group_keys = ['PARTITION', 'ACCOUNT', 'ACCOUNT_USER']
#output column name -> (source column, aggregation)
user_totals = {
    'sum_GPUS': ('GPUS', 'sum'),
    'sum_MEM': ('USEDMEM', 'sum'),
    'sum_UTIME': ('USEDTIME', 'sum'),
    'sum_RTIME': ('REQTIME', 'sum'),
    'sum_time_dif': ('time_dif', 'sum'),
    'sum_GPUS_RUNTIME': ('GPUS_RUNTIME', 'sum'),
}
user_details = accre_gpu.groupby(user_group_keys).agg(**user_totals).reset_index()
user_details

In [None]:
#plotting the total requested-minus-used time, in seconds, for each account, split by partition
#the seconds are derived from the time_dif timedelta column after summing, so the bar lengths are plain floats
#no matter how time_dif_s was produced
time_dif_by_account = (
    accre_gpu
    .groupby(['PARTITION', 'ACCOUNT'], as_index=False)['time_dif']
    .sum()
    .assign(time_dif_s=lambda totals: totals['time_dif'].dt.total_seconds())
    .sort_values('ACCOUNT')
)

sns.set(rc={"figure.figsize":(10, 8)})
ax = sns.barplot(
    x='time_dif_s',
    y='ACCOUNT',
    hue='PARTITION',
    data=time_dif_by_account,
)
ax.set(
    xlabel='Total requested minus used time (seconds)',
    ylabel='Account',
    title='Unused requested time by account and partition',
);

In [None]:
#number of jobs per account whose requested time was 5 days or more
requested_5_days = accre_gpu['REQTIME'] >= pd.Timedelta(days=5)
accre_gpu.loc[requested_5_days, 'ACCOUNT'].value_counts()

In [None]:
#number of jobs per account whose requested time was 2 days or more, listed in account order
requested_2_days = accre_gpu['REQTIME'] >= pd.Timedelta(days=2)
accre_gpu.loc[requested_2_days, 'ACCOUNT'].value_counts().sort_index()

In [None]:
#number of jobs per account that requested 2 days or more and actually ran for at least 4 hours,
#listed in account order
#NOTE(review): the earlier comment on this cell said "at least 1 day" of use, but the threshold in the code
#is 4 hours - confirm which one is intended
requested_2_days = accre_gpu['REQTIME'] >= pd.Timedelta(days=2)
used_4_hours = accre_gpu['USEDTIME'] >= pd.Timedelta(hours=4)
accre_gpu.loc[requested_2_days & used_4_hours, 'ACCOUNT'].value_counts().sort_index()

In [None]:
#number of jobs per account where the requested time exceeded the used time by 2 days or more
#(regardless of how much time was requested)
unused_2_days = accre_gpu['time_dif'] >= pd.Timedelta(days=2)
accre_gpu.loc[unused_2_days, 'ACCOUNT'].value_counts()

In [None]:
#looking at a plot of the count of different time differences (in hours) with each bin being 3 hours wide
#only jobs that did not run past their requested time (time_dif >= 0) are included
#the hours column is built on the filtered frame itself and passed by name, so the filter and the plotted
#values come from exactly the same rows instead of relying on index alignment with the unfiltered frame
within_request = (
    accre_gpu
    .loc[accre_gpu['time_dif'] >= pd.Timedelta(0)]
    #fractional hours; .dt.total_seconds() is used because hour resolution is not a supported
    #astype target on newer pandas
    .assign(time_dif_h=lambda jobs: jobs['time_dif'].dt.total_seconds() / 3600)
)

sns.set(rc={"figure.figsize":(14, 10)})
ax = sns.histplot(
    data=within_request,
    x='time_dif_h',
    binwidth=3,
    hue='PARTITION',
)
ax.set(
    xlabel='Requested minus used time (hours)',
    ylabel='Number of jobs',
    title='Distribution of unused requested time',
);

In [None]:
#looking more specifically at requests that are for 5 or more days, bins here are percentage of jobs rather than counts
#the hours column is built on the filtered frame itself and passed by name, so the count printed below and the
#histogram are guaranteed to describe the same rows
five_day_requests = (
    accre_gpu
    .loc[accre_gpu['REQTIME'] >= pd.Timedelta(days=5)]
    #fractional hours; .dt.total_seconds() is used because hour resolution is not a supported
    #astype target on newer pandas
    .assign(time_dif_h=lambda jobs: jobs['time_dif'].dt.total_seconds() / 3600)
)

print("Number of 5+ day requests: ")
print(len(five_day_requests))

sns.set(rc={"figure.figsize":(14, 10)})
ax = sns.histplot(
    data=five_day_requests,
    x='time_dif_h',
    stat='percent',
    binwidth=3,
    hue='PARTITION',
)
ax.set(
    xlabel='Requested minus used time (hours)',
    ylabel='Percent of 5+ day requests',
    title='Unused requested time for jobs requesting 5 or more days',
);

In [None]:
#every job that requested 5 or more days, ordered from the shortest to the longest actual runtime
#(an additional filter on time_dif <= '1 hour' was tried here and is currently disabled)
requested_5_days = accre_gpu['REQTIME'] >= pd.Timedelta(days=5)
accre_gpu.loc[requested_5_days].sort_values('USEDTIME')

In [None]:
#looking at the sum of all time differences (in hours) by partition and account
#user_details stores the per-user total as the timedelta column sum_time_dif and has no time_dif_h column,
#so the hours are derived here after summing up to the account level
account_time_dif = (
    user_details
    .groupby(['PARTITION', 'ACCOUNT'], as_index=False)['sum_time_dif']
    .sum()
    .assign(time_dif_h=lambda totals: totals['sum_time_dif'].dt.total_seconds() / 3600)
    .sort_values('ACCOUNT')
)

sns.set(rc={"figure.figsize":(10, 8)})
ax = sns.barplot(
    x='time_dif_h',
    y='ACCOUNT',
    hue='PARTITION',
    data=account_time_dif,
)
ax.set(
    xlabel='Total requested minus used time (hours)',
    ylabel='Account',
    title='Unused requested time by account and partition',
);

In [None]:
#looking at sum of time differences (in hours) for those users who have at least one job where the requested
#time exceeded the used time by 4 days or more
#user_details is keyed by ACCOUNT_USER (it has no USER column) and stores the total as the timedelta column
#sum_time_dif, so users are matched on ACCOUNT_USER and the hours are derived here
#NOTE(review): this plots account-scoped users (ACCOUNT_USER) rather than bare USER names - confirm that is
#the intended grouping
large_gap_users = accre_gpu.loc[accre_gpu['time_dif'] >= pd.Timedelta(days=4), 'ACCOUNT_USER']

large_gap_time_dif = (
    user_details
    .loc[user_details['ACCOUNT_USER'].isin(large_gap_users)]
    .groupby(['PARTITION', 'ACCOUNT_USER'], as_index=False)['sum_time_dif']
    .sum()
    .assign(time_dif_h=lambda totals: totals['sum_time_dif'].dt.total_seconds() / 3600)
    .sort_values('ACCOUNT_USER')
)

sns.set(rc={"figure.figsize":(10, 8)})
ax = sns.barplot(
    x='time_dif_h',
    y='ACCOUNT_USER',
    hue='PARTITION',
    data=large_gap_time_dif,
)
ax.set(
    xlabel='Total requested minus used time (hours)',
    ylabel='Account / user',
    title='Unused requested time for users with a 4+ day gap on at least one job',
);