In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import itertools

In [None]:
# Read the job export. on_bad_lines='skip' drops rows that cannot be parsed
# without raising or warning, so the frame may hold fewer rows than the file.
gpu_jobs_csv = '../data/accre-gpu-jobs-2022-v2.csv'
raw_df = pd.read_csv(gpu_jobs_csv, on_bad_lines='skip')

In [None]:
# Lowercase the headers in place so every later cell can use lowercase names.
raw_df.columns = raw_df.columns.str.lower()

# Keep only rows whose state is COMPLETED; the copy keeps the column writes
# below from touching raw_df.
completed_mask = raw_df['state'] == 'COMPLETED'
gpu_jobs_df = raw_df.loc[completed_mask].copy()

# Turn the first '-' of each value into ' days ' so pd.to_timedelta can parse
# it (presumably Slurm-style 'D-HH:MM:SS' strings -- confirm against the data).
for time_col in ('reqtime', 'usedtime'):
    gpu_jobs_df[time_col] = pd.to_timedelta(
        gpu_jobs_df[time_col].str.replace('-', ' days ', regex=False)
    )

In [None]:
# Spread of requested time, converted from timedelta to fractional days.
ax = sns.boxplot(
    x = gpu_jobs_df['reqtime'] / np.timedelta64(1,'D')
)
# Without an explicit label the axis only shows the column name and hides
# the fact that the values are in days.
ax.set(xlabel = 'Requested time (days)', title = 'Requested time per completed job');

In [None]:
# Spread of used time, converted from timedelta to fractional days.
ax = sns.boxplot(
    x = gpu_jobs_df['usedtime'] / np.timedelta64(1,'D')
)
# Without an explicit label the axis only shows the column name and hides
# the fact that the values are in days.
ax.set(xlabel = 'Used time (days)', title = 'Used time per completed job');

In [None]:
# Histogram of requested time in days, one colour per partition.
ax = sns.histplot(
    x = gpu_jobs_df['reqtime'] / np.timedelta64(1,'D'),
    hue = gpu_jobs_df['partition'],
    bins = 10
)
# Label the axis with the unit; the default label is just the column name.
ax.set(xlabel = 'Requested time (days)', title = 'Requested time by partition');

In [None]:
# Histogram of used time in days, one colour per partition.
ax = sns.histplot(
    x = gpu_jobs_df['usedtime'] / np.timedelta64(1,'D'),
    hue = gpu_jobs_df['partition'],
    bins = 10
)
# Label the axis with the unit; the default label is just the column name.
ax.set(xlabel = 'Used time (days)', title = 'Used time by partition');

In [None]:
def all_used_percent(percent):
    """Share of all jobs that used at least a given fraction of their request.

    Reads the module-level ``gpu_jobs_df``.

    Parameters
    ----------
    percent : iterable of float
        Fractions of ``reqtime`` (for example ``0.1`` for ten percent).

    Returns
    -------
    list
        One value per fraction: the percentage (0-100, rounded to two
        decimals) of rows whose ``usedtime`` is at least ``reqtime * fraction``.
    """
    def share(fraction):
        reached = gpu_jobs_df['usedtime'] >= gpu_jobs_df['reqtime'] * fraction
        hits = gpu_jobs_df.loc[reached, 'jobid'].count()
        return (hits / gpu_jobs_df['jobid'].count() * 100).round(2)

    return [share(fraction) for fraction in percent]

In [None]:
# Fractions of the requested time used as thresholds by the percentage helpers.
percentages = [0.1, 0.2]

# Labels for the request-length buckets; no cell visible in this notebook
# reads this list.
days = [
    'less than',
    1, 2, 3, 4, 5,
    'more than',
]

In [None]:
def less_than_day(percent):
    """Usage share among jobs that requested strictly less than one day.

    Reads the module-level ``gpu_jobs_df``.

    Parameters
    ----------
    percent : iterable of float
        Fractions of ``reqtime`` to test against.

    Returns
    -------
    list
        For each fraction, the percentage (rounded to two decimals) of the
        under-one-day requests whose ``usedtime`` reached
        ``reqtime * fraction``.
    """
    def share(fraction):
        short_request = gpu_jobs_df['reqtime'] < np.timedelta64(1, 'D')
        reached = gpu_jobs_df['usedtime'] >= gpu_jobs_df['reqtime'] * fraction
        hits = gpu_jobs_df.loc[short_request & reached, 'jobid'].count()
        bucket_size = gpu_jobs_df.loc[short_request, 'jobid'].count()
        return (hits / bucket_size * 100).round(2)

    return [share(fraction) for fraction in percent]

In [None]:
def per_day(day, percent):
    """Usage share among jobs whose request falls in one whole-day bucket.

    The bucket is ``day <= reqtime < day + 1`` (in days). Reads the
    module-level ``gpu_jobs_df``.

    Parameters
    ----------
    day : int
        Lower edge of the bucket, in days.
    percent : iterable of float
        Fractions of ``reqtime`` to test against.

    Returns
    -------
    list
        For each fraction, the percentage (rounded to two decimals) of the
        bucket's jobs whose ``usedtime`` reached ``reqtime * fraction``.
    """
    def share(fraction):
        requested = gpu_jobs_df['reqtime']
        in_bucket = (
            (requested >= np.timedelta64(day, 'D'))
            & (requested < np.timedelta64(day + 1, 'D'))
        )
        reached = gpu_jobs_df['usedtime'] >= requested * fraction
        hits = gpu_jobs_df.loc[in_bucket & reached, 'jobid'].count()
        bucket_size = gpu_jobs_df.loc[in_bucket, 'jobid'].count()
        return (hits / bucket_size * 100).round(2)

    return [share(fraction) for fraction in percent]

In [None]:
def more_than_five(percent):
    """Usage share among jobs that requested strictly more than five days.

    Reads the module-level ``gpu_jobs_df``.

    NOTE(review): ``per_day(5, ...)`` covers requests from 5 up to (but not
    including) 6 days, so requests strictly between 5 and 6 days are counted
    by both helpers -- confirm that overlap is intended.

    Parameters
    ----------
    percent : iterable of float
        Fractions of ``reqtime`` to test against.

    Returns
    -------
    list
        For each fraction, the percentage (rounded to two decimals) of the
        over-five-day requests whose ``usedtime`` reached
        ``reqtime * fraction``.
    """
    def share(fraction):
        long_request = gpu_jobs_df['reqtime'] > np.timedelta64(5, 'D')
        reached = gpu_jobs_df['usedtime'] >= gpu_jobs_df['reqtime'] * fraction
        hits = gpu_jobs_df.loc[long_request & reached, 'jobid'].count()
        bucket_size = gpu_jobs_df.loc[long_request, 'jobid'].count()
        return (hits / bucket_size * 100).round(2)

    return [share(fraction) for fraction in percent]

In [None]:
# One row per request-length bucket, one column per usage threshold.
# The table is built directly in its final orientation. The earlier
# transpose / promote-first-row approach left a stray columns name (0) in the
# header and an index that started at 1.
bucket_shares = {
    'all_entries': all_used_percent(percentages),
    'less_than_day': less_than_day(percentages),
    'one_day': per_day(1, percentages),
    'two_days': per_day(2, percentages),
    'three_days': per_day(3, percentages),
    'four_days': per_day(4, percentages),
    'five_days': per_day(5, percentages),
    'more_than_five': more_than_five(percentages),
}

percent_df = (
    pd.DataFrame.from_dict(
        bucket_shares,
        orient = 'index',
        # Column headers are the thresholds expressed as percentages (10.0, 20.0).
        columns = [fraction * 100 for fraction in percentages],
    )
    # The bucket names become the 'req_days' column, as in the original table.
    .rename_axis('req_days')
    .reset_index()
)

In [None]:
# Display the per-bucket usage table.
percent_df

In [None]:
# Jobs that requested two days or more. This is a selection of gpu_jobs_df,
# not an explicit copy.
two_day_floor = np.timedelta64(2, 'D')
req_2_more_df = gpu_jobs_df.loc[gpu_jobs_df['reqtime'] >= two_day_floor]

In [None]:
def used_percent(data, percent):
    """Percentage of rows in ``data`` that used at least a fraction of their request.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``jobid``, ``usedtime`` and ``reqtime`` columns.
    percent : float
        Fraction of ``reqtime`` that ``usedtime`` has to reach.

    Returns
    -------
    The share of non-null ``jobid`` rows meeting the threshold, as a
    percentage rounded to two decimals.
    """
    reached = data['usedtime'] >= data['reqtime'] * percent
    hits = data.loc[reached, 'jobid'].count()
    return (hits / data['jobid'].count() * 100).round(2)

In [None]:
# Share of the 2+ day requests that used at least 10% of the requested time.
used_percent(data=req_2_more_df, percent=0.1)

In [None]:
# Share of the 2+ day requests that used at least 20% of the requested time.
used_percent(data=req_2_more_df, percent=0.2)

In [None]:
# Percentage of the 2+ day requests that used one hour or less.
short_run_mask = req_2_more_df['usedtime'] <= np.timedelta64(1, 'h')
all_entries = (
    req_2_more_df.loc[short_run_mask, 'jobid'].count()
    / req_2_more_df['jobid'].count()
    * 100
).round(2)

In [None]:
# Same one-hour-or-less percentage, restricted to the maxwell partition.
on_maxwell = req_2_more_df['partition'] == 'maxwell'
maxwell_short = on_maxwell & (req_2_more_df['usedtime'] <= np.timedelta64(1, 'h'))
maxwell = (
    req_2_more_df.loc[maxwell_short, 'jobid'].count()
    / req_2_more_df.loc[on_maxwell, 'jobid'].count()
    * 100
).round(2)

In [None]:
# Same one-hour-or-less percentage, restricted to the pascal partition.
on_pascal = req_2_more_df['partition'] == 'pascal'
pascal_short = on_pascal & (req_2_more_df['usedtime'] <= np.timedelta64(1, 'h'))
pascal = (
    req_2_more_df.loc[pascal_short, 'jobid'].count()
    / req_2_more_df.loc[on_pascal, 'jobid'].count()
    * 100
).round(2)

In [None]:
# Same one-hour-or-less percentage, restricted to the turing partition.
on_turing = req_2_more_df['partition'] == 'turing'
turing_short = on_turing & (req_2_more_df['usedtime'] <= np.timedelta64(1, 'h'))
turing = (
    req_2_more_df.loc[turing_short, 'jobid'].count()
    / req_2_more_df.loc[on_turing, 'jobid'].count()
    * 100
).round(2)

In [None]:
# Summary of the 2+ day requests: how many there are overall and per
# partition, and how many of them used one hour or less.
partition_names = ['maxwell', 'pascal', 'turing']
ran_under_hour = req_2_more_df['usedtime'] <= np.timedelta64(1,'h')

# First mask selects every row ('all_entries'); the rest select one partition each.
row_masks = [pd.Series(True, index = req_2_more_df.index)] + [
    req_2_more_df['partition'] == name for name in partition_names
]

under_hour_df = pd.DataFrame({
    'partition': ['all_entries'] + partition_names,
    'total': [
        req_2_more_df.loc[mask, 'jobid'].count() for mask in row_masks
    ],
    'under_hour': [
        req_2_more_df.loc[mask & ran_under_hour, 'jobid'].count() for mask in row_masks
    ],
})

# Derive the percentage from the two count columns of this table, so it
# cannot drift from them (it was previously taken from separately computed
# variables).
under_hour_df['percent'] = (
    under_hour_df['under_hour'] / under_hour_df['total'] * 100
).round(2)

In [None]:
# Display the one-hour-or-less summary table.
under_hour_df

In [None]:
# Bar per row of under_hour_df, with the 'total' job count as height.
sns.barplot(x = 'partition', y = 'total', data = under_hour_df);

In [None]:
# Despite the name, this keeps the 2+ day requests that used at least 10% of
# their requested time. It is copied so columns can be added later.
used_tenth_mask = req_2_more_df['usedtime'] >= req_2_more_df['reqtime'] * .1
top_ten_per_df = req_2_more_df.loc[used_tenth_mask].copy()

In [None]:
# Fraction (0-1) of top_ten_per_df jobs that ran on the pascal partition.
(
    top_ten_per_df.loc[top_ten_per_df['partition'] == 'pascal', 'jobid'].count()
    / top_ten_per_df['jobid'].count()
)

In [None]:
# Fraction (0-1) of top_ten_per_df jobs that ran on the maxwell partition.
(
    top_ten_per_df.loc[top_ten_per_df['partition'] == 'maxwell', 'jobid'].count()
    / top_ten_per_df['jobid'].count()
)

In [None]:
# Fraction (0-1) of top_ten_per_df jobs that ran on the turing partition.
(
    top_ten_per_df.loc[top_ten_per_df['partition'] == 'turing', 'jobid'].count()
    / top_ten_per_df['jobid'].count()
)

In [None]:
# Express both durations as fractional days. The '_per_day' column names
# hold plain day counts, not rates.
one_day = np.timedelta64(1, 'D')
top_ten_per_df['used_per_day'] = top_ten_per_df['usedtime'] / one_day
top_ten_per_df['req_per_day'] = top_ten_per_df['reqtime'] / one_day

# Unused part of the request, in days (requested minus used).
top_ten_per_df['time_difference'] = (
    top_ten_per_df['req_per_day'] - top_ten_per_df['used_per_day']
)

In [None]:
# Jobs that requested exactly five days and left at most 0.0417 days of the
# request unused (0.0417 * 24 is roughly one hour).
asked_five_days = top_ten_per_df['reqtime'] == np.timedelta64(5, 'D')
nearly_exhausted = top_ten_per_df['time_difference'] <= 0.0417
five_day_run_out_df = top_ten_per_df.loc[asked_five_days & nearly_exhausted]

In [None]:
# Number of such jobs per (account, user), largest first.
(
    five_day_run_out_df
    .groupby(['account', 'user'])['jobid']
    .count()
    .sort_values(ascending = False)
)

In [None]:
# Same count, further split by partition, largest first.
(
    five_day_run_out_df
    .groupby(['account', 'user', 'partition'])['jobid']
    .count()
    .sort_values(ascending = False)
)

In [None]:
# Number of rows in five_day_run_out_df with a non-null jobid.
five_day_run_out_df['jobid'].count()

In [None]:
def total(account, user):
    """Count rows of ``gpu_jobs_df`` matching an account/user pair.

    Parameters
    ----------
    account, user
        Values compared for equality against the ``account`` and ``user``
        columns.

    Returns
    -------
    Number of matching rows with a non-null ``jobid``.
    """
    owner_mask = (gpu_jobs_df['account'] == account) & (gpu_jobs_df['user'] == user)
    return gpu_jobs_df.loc[owner_mask, 'jobid'].count()

In [None]:
def five_hour_req(account, user):
    """Count an account/user pair's jobs that requested exactly five days.

    Despite the function name, the filter is ``reqtime == 5 days``, not five
    hours. Reads the module-level ``gpu_jobs_df``.

    Parameters
    ----------
    account, user
        Values compared for equality against the ``account`` and ``user``
        columns.

    Returns
    -------
    Number of matching rows with a non-null ``jobid``.
    """
    owner_mask = (gpu_jobs_df['account'] == account) & (gpu_jobs_df['user'] == user)
    asked_five_days = gpu_jobs_df['reqtime'] == np.timedelta64(5, 'D')
    return gpu_jobs_df.loc[owner_mask & asked_five_days, 'jobid'].count()

In [None]:
def under_hour(account, user):
    """Count a pair's five-day requests that used one hour or less.

    Reads the module-level ``gpu_jobs_df``.

    Parameters
    ----------
    account, user
        Values compared for equality against the ``account`` and ``user``
        columns.

    Returns
    -------
    Number of rows with ``reqtime == 5 days`` and ``usedtime <= 1 hour`` for
    that pair, counting non-null ``jobid`` values.
    """
    owner_mask = (gpu_jobs_df['account'] == account) & (gpu_jobs_df['user'] == user)
    asked_five_days = gpu_jobs_df['reqtime'] == np.timedelta64(5, 'D')
    ran_briefly = gpu_jobs_df['usedtime'] <= np.timedelta64(1, 'h')
    return gpu_jobs_df.loc[owner_mask & asked_five_days & ran_briefly, 'jobid'].count()

In [None]:
# All jobs, five-day requests, and five-day requests that used an hour or
# less for harovin / porsha (evaluated in that order).
har_total, har_five, har_under_hour = (
    count_jobs('harovin', 'porsha')
    for count_jobs in (total, five_hour_req, under_hour)
)

In [None]:
# All jobs, five-day requests, and five-day requests that used an hour or
# less for glasshouse / jolette (evaluated in that order).
gla_total, gla_five, gla_under_hour = (
    count_jobs('glasshouse', 'jolette')
    for count_jobs in (total, five_hour_req, under_hour)
)

In [None]:
# All jobs, five-day requests, and five-day requests that used an hour or
# less for harovin / valentina (evaluated in that order).
val_total, val_five, val_under_hour = (
    count_jobs('harovin', 'valentina')
    for count_jobs in (total, five_hour_req, under_hour)
)

In [None]:
# All jobs, five-day requests, and five-day requests that used an hour or
# less for glasshouse / erick (evaluated in that order).
eri_total, eri_five, eri_under_hour = (
    count_jobs('glasshouse', 'erick')
    for count_jobs in (total, five_hour_req, under_hour)
)

In [None]:
# Per-user summary for the four account/user pairs examined in the cells above.
flagged_users = [
    ('harovin', 'porsha'),
    ('glasshouse', 'jolette'),
    ('harovin', 'valentina'),
    ('glasshouse', 'erick'),
]


def _timeout_jobs(account, user, partition = None):
    """Count rows of five_day_run_out_df for one pair, optionally one partition."""
    mask = (
        (five_day_run_out_df['account'] == account)
        & (five_day_run_out_df['user'] == user)
    )
    if partition is not None:
        mask = mask & (five_day_run_out_df['partition'] == partition)
    return five_day_run_out_df.loc[mask, 'jobid'].count()


# The timeout and per-partition counts are computed from five_day_run_out_df
# instead of being typed in from an earlier cell's output, so the table stays
# in step with the data on every Restart & Run All.
# NOTE(review): assumes the previously hard-coded figures were read off the
# groupby counts of five_day_run_out_df -- confirm the values still match.
two_run_out_df = pd.DataFrame({
    'account': [account for account, _user in flagged_users],
    'user': [user for _account, user in flagged_users],
    'total_jobs': [har_total, gla_total, val_total, eri_total],
    'five_day_total': [har_five, gla_five, val_five, eri_five],
    'finish_under_hour': [har_under_hour, gla_under_hour, val_under_hour, eri_under_hour],
    'five_day_timeout': [_timeout_jobs(account, user) for account, user in flagged_users],
    'maxwell': [_timeout_jobs(account, user, 'maxwell') for account, user in flagged_users],
    'pascal': [_timeout_jobs(account, user, 'pascal') for account, user in flagged_users],
    'turing': [_timeout_jobs(account, user, 'turing') for account, user in flagged_users],
})

In [None]:
# Display the per-user summary table.
two_run_out_df