In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
import seaborn as sns
import re
from jupyterthemes import jtplot
from datetime import timedelta, datetime
import calendar
import plotly.express as px
jtplot.style()

In [2]:
def getlogs(server, data_dir = '../data'):
    """Load one server's slurm wrapper log and derive the analysis columns.

    Reads ``{data_dir}/slurm_wrapper_{server}.log``, a text log whose six
    fields are separated by ' - ', and returns it as a DataFrame.

    Parameters
    ----------
    server : str
        Server name; used to build the file name and stored in SERVER.
    data_dir : str, default '../data'
        Directory that contains the log file.

    Returns
    -------
    pandas.DataFrame
        One row per log line with the columns
        DATETIME     - timestamp parsed with ``pd.to_datetime``
        USER         - first run of digits in the user field (string)
        RETRY        - first run of digits in the retry field (int)
        TIMELAPSE    - the time field with the 'time ' prefix removed (float)
        RETURNCODE   - first run of digits in the return-code field (string)
        COMMAND      - the raw command field
        JOBID        - first quoted 8-digit token in COMMAND, NaN if none
        COMMANDTYPE  - executable name matched by ``bin/(s\\w+)'``, NaN if none
        TESTING      - True for 'sbatch' commands issued by USER '9204'
        UNRESPONSIVE - True when TIMELAPSE >= 15 and RETURNCODE is '1'
        SERVER       - the ``server`` argument as a string
    """
    # A multi-character separator is interpreted as a regular expression,
    # which only the python parsing engine supports.
    logs = pd.read_csv(f'{data_dir}/slurm_wrapper_{server}.log',
                       sep = ' - ',
                       engine = 'python',
                       names = ['DATETIME', 'USER', 'RETRY', 'TIMELAPSE', 'RETURNCODE', 'COMMAND'])

    logs['DATETIME'] = pd.to_datetime(logs['DATETIME'])

    # Raw-string patterns avoid the invalid '\d' escape warning;
    # expand = False yields a Series rather than a one-column DataFrame.
    logs['USER'] = logs['USER'].str.extract(r'(\d+)', expand = False)
    logs['RETRY'] = logs['RETRY'].str.extract(r'(\d+)', expand = False).astype('int')
    # Literal prefix removal: regex = False keeps the result independent of
    # the pandas version's default for this argument.
    logs['TIMELAPSE'] = logs['TIMELAPSE'].str.replace('time ', '', regex = False).astype('float')
    # RETURNCODE stays a string; the UNRESPONSIVE test below compares against '1'.
    logs['RETURNCODE'] = logs['RETURNCODE'].str.extract(r'(\d+)', expand = False)

    logs['JOBID'] = logs['COMMAND'].str.extract(r"'(\d{8})'", expand = False)
    logs['COMMANDTYPE'] = logs['COMMAND'].str.extract(r"bin/(s\w+)'", expand = False)

    logs['TESTING'] = (logs['COMMANDTYPE'] == 'sbatch') & (logs['USER'] == '9204')
    logs['UNRESPONSIVE'] = (logs['TIMELAPSE'] >= 15) & (logs['RETURNCODE'] == '1')
    logs['SERVER'] = str(server)
    return logs

In [3]:
# Parse the wrapper log of each server (ce5, then ce6) into its own DataFrame
ce5, ce6 = (getlogs(name) for name in ('ce5', 'ce6'))

In [4]:
# Stack both servers' logs into one frame ordered by timestamp,
# discarding the per-server row labels in favour of a fresh 0..n-1 index
ce = (
    pd.concat([ce5, ce6])
    .sort_values('DATETIME')
    .reset_index(drop = True)
)

Unnamed: 0,DATETIME,USER,RETRY,TIMELAPSE,RETURNCODE,COMMAND,JOBID,COMMANDTYPE,TESTING,UNRESPONSIVE,SERVER
0,2020-10-16 08:15:39.278699,0,0,0.073476,0,"command ['/usr/bin/sacct', '-u', 'appelte1', '...",,sacct,False,False,ce5
1,2020-10-16 08:18:08.313309,0,0,0.183632,0,"command ['/usr/bin/sacct', '-u', 'appelte1', '...",,sacct,False,False,ce5
2,2020-10-16 08:22:48.128689,0,0,0.075471,0,"command ['/usr/bin/sacct', '-u', 'appelte1', '...",,sacct,False,False,ce5
3,2020-10-16 08:25:13.257408,0,0,0.094844,0,"command ['/usr/bin/sacct', '-u', 'appelte1', '...",,sacct,False,False,ce5
4,2020-10-16 08:31:01.460723,0,0,0.074988,0,"command ['/usr/bin/sacct', '-u', 'appelte1', '...",,sacct,False,False,ce5
...,...,...,...,...,...,...,...,...,...,...,...
9547408,2021-10-07 21:59:35.014602,9221,0,0.060087,0,"command ['/usr/bin/squeue', '-o', '%i %T', '-u...",,squeue,False,False,ce6
9547409,2021-10-07 21:59:35.238970,9202,0,0.098044,0,"command ['/usr/bin/squeue', '-o', '%i %T', '-u...",,squeue,False,False,ce6
9547410,2021-10-07 21:59:57.265189,9203,0,0.024550,0,"command ['/usr/bin/squeue', '-o', '%i %T', '-u...",,squeue,False,False,ce6
9547411,2021-10-07 22:00:04.024360,9201,0,0.039419,0,"command ['/usr/bin/squeue', '-o', '%i %T', '-u...",,squeue,False,False,ce6


# Problem 3

Calculate some descriptive statistics about 

(3A) how often the scheduler was unresponsive, 

(3B) how long these periods of time were, and 

(3C) create a time series plot of when the scheduler was having difficulties.

## (3A) How often was the scheduler unresponsive?

In [19]:
def ct_prop(df, group):
    """Tabulate how often each value of one column occurs, as count and share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    group : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing value of ``df[group]``, most frequent
        first, with the columns ``group``, 'COUNT' (number of rows) and
        'PROPORTION' (COUNT divided by the total of COUNT).
    """
    # Name the Series and its index explicitly instead of renaming whatever
    # reset_index() produces: those default column names differ between
    # pandas 1.x ('index', <group>) and 2.x (<group>, 'count').
    summary = (
        df[group]
        .value_counts()
        .rename('COUNT')
        .rename_axis(group)
        .reset_index()
    )
    # Same value as value_counts(normalize = True), without a second pass or a merge.
    summary['PROPORTION'] = summary['COUNT'] / summary['COUNT'].sum()
    return summary

In [20]:
# Count and share of log entries by UNRESPONSIVE flag, both servers combined
ct_prop(df = ce, group = 'UNRESPONSIVE')

Unnamed: 0,UNRESPONSIVE,COUNT,PROPORTION
0,0,8533047,0.893755
1,1,1014366,0.106245


In [25]:
# Number of log entries per server, split by the UNRESPONSIVE flag
(
    ce
    .groupby(['SERVER', 'UNRESPONSIVE'])['UNRESPONSIVE']
    .count()
)

SERVER  UNRESPONSIVE
ce5     0               4335065
        1                435828
ce6     0               4197982
        1                578538
Name: UNRESPONSIVE, dtype: int64

In [27]:
# Mean of the UNRESPONSIVE flag per server, i.e. the share of flagged entries
(
    ce
    .groupby(['SERVER'])['UNRESPONSIVE']
    .mean()
)

SERVER
ce5    0.091351
ce6    0.121121
Name: UNRESPONSIVE, dtype: float64

# Problem 4

Finally, combine the time series information from the two datasets together to see 

(4A) how well correlated heavy job-completion load is with the unresponsiveness, and 

(4B) whether there is some threshold of job completions per hour that generally results in unresponsiveness.

In [None]:
# Read slurm_jobs.csv (presumably the job-level dataset Problem 4 refers to - confirm)
slurm_jobs = pd.read_csv(
    '../data/slurm_jobs.csv',
)