## FRESCO Analytics Notebook
### Overview
This notebook has been designed to make analysis of the Anvil dataset as easy as possible. Generally speaking, it will allow the user to access the Anvil files stored locally, select a number of analysis options, and view the results.
### Instructions
1. Run the first cell and provide the complete directory path to the data in the 'Path' field.
2. Run the second cell. If data preprocessing is needed, select the desired options. Otherwise, skip to the "If data preprocessing options were not selected, follow these instructions" section below.
#### If data preprocessing options were selected, follow these instructions
3. Run cell 3 and provide the start and end times.
4. Run cell 4 and select the units to be included in the timeseries data.
5. Run cell 5 and provide the desired options.
6. Run cell 6 and provide the desired statistic.
7. Run cell 7 and provide the desired data visualization options.
8. Run cell 8 to see the data visualizations.

#### If data preprocessing options were not selected, follow these instructions
3. Run cell 3 (no input is required; it initializes the preprocessing flags that cell 4 reads), then run cell 4, provide the start and end times, and select the units to be included in the timeseries data.
4. Run cell 5 and provide the desired options.
5. Run cell 6 and provide the desired statistic.
6. Run cell 7 and provide the desired data visualization options.
7. Run cell 8 to see the data visualizations.


In [1]:
# -------------- CELL 1 --------------

from IPython.display import display, FileLink
import ipywidgets as widgets
import pandas as pd
import notebook_functions as nbf

# Ask for the folder that holds the data files. The raw string keeps the
# backslash in the example Windows path from being read as an escape.
print(r"Please provide the directory path to the data files e.g., D:\Data")

# Empty, editable text box; cell 2 reads the entry through dir_path.value.
dir_path = widgets.Text(description='Path:', value='', placeholder='', disabled=False)
display(dir_path)

Please provide the directory path to the data files e.g., D:\Data


Text(value='', description='Path:', placeholder='')

In [2]:
# -------------- CELL 2 --------------

# Hand the text typed into the 'Path' box to the helper; its result is kept as
# data_path and passed to the preprocessing helpers in cell 4.
data_path = nbf.get_data_files_directory(dir_path.value)

print("Data preprocessing: select either option (or both):")

# Multi-select list that starts with only 'None' highlighted. Cell 3 looks for
# the words "Remove" and "Add" inside the chosen labels, so the label text matters.
preprocessing = widgets.SelectMultiple(
    description='Preprocessing:',
    options=['None', 'Remove Rows with Missing Metrics?', 'Add an Interval Column?'],
    value=['None'],
    disabled=False,
)

display(preprocessing)

Data preprocessing: select either option (or both):


SelectMultiple(description='Preprocessing:', index=(0,), options=('None', 'Remove Rows with Missing Metrics?',…

In [3]:
# -------------- CELL 3 --------------
# Work out which preprocessing steps were picked in cell 2 by matching on the
# option labels: "Remove" marks the missing-metrics option, "Add" the interval one.
remove_selected = any("Remove" in choice for choice in preprocessing.value)
interval_selected = any("Add" in choice for choice in preprocessing.value)

# Flags read by cell 4 to decide which preprocessing helpers to call.
pre_proc_bools = {
    "preprocess_remove": remove_selected,
    "add_interval_col": interval_selected,
}

# The time-range boxes are shown here only when some preprocessing was requested;
# otherwise cell 4 creates them.
if remove_selected or interval_selected:
    print("Please add the start time and end time for the data preprocessing. Please use the form MM-DD-YYYY HH:MM:SS")
    start_time = widgets.Text(
        description='Start Time:',
        value='01-01-2020 00:01:01',
        placeholder='',
        disabled=False
    )
    end_time = widgets.Text(
        description='End Time:',
        value='12-31-9999 23:59:59',
        placeholder='',
        disabled=False
    )
    display(start_time, end_time)



In [4]:
# -------------- CELL 4 --------------

# Start from an empty frame so `dataframe` exists even when no preprocessing runs.
dataframe = pd.DataFrame()

# NOTE(review): both helpers receive data_path rather than the frame built so
# far, so when both options are selected the second assignment replaces the
# first result — confirm that this is intended.
if pre_proc_bools["preprocess_remove"]:
    dataframe = nbf.handle_missing_metrics(start_time.value, end_time.value, data_path)

if pre_proc_bools["add_interval_col"]:
    dataframe = nbf.add_interval_column(start_time.value, end_time.value, data_path)

# Without any preprocessing, cell 3 did not create the time-range boxes, so they
# are created and shown here instead.
if not (pre_proc_bools["preprocess_remove"] or pre_proc_bools["add_interval_col"]):
    print("Please enter a start time and end time. Please use the form MM-DD-YYYY HH:MM:SS")
    start_time = widgets.Text(
        description='Start Time:',
        value='01-01-2020 00:01:01',
        placeholder='',
        disabled=False
    )
    end_time = widgets.Text(
        description='End Time:',
        value='12-31-9999 23:59:59',
        placeholder='',
        disabled=False
    )
    display(start_time, end_time)

print("Optional: select the units to be included in the timeseries data.")

# Multi-select of unit labels; 'None' is the placeholder meaning no unit filter.
units = widgets.SelectMultiple(
    description='Units:',
    options=['None', 'CPU %', 'GPU %', 'GB:memused', 'GB:memused_minus_diskcache', 'GB/s', 'MB/s'],
    value=['None'],
    disabled=False,
)

display(units)

Please enter a start time and end time.


Text(value='01-01-2020', description='Start Time:', placeholder='')

Text(value='12-31-9999', description='End Time:', placeholder='')

Optional: select the units to be included in the timeseries data.


SelectMultiple(description='Units:', index=(0,), options=('None', 'CPU %', 'GPU %', 'GB:memused', 'GB:memused_…

In [5]:
# -------------- CELL 5 --------------

# Maps a unit label from the units list in cell 4 to the user's
# (low_value, high_value) pair for that unit.
unit_values = {}

# Build the low/high input widgets for every selected unit, skipping the
# 'None' placeholder entry.
for value in units.value:
    if value == 'None':
        continue
    nbf.setup_widgets(unit_values, value)

# Free-text box for a comma-separated list of hosts.
print("Optional: provide the hosts to be included in the timeseries data e.g., 'NODE1, NODE2'")
hosts = widgets.Text(description='Hosts:', value='', placeholder='', disabled=False)
display(hosts)

# Free-text box for a comma-separated list of job ids.
print("Optional: provide the jobs to be included in the timeseries data e.g., 'JOB1, JOB2'")
job_ids = widgets.Text(description='Jobs:', value='', placeholder='', disabled=False)
display(job_ids)

# On/off toggle, off by default.
print("Optional: select if you want the account logs to be returned for the Job IDs matching your query.")
return_account_logs = widgets.ToggleButton(
    description='Account Logs',
    tooltip='Return Account Logs?',
    icon='check',
    button_style='',
    value=False,
    disabled=False
)
display(return_account_logs)

# Column picker; it starts on the 'None' placeholder.
print("Optional: select the columns to be included in the timeseries data (hold control to select multiple). If no columns are "
      "selected, all columns will be included.")
timeseries_return_columns = widgets.SelectMultiple(
    description='Return Columns',
    options=['None', 'Job Id', 'Hosts', 'Events', 'Units', 'Values', 'Timestamps'],
    value=['None'],
    disabled=False
)
display(timeseries_return_columns)

********************************
Enter the low value for CPU %


FloatText(value=0.1, description='CPU % Low Value:')

Enter the high value for CPU %


FloatText(value=99.9, description='CPU % High Value:')

Button(description='Save Values', style=ButtonStyle())

********************************
Enter the low value for GPU %


FloatText(value=0.1, description='GPU % Low Value:')

Enter the high value for GPU %


FloatText(value=99.9, description='GPU % High Value:')

Button(description='Save Values', style=ButtonStyle())

Optional: provide the hosts to be included in the timeseries data e.g., 'NODE1, NODE2'


Text(value='', description='Hosts:', placeholder='')

Optional: provide the jobs to be included in the timeseries data e.g., 'JOB1, JOB2'


Text(value='', description='Jobs:', placeholder='')

Optional: select if you want the account logs to be returned for the Job IDs matching your query.


ToggleButton(value=False, description='Account Logs', icon='check', tooltip='Return Account Logs?')

Optional: select the columns to be included in the timeseries data (hold control to select multiple). If no columns are selected, all columns will be included.


SelectMultiple(description='Return Columns', index=(0,), options=('None', 'Job Id', 'Hosts', 'Events', 'Units'…

In [None]:
# -------------- CELL 6 --------------

# Run the timestamp query with the time window and the chosen return columns.
nbf.get_timeseries_by_timestamp(start_time.value, end_time.value, timeseries_return_columns.value)

# units.value is a tuple of the selected labels, so comparing it with the string
# "None" is always unequal. Check for at least one real unit instead, which
# matches how cell 5 decides which units get low/high widgets.
if any(unit != 'None' for unit in units.value):
    nbf.get_timeseries_by_values_and_unit(unit_values)

# An empty text box means no host filter was requested.
if hosts.value:
    nbf.get_timeseries_by_hosts(hosts.value)

# NOTE(review): this runs whenever job ids were entered and does not consult the
# return_account_logs toggle from cell 5 — confirm whether it should.
if job_ids.value:
    nbf.get_account_logs_by_job_ids(job_ids.value)

print("Do you want to download the filtered data?")
# Create download button
download_button = widgets.Button(description="Download Data")


def on_button_clicked(b):
    """Show a download link for `dataframe` when the button is clicked.

    `b` is the clicked button instance supplied by ipywidgets; it is unused.
    NOTE(review): `dataframe` is the frame assigned in cell 4; the results of
    the query calls above are not stored in it — confirm that is intended.
    """
    display(nbf.create_download_link(dataframe))


download_button.on_click(on_button_clicked)
display(download_button)

# Statistic picker; 'Mean' is preselected.
print("Please select which statistics to be calculated:")
stats = widgets.SelectMultiple(
    options=['Average', 'Mean', 'Median', 'Standard Deviation', 'PDF', 'CDF', 'Ratio of Data Outside Threshold'],
    value=['Mean'],
    description='Statistics',
    disabled=False
)
display(stats)

In [None]:
# -------------- CELL 7 --------------

# Display statistical data here


# Give the user the option to calculate correlations
print("If you would like to explore correlations among metrics and statistics, select from the options below:")

# NOTE(review): the memory labels here ('GB(memused)') are spelled differently
# from the unit labels in cell 4 ('GB:memused') — confirm which form the
# helpers expect.
correlations = widgets.SelectMultiple(
    description='Metrics',
    options=['None', 'CPU %', 'GPU %', 'GB(memused_minus_diskcache)', 'GB(memused)', 'GB/s', 'MB/s'],
    value=['None'],
    disabled=False
)

# NOTE(review): this rebinds `stats`, replacing the widget created in cell 6
# (and whatever was selected there) with a fresh one defaulting to 'Mean'.
stats = widgets.SelectMultiple(
    description='Statistics',
    options=['Average', 'Mean', 'Median', 'Standard Deviation', 'PDF', 'CDF', 'Ratio of Data Outside Threshold'],
    value=['Mean'],
    disabled=False
)

display(correlations, stats)


In [None]:
# -------------- CELL 8 --------------

# calculate correlations
# The helper is called with no arguments and its return value is not kept.
# NOTE(review): the metric and statistic selections from cell 7 are not passed
# in here — confirm how the helper is meant to receive them.
nbf.calculate_correlation()

# Display correlation visualizations here


In [None]:
# -------------- CELL 9 ---------------

# Give the user the option to download data here.
print("Select the files to be downloaded:")

# Multi-select of file names; it starts on the 'None' placeholder.
files_to_provide = widgets.SelectMultiple(
    description='Files',
    options=[
        'None',
        'job_ts_metrics_aug2022_anon',
        'job_ts_metrics_dec2022_anon',
        'job_ts_metrics_jan2022_anon',
        'job_ts_metrics_july2022_anon',
        'job_ts_metrics_nov2022_anon',
        'job_ts_metrics_sep2022_anon',
    ],
    value=['None'],
    disabled=False
)
display(files_to_provide)

# Create and display download button
# NOTE(review): this rebinds `download_button` from cell 6, and the click
# handler is registered without the files_to_provide selection being passed
# to it — confirm how the handler reads the chosen files.
download_button = widgets.Button(description='Download File/s')
download_button.on_click(nbf.on_download_button_clicked)
display(download_button)

In [None]:
# -------------- CELL 10 ---------------
