# FRESCO Analytics Notebook
### Overview
This notebook has been designed to make analysis of the Anvil dataset as easy as possible. Generally speaking, it will allow the user to access the Anvil files stored locally, select a number of analysis options, and view the results.
### Instructions
1. Run the first cell and provide the window for your dataset. This window of time will be used to pull the host timeseries and job accounting data from the database.
2. Run cell 2 and select the desired preprocessing options (can be more than one).
3. Run cell 3 and select the desired host data time series units to be included (can be more than one).
4. Run cell 4 and provide the desired values and options. **Note**: if units were selected in step 3, the "Save Values" button must be selected before moving on.
5. Run cell 5 and select the desired options.
6. Run cell 6 to see your data visualizations. If you would like to explore correlations among metrics and statistics, select from the provided options.
7. Run cell 7 to see the correlation visualizations.
8. Run cell 8 and download the desired data.


In [None]:
# -------------- CELL 1 --------------

from IPython.display import display, FileLink, clear_output, display
import ipywidgets as widgets
import notebook_functions as nbf
from datetime import datetime
import pandas as pd


print(r"Please provide a time window for your host dataset.")

start_time = widgets.NaiveDatetimePicker(
    value=datetime.now().replace(microsecond=0),
    placeholder='',
    description='Start Time:',
    disabled=False
)

end_time = widgets.NaiveDatetimePicker(
    value=datetime.now().replace(microsecond=0),
    placeholder='',
    description='End Time:',
    disabled=False
)

def validate_date_range(change):
    # 'change' includes information about the change event
    # including the 'owner' which is the widget itself
    if change['owner'] == start_time:
        if end_time.value and change['new'] >= end_time.value:
            print("Error: Start Time should be less than End Time")
        else:
            print("Time range is valid")
    elif change['owner'] == end_time:
        if start_time.value and change['new'] <= start_time.value:
            print("Error: End Time should be greater than Start Time")
        else:
            print("Time range is valid")

# Attach the validation function to the 'value' trait of the date picker widgets
start_time.observe(validate_date_range, 'value')
end_time.observe(validate_date_range, 'value')

display(start_time, end_time)

In [None]:
# -------------- CELL 2 --------------

print("Data preprocessing: select either option (or both):")
preprocessing = widgets.SelectMultiple(
    options=['None', 'Remove Rows with Missing Metrics?', 'Add an Interval Column?'],
    value=['None'],
    description='Options:',
    disabled=False,
)

display(preprocessing)

In [None]:
# -------------- CELL 3 --------------

# get timeseries from the DB
time_series_df = nbf.get_time_series_from_database(start_time.value.strftime('%Y-%m-%d %H:%M:%S'), end_time.value.strftime('%Y-%m-%d %H:%M:%S'))

# get the account logs from the DB
account_log_df = nbf.get_account_log_from_database(start_time.value.strftime('%Y-%m-%d %H:%M:%S'), end_time.value.strftime('%Y-%m-%d %H:%M:%S'))

# do the preprocessing
for value in preprocessing.value:
    if "Remove" in value:
        time_series_df = time_series_df.dropna()
    if "Add" in value:
        time_series_df = nbf.add_interval_column(end_time.value, time_series_df)

print("Optional: select the units to be included in the timeseries data.")
units = widgets.SelectMultiple(
    options=['None', 'CPU %', 'GPU %', 'GB:memused', 'GB:memused_minus_diskcache', 'GB/s', 'MB/s'],
    value=['None'],
    description='Units:',
    disabled=False,
)

display(units)


In [None]:
# -------------- CELL 4 --------------

unit_values = {}  # stores user low and high value user input such that: key = a unit from the units list above /// value = (low_value, high_value)

for value in units.value:
    if value != 'None':
        nbf.setup_widgets(unit_values, value)

print("Optional: provide the hosts to be included in the timeseries data e.g., 'NODE1, NODE2'")
hosts = widgets.Text(
    value='',
    placeholder='',
    description='Hosts:',
    disabled=False
)
display(hosts)

print("Optional: provide the jobs to be included in the timeseries data e.g., 'JOB1, JOB2'")
job_ids = widgets.Text(
    value='',
    placeholder='',
    description='Jobs:',
    disabled=False
)
display(job_ids)

print("Optional: select if you want the account logs to be returned for the Job IDs matching your query.")
return_account_logs = widgets.ToggleButton(
    value=False,
    description='Account Logs',
    disabled=False,
    button_style='',
    tooltip='Return Account Logs?',
    icon='check'
)
display(return_account_logs)

print("Optional: select the columns to be included in the timeseries data (hold control to select multiple). If no columns are "
      "selected, all columns will be included.")
timeseries_return_columns = widgets.SelectMultiple(
    options=['None', 'Job Id', 'Hosts', 'Events', 'Units', 'Values', 'Timestamps'],
    value=['None'],
    description='Return Columns',
    disabled=False
)
display(timeseries_return_columns)

In [None]:
unit_values

In [None]:
# -------------- CELL 5 --------------

if units.value != "None":
    time_series_df = nbf.get_timeseries_by_values_and_unit(unit_values, time_series_df)

if len(hosts.value) > 0:
    time_series_df = nbf.get_timeseries_by_hosts(hosts.value, time_series_df)
    
if len(job_ids.value) > 0:
    account_log_df = nbf.get_account_logs_by_job_ids(job_ids.value)

if return_account_logs:
    account_log_df = nbf.get_account_logs_by_job_ids(time_series_df, account_log_df)

print("Do you want to download the filtered data?")
# Create download button
download_button = widgets.Button(description="Download Data")

def on_button_clicked(b):
    # Create download link when button is clicked and display link
    display(nbf.create_download_link(time_series_df))

download_button.on_click(on_button_clicked)
display(download_button)

print("Please select which statistics to be calculated:")
stats = widgets.SelectMultiple(
    options=['Average', 'Mean', 'Median', 'Standard Deviation', 'PDF', 'CDF', 'Ratio of Data Outside Threshold'],
    value=['Mean'],
    description='Statistics',
    disabled=False
)
display(stats)

In [None]:
# -------------- CELL 6 --------------

# Display statistical data here


# Give the user the option to calculate correlations
print("If you would like to explore correlations among metrics and statistics, select from the options below:")

correlations = widgets.SelectMultiple(
    options=['None', 'CPU %', 'GPU %', 'GB(memused_minus_diskcache)', 'GB(memused)', 'GB/s', 'MB/s'],
    value=['None'],
    description='Metrics',
    disabled=False
)

stats = widgets.SelectMultiple(
    options=['Average', 'Mean', 'Median', 'Standard Deviation', 'PDF', 'CDF', 'Ratio of Data Outside Threshold'],
    value=['Mean'],
    description='Statistics',
    disabled=False
)

display(correlations, stats)


In [None]:
# -------------- CELL 7 --------------

# calculate correlations
nbf.calculate_correlation()

# Display correlation visualizations here


In [1]:
# -------------- CELL 8 ---------------

# Give the user the option to download data here.
print("Select the files to be downloaded:")
files_to_provide = widgets.SelectMultiple(
    options=['None', 'job_ts_metrics_aug2022_anon', 'job_ts_metrics_dec2022_anon',
             'job_ts_metrics_jan2022_anon', 'job_ts_metrics_july2022_anon',
             'job_ts_metrics_nov2022_anon', 'job_ts_metrics_sep2022_anon'],
    value=['None'],
    description='Files',
    disabled=False
)
display(files_to_provide)

# Create and display download button
download_button = widgets.Button(description='Download File/s')
download_button.on_click(nbf.on_download_button_clicked)
display(download_button)

Select the files to be downloaded:


NameError: name 'widgets' is not defined

In [None]:
# -------------- CELL 9 ---------------
