In [1]:
#hide
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp data

# data

> Read data from the khulnasoft REST API into a pandas DataFrame.

In [3]:
# hide
from nbdev.showdoc import *

In [4]:
# hide
# export
from typing import Union
import re
import time
import asks
from asks import BasicAuth
import trio
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from khulnasoft_pandas.wrangle import drop_low_uniqueness_cols, drop_low_std_cols

In [5]:
# export


def get_chart_list(host: str = '127.0.0.1:19999', starts_with: str = None, ends_with: str = None, protocol: str = 'http',
    verify: Union[str, bool] = True) -> list:
    """Get list of all available charts on a `host`.  
    
    ##### Parameters:  
    - **host** `str` The host we want to get a list of available charts from.
    - **starts_with** `str` A string to filter the list of charts returns to just those that start with `starts_with`.
    - **ends_with** `str` A string to filter the list of charts returns to just those that end with `ends_with`.
    - **protocol** `str` 'http' or 'https'.
    - **verify** `Union[str, bool]` `verify` parameter to be set to `requests` for SSL cert verification.
    
    ##### Returns:  
    - **chart_list** `list` A list of available charts (empty if the response carries no `charts` entry).
    
    ##### Raises:  
    - `requests.HTTPError` if the host answers with a 4xx/5xx status.
    
    """    
    url = f"{protocol}://{host}/api/v1/charts"
    r = requests.get(url, verify=verify)
    # surface HTTP errors here instead of a confusing JSON/TypeError further down
    r.raise_for_status()
    # a payload without a 'charts' key gives an empty list rather than iterating None
    charts = r.json().get('charts') or {}
    chart_list = list(charts)
    if starts_with:
        chart_list = [chart for chart in chart_list if chart.startswith(starts_with)]
    if ends_with:
        chart_list = [chart for chart in chart_list if chart.endswith(ends_with)]
    return chart_list



In [6]:
# hide
# tests

# charts on the london demo site whose name begins with 'system.'
charts = get_chart_list('london.my-khulnasoft.com', starts_with='system.')

# every returned chart must sit under the 'system' prefix
assert {chart.split('.')[0] for chart in charts} == {'system'}

# charts on the london demo site whose name ends with '.cpu'
charts = get_chart_list('london.my-khulnasoft.com', ends_with='.cpu')

# every returned chart must have 'cpu' as its last name component
assert {chart.split('.')[-1] for chart in charts} == {'cpu'}

In [7]:
# export


async def get_chart(api_call: tuple, data: list, col_sep: str ='|', numeric_only: bool = True, float_size: str = 'float64',
                    host_prefix: bool = False, host_sep: str = ':'):
    """Get data for an individual chart and append it, as a dataframe, to `data`.
    
    The HTTP request itself is not guarded; only decoding/reshaping of the response is
    best-effort: a failure there is reported via `print` and the chart is skipped.
    
    ##### Parameters:  
    - **api_call** `tuple` A tuple of (`url`, `chart`, `host`, `user`, `pwd`) describing the call to make.
    - **data** `list` A list for dataframes for each chart to be appended to.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    
    """
    url, chart, host, user, pwd = api_call
    # basic auth is only used when both credentials are supplied
    if user and pwd:
        r = await asks.get(url, auth=BasicAuth((user, pwd)))
    else:
        r = await asks.get(url)
    try:
        r_json = r.json()
        # first label is the time column; rename it so every chart shares the same key
        df = pd.DataFrame(r_json['data'], columns=['time_idx'] + r_json['labels'][1:])
        if host_prefix:
            df = df.set_index(['time_idx']).add_prefix(f'{host}{host_sep}{chart}{col_sep}')
        else:
            df['host'] = host
            df = df.set_index(['host','time_idx']).add_prefix(f'{chart}{col_sep}')
        if numeric_only:
            df = df._get_numeric_data().astype(float_size)
        data.append(df)
    except Exception as e:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit and other
        # BaseExceptions still propagate; report the cause instead of hiding it
        print(f'error found on data from: {url} ({type(e).__name__}: {e})')


In [8]:
# export

async def get_charts(api_calls: list, col_sep: str ='|', timeout: int = 60, numeric_only: bool = True, float_size: str = 'float64',
                     host_prefix: bool = False, host_sep: str = ':') -> pd.DataFrame:
    """Open a trio nursery that fetches every chart concurrently, then combine the results.
    
    ##### Parameters:  
    - **api_calls** `list` A list of tuples (`url`, `chart`, `host`, `user`, `pwd`), one per api call to make.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **timeout** `int` The number of seconds for trio to [move_on_after](https://trio.readthedocs.io/en/stable/reference-core.html#trio.move_on_after).
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    
    ##### Returns:  
    - **df** `pd.DataFrame` Charts joined side by side (single host or host-prefixed columns), otherwise stacked row-wise.
    
    """
    distinct_hosts = {call[2] for call in api_calls}
    frames = []
    with trio.move_on_after(timeout):
        async with trio.open_nursery() as nursery:
            for call in api_calls:
                nursery.start_soon(get_chart, call, frames, col_sep, numeric_only, float_size, host_prefix, host_sep)
    # columns are unique per chart when there is one host or the host is part of the name,
    # so join on columns; otherwise hosts share column names and are stacked as rows
    concat_axis = 1 if (len(distinct_hosts) == 1 or host_prefix) else 0
    return pd.concat(frames, join='outer', axis=concat_axis, sort=True)



In [9]:
# export


def get_data(hosts: list = ['london.my-khulnasoft.com'], charts: list = ['system.cpu'], after: int = -60, 
             before: int = 0, points: int = 0, col_sep: str = '|', numeric_only: bool = True,
             ffill: bool = True, diff: bool = False, timeout: int = 60, nunique_thold = None, 
             std_thold: float = None, index_as_datetime: bool = False, freq: str = 'infer', 
             group: str = 'average', sort_cols: bool = True, user: str = None, pwd: str = None, 
             protocol: str = 'http', sort_rows: bool = True, float_size: str = 'float64',
             host_charts_dict: dict = None, host_prefix: bool = False, host_sep: str = ':',
             charts_regex: str = None, verify: Union[str, bool] = True, dimensions: str = '*', 
             options: str = '') -> pd.DataFrame:
    """Define api calls to make and any post processing to be done.
    
    The (host, chart) pairs to pull are chosen, in order of precedence, from
    `host_charts_dict`, `charts_regex`, `charts == ['all']`, or `hosts` x `charts`.
    
    ##### Parameters:  
    - **hosts** `list` A list of hosts to pull data from (a single host string is also accepted).
    - **charts** `list` A list of charts to pull data for, or `['all']` for every chart each host reports.
    - **after** `int` The timestamp or relative integer from which to pull data after.
    - **before** `int` The timestamp or relative integer from which to pull data before.
    - **points** `int` The `points` parameter to pass to the api call if need to aggregate data in some way.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **ffill** `bool` Set to true if you want to forward fill any null or missing values.
    - **diff** `bool` Set to true if you want to get the difference of metrics as opposed to their raw value.
    - **timeout** `int` The number of seconds for trio to [move_on_after](https://trio.readthedocs.io/en/stable/reference-core.html#trio.move_on_after).
    - **nunique_thold** [`float`,`int`] If defined calls function to filter cols with low number of unique values.
    - **std_thold** `float` If defined calls function to filter cols with low standard deviation.
    - **index_as_datetime** `bool` If true, set the index to be a pandas datetime.
    - **freq** `str` Freq to be passed to pandas datetime index; values ending in 's', 'm' or 'h' also override `points`.
    - **group** `str` The grouping function to use in the khulnasoft api call.
    - **sort_cols** `bool` True to sort columns by name.
    - **user** `str` A username to use if khulnasoft is password protected.
    - **pwd** `str` A password to use if khulnasoft is password protected.
    - **protocol** `str` 'http' or 'https'.
    - **sort_rows** `bool` True to sort rows by index.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_charts_dict** `dict` dictionary of hosts to pull for where each value is list of relevant charts to pull from that host.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    - **charts_regex** `str` A regex expression for charts you want data for.
    - **verify** `Union[str, bool]` `verify` parameter to be set to `requests` for SSL cert verification.
    - **dimensions** `str` The `dimensions` parameter to pass to the api call, defaults to '*' for all dimensions.
    - **options** `str` The `options` parameter to pass to the api call, defaults to '' to just accept defaults.
        
    ##### Returns:  
    - **df** `pd.DataFrame` A pandas dataframe with all chart data outer joined based on time index and any post processing done.
    
    """
    # NOTE: the list defaults are never mutated below, only rebound, so sharing them is safe

    # if hosts is a string make it a list of one
    if isinstance(hosts, str):
        hosts = [hosts]
    
    # get list of host chart tuples we need to get data for
    if host_charts_dict:
        host_charts = [(k, v) for k in host_charts_dict for v in host_charts_dict[k]]
        hosts = list(set(host_charts_dict.keys()))
    elif charts_regex:
        charts_regex = re.compile(charts_regex)
        # chart discovery must use the same protocol as the data calls
        host_charts = [
            (host, chart)
            for host in hosts
            for chart in filter(charts_regex.match, get_chart_list(host, protocol=protocol, verify=verify))
        ]
    elif charts == ['all']:
        host_charts = [
            (host, chart)
            for host in hosts
            for chart in get_chart_list(host, protocol=protocol, verify=verify)
        ]
    else:
        host_charts = [(host, chart) for host in hosts for chart in charts]
        
    # define points based on freq if given
    window_length = before - after
    if freq != 'infer':
        if freq.endswith('s'):
            points = int(window_length / int(freq.replace('s','')))
        elif freq.endswith('m'):
            points = int(window_length / (int(freq.replace('m','')) * 60))
        elif freq.endswith('h'):
            points = int(window_length / (int(freq.replace('h','')) * 60 * 60))
    
    # define list of all api calls to be made: (url, chart, host, user, pwd)
    api_calls = [
        (
            f'{protocol}://{host_chart[0]}/api/v1/data?chart={host_chart[1]}&after={after}&before={before}&points={points}&format=json&group={group}&dimensions={dimensions}&options={options}', 
            host_chart[1], 
            host_chart[0], 
            user, 
            pwd
        )
        for host_chart in host_charts
    ] 
    # get the data
    df = trio.run(get_charts, api_calls, col_sep, timeout, numeric_only, float_size, host_prefix, host_sep)
    # post process the data
    if host_prefix:
        # host lives in the column names, so the index is `time_idx` only and must be kept
        df = df.groupby(by=['time_idx']).max()
    else:
        df = df.groupby(by=['host','time_idx']).max()
        # with a single host the host index level carries no information, drop it
        if len(hosts) == 1:
            df = df.reset_index(level=0, drop=True)
    if sort_rows:
        df = df.sort_index()
    if ffill:
        df = df.ffill()
    if diff:
        df = df.diff().dropna(how='all')
    if nunique_thold:
        df = drop_low_uniqueness_cols(df, nunique_thold)
    if std_thold:
        df = drop_low_std_cols(df, std_thold)
    if index_as_datetime:
        df = df.set_index(pd.DatetimeIndex(pd.to_datetime(df.index, unit='s'), freq=freq))
    if sort_cols:
        df = df.reindex(sorted(df.columns), axis=1)
    return df



In [10]:
# hide
#%timeit -r1 -n2 get_data('newyork.my-khulnasoft.com', ['all'], after=-60, before=0)

In [11]:
# hide
%timeit -r1 -n2 get_data('london.my-khulnasoft.com', ['system.cpu', 'system.load'], after=-60, before=0)

  class ExceptionGroup(BaseExceptionGroup, trio.MultiError):


258 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 2 loops each)


In [12]:
# hide
# tests
# examples

# each host contributes its own set of charts; columns are prefixed with the host name
host_charts_dict = {
    "london.my-khulnasoft.com" : ['system.io','system.ip'],
    "newyork.my-khulnasoft.com" : ['system.io','system.net'],
}
# dimensions each requested chart is expected to expose
expected_dims = {
    'system.io': ['in', 'out'],
    'system.ip': ['received', 'sent'],
    'system.net': ['received', 'sent'],
}
expected_cols = [
    f'{host}:{chart}|{dim}'
    for host, host_chart_names in host_charts_dict.items()
    for chart in host_chart_names
    for dim in expected_dims[chart]
]
df = get_data(host_charts_dict=host_charts_dict, host_prefix=True)
print(df.shape)
# roughly one row per second of the default 60 second window
assert 59 <= len(df) <= 65
assert len(df.columns) == 8
assert set(df.columns) == set(expected_cols)
df.head()

(60, 8)


Unnamed: 0_level_0,london.my-khulnasoft.com:system.io|in,london.my-khulnasoft.com:system.io|out,london.my-khulnasoft.com:system.ip|received,london.my-khulnasoft.com:system.ip|sent,newyork.my-khulnasoft.com:system.io|in,newyork.my-khulnasoft.com:system.io|out,newyork.my-khulnasoft.com:system.net|received,newyork.my-khulnasoft.com:system.net|sent
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1681833352,0.0,0.0,661.606,-1042.9999,0.0,-100.67192,734.2613,-133.41216
1681833353,0.0,-37.83092,404.1322,-609.2885,0.0,-150.97168,719.4314,-139.5932
1681833354,0.0,-54.93924,715.0553,-1131.3562,0.0,-48.35639,201.5743,-328.947
1681833355,327.7204,-36.87633,1691.915,-1276.2466,0.0,-16.390833,35.21111,-457.2971
1681833356,192.2796,-10.35352,1431.6574,-1506.7408,0.0,-7.609167,22.07439,-281.6784


In [13]:
# hide
# tests
# examples

# raw string: `\.` is a regex escape, not a Python string escape (a plain string literal
# triggers an invalid-escape-sequence warning on recent Python versions)
df = get_data('london.my-khulnasoft.com', charts_regex=r'system|apps|users|services\..*', after=-60, before=0, nunique_thold=0.05)
print(df.shape)
assert len(df) in [59, 60, 61, 62, 63, 64, 65, 66, 67]
assert len(df.columns) > 200
assert len(df.columns) < 300
df.head()

(62, 221)


Unnamed: 0_level_0,apps.cpu_system|apps.plugin,apps.cpu_system|charts.d.plugin,apps.cpu_system|httpd,apps.cpu_system|kernel,apps.cpu_system|khulnasoft,apps.cpu_system|python.d.plugin,apps.cpu_system|system,apps.cpu_system|tc-qos-helper,apps.cpu_system|vpn,apps.cpu_user|apps.plugin,...,users.uptime|do-agent,users.uptime|messagebus,users.uptime|mysql,users.uptime|khulnasoft,users.uptime|ntp,users.uptime|postfix,users.uptime|root,users.uptime|ulog,users.uptime|unscd,users.uptime|www-data
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1681833350,,,,,,,,,,,...,,,,,,,,,,
1681833352,1.0002,0.0,1.0003,0.0,1.0002,0.0,0.0,0.0,0.0,0.0,...,3032054.0,3032054.0,3032053.0,2401807.0,3032053.0,3032053.0,3032056.0,3032054.0,3032054.0,3032053.0
1681833353,0.0,0.0,0.0,0.0,0.9997,0.0,0.0,0.0,0.0,0.0,...,3032055.0,3032055.0,3032054.0,2401808.0,3032054.0,3032054.0,3032057.0,3032055.0,3032055.0,3032054.0
1681833354,0.0,0.0,0.0,0.0,0.6243,0.0,0.0,0.6244,0.0,0.6243,...,3032056.0,3032056.0,3032055.0,2401809.0,3032055.0,3032055.0,3032058.0,3032056.0,3032056.0,3032055.0
1681833355,0.0,0.0034,1.0003,0.0,2.0002,0.0,0.0,0.0,1.0003,0.0,...,3032057.0,3032057.0,3032056.0,2401810.0,3032056.0,3032056.0,3032059.0,3032057.0,3032057.0,3032056.0


In [14]:
# hide
# tests

# `freq='10s'` over a 60 second window should aggregate down to roughly six rows

df = get_data('london.my-khulnasoft.com', charts=['system.cpu'], after=-60, before=0, freq='10s')
print(df.shape)
assert 5 <= len(df) <= 7
df.head()

(6, 10)


Unnamed: 0_level_0,system.cpu|guest,system.cpu|guest_nice,system.cpu|idle,system.cpu|iowait,system.cpu|irq,system.cpu|nice,system.cpu|softirq,system.cpu|steal,system.cpu|system,system.cpu|user
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1681833360,0.0,0.0,98.466812,0.025189,0.0,0.0,0.075505,0.075251,0.578402,0.77884
1681833370,0.0,0.0,98.644471,0.0,0.0,0.0,0.075377,0.075126,0.577513,0.627513
1681833380,0.0,0.0,98.520094,0.0,0.0,0.0,0.075126,0.050063,0.6772,0.677516
1681833390,0.0,0.0,98.719393,0.0,0.0,0.0,0.050315,0.050127,0.47733,0.702834
1681833400,0.0,0.0,98.668765,0.025063,0.0,0.0,0.075378,0.025063,0.577896,0.627834


In [15]:
# hide
# tests

# get some test data
test_host = 'london.my-khulnasoft.com'
test_charts = ['system.cpu', 'system.load']
df = get_data(test_host, test_charts, after=-60, before=0, col_sep='|')

# look for some expected columns
assert 'system.load|load1' in df.columns
assert 'system.cpu|user' in df.columns
# check expected shape of data
# (the previous `== '(60, 12)' or '(61, 12)'` was always true because a non-empty string is truthy;
#  13 columns = 10 system.cpu dimensions + 3 system.load dimensions, row tolerance as in the other tests)
assert 59 <= df.shape[0] <= 65
assert df.shape[1] == 13
# check that all types are float64 or int64
assert all(str(dtype) in ('int64', 'float64') for dtype in df.dtypes)

# test index as datetime
df = get_data('london.my-khulnasoft.com', ['system.cpu'], index_as_datetime=True)
assert isinstance(df.index, pd.core.indexes.datetimes.DatetimeIndex)
assert isinstance(df.index.freq, pd.tseries.offsets.Second)

# test multiple hosts and charts
df = get_data(['london.my-khulnasoft.com', 'newyork.my-khulnasoft.com'], ['system.cpu', 'system.load'], after=-60, before=0)
assert df.shape[0] in [120, 121, 122]
assert df.shape[1] == 13
assert set(df.columns) == set(['system.cpu|guest', 'system.cpu|guest_nice', 'system.cpu|iowait', 'system.cpu|irq', 'system.cpu|nice', 'system.cpu|softirq', 'system.cpu|steal', 'system.cpu|system', 'system.cpu|user', 'system.cpu|idle', 'system.load|load1', 'system.load|load15', 'system.load|load5'])

# test memory savings from float 32
df64 = get_data('london.my-khulnasoft.com', test_charts, after=-600, before=0)
df32 = get_data('london.my-khulnasoft.com', test_charts, after=-600, before=0, float_size='float32')
assert df32.memory_usage('deep').sum() < df64.memory_usage('deep').sum()

# test options and dimensions params
df = get_data('london.my-khulnasoft.com', charts=['system.net'], after=-600, before=0, options='abs', dimensions='sent')
assert min(df.min()) >= 0

In [16]:
# export


def get_alarm_log(host: str = '127.0.0.1:19999', datetimes: bool = True, user: str = None, 
                  pwd: str = None, protocol: str = 'http', include_children: bool = False) -> pd.DataFrame:
    """Get alarm log from `host`.  
    
    ##### Parameters:  
    - **host** `str` The host we want to get the alarm log from.
    - **datetimes** `bool` True to convert the `when` and `delay_up_to_timestamp` columns from epoch seconds to datetimes.
    - **user** `str` A username to use if khulnasoft is password protected.
    - **pwd** `str` A password to use if khulnasoft is password protected.
    - **protocol** `str` 'http' or 'https'.
    - **include_children** `bool` 'True' to include alarm log for all children streamed to host.
    
    ##### Returns:  
    - **df** `pd.DataFrame` A df of the alarm_log.
    
    """
    # basic auth is only used when both credentials are supplied; it is applied to every
    # request, including the `/api/v1/info` lookup used to discover children
    auth = HTTPBasicAuth(user, pwd) if user and pwd else None

    def get_json(url):
        return requests.get(url, auth=auth).json()

    def get_alarm_log_df(child=None):
        if child:
            url = f"{protocol}://{host}/host/{child}/api/v1/alarm_log"
        else:
            url = f"{protocol}://{host}/api/v1/alarm_log"
        return pd.DataFrame(get_json(url))

    def get_children():
        mirrored_hosts = get_json(f"{protocol}://{host}/api/v1/info")['mirrored_hosts']
        # the first entry is skipped, as before — presumably it is the parent itself (TODO confirm)
        return mirrored_hosts[1:]
    
    frames = [get_alarm_log_df()]
    if include_children:
        frames.extend(get_alarm_log_df(child) for child in get_children())
    # `DataFrame.append` no longer exists in pandas 2.x; concat keeps each frame's own index like append did
    df = pd.concat(frames) if len(frames) > 1 else frames[0]
    if datetimes:
        for col in ['when', 'delay_up_to_timestamp']:
            # an empty alarm log has no columns at all, so only convert what is present
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], unit='s')
    return df



In [17]:
# hide
# tests 

df = get_alarm_log('london.my-khulnasoft.com')

# columns the alarm log is expected to expose, in order
expected_cols = [
    'hostname', 'utc_offset', 'timezone', 'unique_id', 'alarm_id',
    'alarm_event_id', 'config_hash_id', 'name', 'chart', 'context',
    'family', 'class', 'component', 'type', 'processed', 'updated',
    'exec_run', 'exec_failed', 'exec', 'recipient', 'exec_code', 'source',
    'command', 'units', 'when', 'duration', 'non_clear_duration', 'status',
    'old_status', 'delay', 'delay_up_to_timestamp', 'updated_by_id',
    'updates_id', 'value_string', 'old_value_string', 'last_repeat',
    'silenced', 'info', 'value', 'old_value',
]

assert df.columns.tolist() == expected_cols
assert len(df) > 0

In [18]:
# hide
# tests

# grouping sanity check: aggregates returned by the api should be close to the
# same aggregates computed locally from the raw data

test_host = 'london.my-khulnasoft.com'
test_charts = ['system.load']
window = dict(after=-100, before=0, col_sep='|')

# raw data for the window
df_last100 = get_data(test_host, test_charts, **window)

# single-point aggregates computed by the api itself
df_avg = get_data(test_host, test_charts, points=1, group='average', **window)
df_std = get_data(test_host, test_charts, points=1, group='stddev', **window)
df_min = get_data(test_host, test_charts, points=1, group='min', **window)
df_max = get_data(test_host, test_charts, points=1, group='max', **window)

# the same aggregates computed by hand
df_last100_avg = df_last100.mean()
df_last100_std = df_last100.std()
df_last100_min = df_last100.min()
df_last100_max = df_last100.max()

# absolute gap between api and hand computed values, to 2 decimals
avg_diffs = (df_avg - df_last100_avg).abs().round(2)
std_diffs = (df_std - df_last100_std).abs().round(2)
min_diffs = (df_min - df_last100_min).abs().round(2)
max_diffs = (df_max - df_last100_max).abs().round(2)

# every one of the three load dimensions must be within tolerance
tolerance = 0.5
all_within = [[True, True, True]]
assert (avg_diffs <= tolerance).values.tolist() == all_within
assert (std_diffs <= tolerance).values.tolist() == all_within
assert (min_diffs <= tolerance).values.tolist() == all_within
assert (max_diffs <= tolerance).values.tolist() == all_within

In [19]:
# export


def get_allmetrics(host='london.my-khulnasoft.com', charts: list = None, wide: bool = False, col_sep: str = '|', sort_cols: bool = True,
                   user: str = None, pwd: str = None, protocol: str = 'http', numeric_only: bool = True, 
                   float_size: str = 'float64', host_charts_dict: dict = None, host_prefix: bool = False, 
                   host_sep: str = ':') -> pd.DataFrame:
    """Pull the latest value of every dimension from the allmetrics endpoint into a df.  
    
    ##### Parameters:  
    - **host** `str` The host to query when `host_charts_dict` is not given.
    - **charts** `list` A list of charts to keep, `None` keeps every chart the host returns.
    - **wide** `bool` True if you want to return the data in wide format as opposed to long.
    - **col_sep** `str` A character for separating chart and dimension in the dimension names.
    - **sort_cols** `bool` True to sort columns by name (wide format only).
    - **user** `str` A username to use if khulnasoft is password protected.
    - **pwd** `str` A password to use if khulnasoft is password protected.
    - **protocol** `str` 'http' or 'https'.
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data (wide format only).
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_charts_dict** `dict` dictionary of hosts to pull for where each value is list of relevant charts to pull from that host.
    - **host_prefix** `bool` True to prefix chart and dimension names with the corresponding host.
    - **host_sep** `str` A character for separating host from chart in prefixed names.
    
    ##### Returns:  
    - **df** `pd.DataFrame` Long format: one row per dimension with columns `time`, `chart`, `dimension`, `value`. Wide format: one column per dimension.
    
    """
    # a single host/charts pair is just a one-entry dictionary
    if not host_charts_dict:
        host_charts_dict = {host: charts}

    rows = []
    for host, charts in host_charts_dict.items():
        url = f'{protocol}://{host}/api/v1/allmetrics?format=json'
        # basic auth only when both credentials are supplied
        request_kwargs = {'auth': HTTPBasicAuth(user, pwd)} if user and pwd else {}
        raw_data = requests.get(url, **request_kwargs).json()
        if charts is None:
            charts = list(raw_data.keys())
        for chart_name, chart_payload in raw_data.items():
            if chart_name not in charts:
                continue
            last_updated = chart_payload['last_updated']
            dims = chart_payload['dimensions']
            chart_label = f"{host}{host_sep}{chart_name}" if host_prefix else chart_name
            for dim in dims.values():
                # [time, chart, name, value]
                rows.append([last_updated, chart_label, f"{chart_label}{col_sep}{dim['name']}", dim['value']])

    df = pd.DataFrame(rows, columns=['time','chart','dimension','value'])
    if not wide:
        return df

    # wide format: average per dimension, then pivot dimensions into columns
    per_dimension = df[['dimension', 'value']].groupby('dimension').mean().reset_index()
    df = per_dimension.pivot_table(columns=['dimension'])
    if sort_cols:
        df = df.reindex(sorted(df.columns), axis=1)
    if numeric_only:
        df = df._get_numeric_data().astype(float_size)
    return df



In [20]:
# hide
# tests

# long format with host-prefixed names and a custom host separator
host_charts_dict = {'london.my-khulnasoft.com': ['system.net', 'system.ip']}
df = get_allmetrics(host_charts_dict=host_charts_dict, host_prefix=True, host_sep='::')

print(df.shape)
assert df.shape == (4,4)
expected_dimensions = {
    'london.my-khulnasoft.com::system.ip|received',
    'london.my-khulnasoft.com::system.ip|sent',
    'london.my-khulnasoft.com::system.net|received',
    'london.my-khulnasoft.com::system.net|sent',
}
assert set(df['dimension'].values) == expected_dimensions
df.head()

(4, 4)


Unnamed: 0,time,chart,dimension,value
0,1681833416,london.my-khulnasoft.com::system.ip,london.my-khulnasoft.com::system.ip|received,1770.453762
1,1681833416,london.my-khulnasoft.com::system.ip,london.my-khulnasoft.com::system.ip|sent,-2064.424975
2,1681833416,london.my-khulnasoft.com::system.net,london.my-khulnasoft.com::system.net|received,832.271059
3,1681833416,london.my-khulnasoft.com::system.net,london.my-khulnasoft.com::system.net|sent,-1022.88922


In [21]:
# hide
# tests

# no chart filter: every chart the host reports comes back in long format
host = 'london.my-khulnasoft.com'
df = get_allmetrics(host)

print(df.shape)
assert len(df) >= 1800
assert df.columns.tolist() == ['time','chart','dimension','value']
assert 'system.cpu' in df.chart.unique().tolist()
df.head()

(3654, 4)


Unnamed: 0,time,chart,dimension,value
0,1681833419,system.idlejitter,system.idlejitter|min,70.0
1,1681833419,system.idlejitter,system.idlejitter|max,249.0
2,1681833419,system.idlejitter,system.idlejitter|average,121.0
3,1681833419,khulnasoft.statsd_metrics,khulnasoft.statsd_metrics|gauges,0.0
4,1681833419,khulnasoft.statsd_metrics,khulnasoft.statsd_metrics|counters,0.0


In [22]:
# export


async def _get_allmetrics_async_single(api_call: str, data: list, col_sep: str ='|', numeric_only: bool = True, float_size: str = 'float64',
                    host_prefix: bool = False, host_sep: str = ':', wide: bool = False, sort_cols: bool = True):
    """Get all metrics for individual host.
    
    ##### Parameters:  
    - **api_call** `tuple` A tuple of (`url`,`chart`) for the url to pull data from and chart it represents.
    - **data** `list` A list for dataframes for each chart to be appended to.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    
    """
    url, host, charts, user, pwd = api_call
    if user and pwd:
        user_pwd = (user, pwd)
        r = await asks.get(url, auth=BasicAuth(user_pwd))
    else:
        r = await asks.get(url)
    raw_data = r.json()
    if charts is None:
        charts = list(raw_data.keys())
    tmp_data = []
    for k in raw_data:
        if k in charts:
            time = raw_data[k]['last_updated']
            dimensions = raw_data[k]['dimensions']
            for dimension in dimensions:
                # [time, chart, name, value]
                if host_prefix:
                    tmp_data.append(
                        [time, f"{host}{host_sep}{k}", f"{host}{host_sep}{k}{col_sep}{dimensions[dimension]['name']}", dimensions[dimension]['value']]
                    )
                else:
                    tmp_data.append(
                        [time, k, "{}{}{}".format(k, col_sep, dimensions[dimension]['name']), dimensions[dimension]['value']]
                    )    
    df = pd.DataFrame(tmp_data, columns=['time','chart','dimension','value'])
    df['host'] = host
    if wide:
        df = df[['dimension', 'value']].groupby('dimension').mean().reset_index().pivot_table(columns=['dimension'])
        if sort_cols:
            df = df.reindex(sorted(df.columns), axis=1)
        if numeric_only:
            df = df._get_numeric_data().astype(float_size)

    data.append(df)

    

In [23]:
# export


async def _get_allmetrics_async_runner(api_calls: list, col_sep: str ='|', timeout: int = 60, numeric_only: bool = True, float_size: str = 'float64',
                     host_prefix: bool = False, host_sep: str = ':', wide: bool = False, sort_cols: bool = True) -> pd.DataFrame:
    """Create a nursey to make seperate async calls to get each chart.
    
    ##### Parameters:  
    - **api_calls** `list` A list of tuple's of [(`url`,`chart`),...] of api calls that need to be made.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **timeout** `int` The number of seconds for trio to [move_on_after](https://trio.readthedocs.io/en/stable/reference-core.html#trio.move_on_after).
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    
    ##### Returns:  
    - **df** `pd.DataFrame` A pandas dataframe with all chart data outer joined based on time index.
    
    """
    n_hosts = len(set([x[1] for x in api_calls]))
    data = []
    with trio.move_on_after(timeout):
        async with trio.open_nursery() as nursery:
            for api_call in api_calls:
                nursery.start_soon(_get_allmetrics_async_single, api_call, data, col_sep, numeric_only, float_size, host_prefix, host_sep, wide, sort_cols)
    if n_hosts == 1:
        df = pd.concat(data, join='outer', axis=1, sort=True)
    else:
        df = pd.concat(data, join='outer', axis=0, sort=True)
    return df



In [24]:
# export


def get_allmetrics_async(host_charts_dict: dict = None, col_sep: str = '|', numeric_only: bool = True,
                   timeout: int = 60, index_as_datetime: bool = False, freq: str = 'infer', sort_cols: bool = True, user: str = None, 
                   pwd: str = None, protocol: str = 'http', float_size: str = 'float64',
                   host_prefix: bool = False, host_sep: str = ':', wide: bool = False) -> pd.DataFrame:
    """Define api calls to make and any post processing to be done.
    
    ##### Parameters:  
    - **host_charts_dict** `dict` dictionary of hosts to pull for where each value is list of relevant charts to pull from that host (`None` as a value keeps every chart). Required.
    - **col_sep** `str` A character for separating chart and dimension in column names of dataframe.
    - **numeric_only** `bool` Set to true if you want to filter out any non numeric data.
    - **timeout** `int` The number of seconds for trio to [move_on_after](https://trio.readthedocs.io/en/stable/reference-core.html#trio.move_on_after).
    - **index_as_datetime** `bool` If true, index the result by `time_idx`, set to the current epoch second at call time.
    - **freq** `str` Currently unused by this function.
    - **sort_cols** `bool` True to sort columns by name.
    - **user** `str` A username to use if khulnasoft is password protected.
    - **pwd** `str` A password to use if khulnasoft is password protected.
    - **protocol** `str` 'http' or 'https'.
    - **float_size** `str` float size to use if would like to save some memory, eg can use 'float32' or 'float16'.
    - **host_prefix** `bool` True to prefix each colname with the corresponding host.
    - **host_sep** `str` A character for separating host and chart and dimensions in column names of dataframe.
    - **wide** `bool` True to return one column per dimension instead of one row per dimension.
        
    ##### Returns:  
    - **df** `pd.DataFrame` Wide: a single row with one column per dimension. Long: one row per host/dimension.
    
    ##### Raises:  
    - `TypeError` if `host_charts_dict` is not provided.
    
    """
    if host_charts_dict is None:
        # same exception type as iterating None would give, but with an actionable message
        raise TypeError('host_charts_dict is required, e.g. {"host:19999": ["system.cpu"]} (use None as value for all charts)')
    
    # define list of all api calls to be made: (url, host, charts, user, pwd)
    api_calls = [
        (f'{protocol}://{host}/api/v1/allmetrics?format=json', host, host_charts_dict[host], user, pwd)
        for host in host_charts_dict
    ]
    # get the data
    df = trio.run(_get_allmetrics_async_runner, api_calls, col_sep, timeout, numeric_only, float_size, host_prefix, host_sep, wide, sort_cols)
    if wide:
        # every host contributes one row sharing the same index label; collapse them into a single row
        df = df.groupby(by=df.index).max()
    else:
        # long format rows from different hosts share positional indices; collapsing them
        # would merge unrelated rows, so just renumber instead
        df = df.reset_index(drop=True)
    if index_as_datetime:
        df['time_idx'] = int(time.time())
        df = df.set_index('time_idx')
    if sort_cols:
        df = df.reindex(sorted(df.columns), axis=1)
    return df



In [25]:
# hide
# tests

# two hosts with different chart selections, returned as a single wide row
host_charts_dict = {
    'london.my-khulnasoft.com': ['system.net', 'system.ip'],
    'newyork.my-khulnasoft.com': ['system.cpu'],
}
df = get_allmetrics_async(
    host_charts_dict=host_charts_dict, host_prefix=True, host_sep='::', wide=True, index_as_datetime=True
)

print(df.shape)
assert df.shape == (1,14)
df.head()

(1, 14)


dimension,london.my-khulnasoft.com::system.ip|received,london.my-khulnasoft.com::system.ip|sent,london.my-khulnasoft.com::system.net|received,london.my-khulnasoft.com::system.net|sent,newyork.my-khulnasoft.com::system.cpu|guest,newyork.my-khulnasoft.com::system.cpu|guest_nice,newyork.my-khulnasoft.com::system.cpu|idle,newyork.my-khulnasoft.com::system.cpu|iowait,newyork.my-khulnasoft.com::system.cpu|irq,newyork.my-khulnasoft.com::system.cpu|nice,newyork.my-khulnasoft.com::system.cpu|softirq,newyork.my-khulnasoft.com::system.cpu|steal,newyork.my-khulnasoft.com::system.cpu|system,newyork.my-khulnasoft.com::system.cpu|user
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1681833423,409.192473,-637.465884,72.091176,-280.63187,0.0,0.0,96.969697,0.0,0.0,2.020202,0.0,0.0,1.010101,0.0


In [27]:
# hide
# tests

# a `charts_regex` of 'system.*' must only yield system charts
df = get_data('london.my-khulnasoft.com', charts_regex='system.*', after=-60, before=0)
assert all(c.startswith('system.') for c in df.columns)

#df = get_data('london.my-khulnasoft.com', charts_regex='.*', after=-60, before=0)
#assert len(set([c.split('|')[0] for c in df.columns])) == len(get_chart_list('london.my-khulnasoft.com'))

# the same regex applied across two hosts: rows are stacked per host
df = get_data(['london.my-khulnasoft.com', 'newyork.my-khulnasoft.com'], charts_regex='system.cpu.*|system.load.*', after=-60, before=0)
n_rows, n_cols = df.shape
assert 118 <= n_rows <= 123
assert 13 <= n_cols <= 17
matched_charts = {c.split('|')[0] for c in df.columns}
assert matched_charts == {'system.cpu','system.cpu_some_pressure','system.cpu_some_pressure_stall_time','system.load'}