In [None]:
import http
import re
from collections import Counter

import pandas as pd

In [None]:
# Read the aggregated log export, print its column index and preview the
# first rows.
logs_csv_path = '../data/logs_aggregated_concurrent.csv'
df = pd.read_csv(logs_csv_path)
print(df.columns)
df.head()

In [None]:
# Show only the records that carry a log level.
df.dropna(subset=['log_level'])

In [None]:
# Distinct, non-null python_module values present in the logs.
set(df['python_module'].dropna())

In [None]:
# Count the records that have no timestamp at all.
missing_timestamps = df['@timestamp'].isna()
print(int(missing_timestamps.sum()))
# Parse the raw timestamps; the result is only displayed, not stored.
pd.to_datetime(df['@timestamp'])

In [None]:
# Columns of `df` that are used as labels for each log record.
label_columns = ['Hostname', 'log_level', 'programname', 'python_module', 'http_status', 'http_method']

# Build the working frame in one chained expression so that every step
# returns a new object. The previous version assigned columns onto a slice
# of `df`, which triggers pandas' SettingWithCopyWarning.
#
# Only the label columns are filled with '': filling the whole frame would
# also turn a missing timestamp (NaT) into '' and make the sort below fail
# on mixed Timestamp/str values.
rel_df = (
    df[label_columns]
    .fillna('')
    .assign(
        # Normalise casing so e.g. 'info' and 'INFO' become the same label.
        log_level=lambda frame: frame['log_level'].str.upper(),
        http_method=lambda frame: frame['http_method'].str.upper(),
        # Aligned on the index of `df`, which the selection above preserves.
        DateTime=pd.to_datetime(df['@timestamp']),
    )
    .sort_values(by='DateTime')
)
rel_df

In [None]:
# Human-readable phrase for every status code known to `http.HTTPStatus`.
# Keys are spelled '<code>.0' to match how http_status values are stringified
# below (presumably the column is parsed as float -- TODO confirm against the CSV).
http_descriptions = {
    str(status.value) + '.0': status.name.lower().replace('_', ' ')
    for status in http.HTTPStatus
}

# Maps each distinct (lower-cased) label value to a space-separated description.
descriptions = {}
for column in ['Hostname', 'log_level', 'programname', 'python_module', 'http_status', 'http_method']:
    # Distinct non-empty values; sorted so the dict is built in a reproducible order.
    values = sorted({str(x).lower() for x in rel_df[column].dropna() if len(str(x)) > 0})
    for value in values:
        if column == 'Hostname':
            # Split a trailing number off the host name: 'node12' -> 'node 12'.
            name = value.rstrip('0123456789')
            number = value[len(name):]
            # Join only the non-empty parts so a host without a numeric
            # suffix does not get a trailing space.
            descriptions[value] = ' '.join(part for part in (name, number) if part)
        elif column == 'http_status':
            # Accept both the '200.0' and the '200' spelling, and fall back to
            # the raw value for codes HTTPStatus does not know instead of
            # raising KeyError.
            key = value if value in http_descriptions else value + '.0'
            descriptions[value] = http_descriptions.get(key, value)
        else:
            # Break the value on ',', '.', '_' and '-' and re-join with spaces.
            descriptions[value] = ' '.join(re.split('[,._-]+', value))

descriptions

In [None]:
# One list per log record holding its non-empty label values as strings;
# the DateTime column is excluded.
raw_label_rows = rel_df.drop(columns=['DateTime']).values.tolist()
labels = [
    [text for text in map(str, row) if len(text) > 0]
    for row in raw_label_rows
]
labels_df = pd.DataFrame(data={'labels': labels})
labels_df

In [None]:
# Cut the label list into consecutive, non-overlapping windows of at most
# `max_sequence_length` records.
max_sequence_length = 5
num_datapoints = len(labels)

# Stepping the start index with range() covers every record. The previous
# while-loop stopped as soon as the window end reached the end of the data,
# so the final window was never appended (and nothing at all was produced
# when there were at most `max_sequence_length` records).
subsequences = [
    labels[start:start + max_sequence_length]
    for start in range(0, num_datapoints, max_sequence_length)
]

pd.DataFrame(data={
    'sequences': subsequences
})

In [None]:
# Distinct, non-empty program names.
program_names = [name for name in set(rel_df['programname']) if len(name) > 0]

# program_name_parts: every '-'-joined prefix of every program name
#   (a name 'a-b-c' contributes 'a', 'a-b', 'a-b-c').
# rel_program_name_parts: only the first '-'-separated part of each name.
program_name_parts = []
rel_program_name_parts = []

for program_name in program_names:
    parts = program_name.split('-')
    rel_program_name_parts.append(parts[0])
    program_name_parts.extend(
        '-'.join(parts[:depth]) for depth in range(1, len(parts) + 1)
    )

# How often each prefix occurs across the distinct program names.
Counter(program_name_parts)

In [None]:
# Build a parent/child edge list describing a hierarchy over label values.
# Edges are collected in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` no longer exists in pandas >= 2.0, and growing a
# frame row by row copies it on every iteration.
hierarchy_records = []

# Currently restricted to 'programname'; the branches below also handle
# 'Hostname', 'log_level', 'python_module', 'http_status' and 'http_method'.
for column in ['programname']:
    hierarchy_records.append({'parent': 'root', 'child': column})

    # Distinct non-empty values, lower-cased; sorted so the edge order is
    # reproducible between runs.
    values = sorted({str(x).lower() for x in rel_df[column].dropna() if len(str(x)) > 0})
    for value in values:
        hierarchy_elements = [column]
        if column == 'Hostname':
            # Group hosts by their name without the trailing number.
            hierarchy_elements.append(value.rstrip('0123456789'))
        elif column == 'http_status':
            # Group status codes by their first digit, e.g. '404.0' -> '400'.
            hierarchy_elements.append(value[0] + '00')
        else:
            hierarchy_elements = hierarchy_elements + re.split('[,._-]+', value)
        hierarchy_elements.append(value)

        # Cumulative '->'-joined paths: 'col', 'col->a', 'col->a->b', ...
        # NOTE(review): the range stops one short of the full list, so the raw
        # `value` appended above never becomes part of a path -- confirm that
        # this is intended.
        hierarchy = [
            '->'.join(hierarchy_elements[:i])
            for i in range(1, len(hierarchy_elements))
        ]

        # Link each path to the one before it; the first path equals the
        # column name itself and is skipped by the parent != child check.
        parent = column
        for child in hierarchy:
            if parent != child:
                hierarchy_records.append({'parent': parent, 'child': child})
            parent = child

# NOTE(review): values sharing a prefix emit the same edge more than once;
# duplicates are kept, as before.
hierarchy_df = pd.DataFrame(hierarchy_records, columns=['parent', 'child'])
hierarchy_df