# Clustering utilities

This notebook contains functions for visualization and utilities needed for clustering

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.cluster import OPTICS
from scipy import stats
import seaborn as sns
import itertools
import statistics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import collections
from collections import Counter
from scipy.stats import pearsonr

## Data preprocessing

Functions used for preprocessing and matrix operations

*prepare_tlist_for_vectorizer* takes a list of Training objects and transforms it into a list of dictionaries with command counts, which is the input format that DictVectorizer requires

In [None]:
def prepare_tlist_for_vectorizer(tlist, no_commands = False):
  """Build one feature dictionary per training, ready for DictVectorizer.

  Args:
    tlist: iterable of Training objects exposing ``get_command_count()``.
    no_commands: when truthy, each dictionary is whatever
      ``training.get_command_count()`` returns; when falsy (the default),
      each dictionary starts out empty.
      NOTE(review): the flag name suggests the opposite meaning -- confirm
      against the callers before changing it.

  Returns:
    A list with one dictionary per training, in input order.
  """
  return [
      training.get_command_count() if no_commands else {}
      for training in tlist
  ]

Adds selected features for each training in a list of dictionaries

In [None]:
def add_training_features(prepared_list, training_list):
  """Copy selected Training attributes into the matching feature dictionaries.

  The dictionaries in ``prepared_list`` are updated in place and nothing is
  returned. Entries are paired positionally with ``zip``, so any surplus
  items in the longer of the two lists are ignored.

  Args:
    prepared_list: list of feature dictionaries, one per training.
    training_list: list of Training objects providing the attributes.
  """
  feature_names = (
      'total_time',
      'total_cmd',
      'average_diff',
      'delay_count',
      'percent_cmd',
  )
  for features, training in zip(prepared_list, training_list):
    for feature_name in feature_names:
      features[feature_name] = getattr(training, feature_name)

Uses DictVectorizer to create feature matrix

In [None]:
def get_feature_matrix(training_list):
  """Vectorize the per-training feature dictionaries into a dense matrix.

  Args:
    training_list: list of feature dictionaries, one per training.

  Returns:
    The result of ``DictVectorizer(sparse=False).fit_transform`` applied to
    ``training_list``.
  """
  return DictVectorizer(sparse=False).fit_transform(training_list)

Scales Feature matrix

In [None]:
def max_abs_scaling(feature_matrix):
  """Scale the feature matrix with a MaxAbsScaler fitted on that same matrix.

  Args:
    feature_matrix: feature matrix to fit the scaler on and to transform.

  Returns:
    The transformed feature matrix.
  """
  fitted_scaler = MaxAbsScaler().fit(feature_matrix)
  return fitted_scaler.transform(feature_matrix)

## Clustering and cluster extraction

Functions that apply clustering algorithm and return clustered data

Applies the OPTICS clustering algorithm to the feature matrix; the given parameters are used as the minimal size of a cluster and as the similarity measure

In [None]:
def cluster(feature_matrix, min_samples, metric):
  """Fit an OPTICS model on the feature matrix.

  Args:
    feature_matrix: feature matrix passed to ``OPTICS.fit``.
    min_samples: forwarded to ``OPTICS(min_samples=...)``.
    metric: forwarded to ``OPTICS(metric=...)``.

  Returns:
    The fitted OPTICS estimator.
  """
  optics_model = OPTICS(metric=metric, min_samples=min_samples)
  return optics_model.fit(feature_matrix)

Returns all Training objects that belong to the requested cluster

In [None]:
def get_cluster(cluster_dct, cluster_name):
  """Return the members of one cluster as a new list.

  Args:
    cluster_dct: dictionary mapping cluster names to lists of Training
      objects.
    cluster_name: key of the requested cluster.

  Returns:
    A shallow copy of ``cluster_dct[cluster_name]``, so the caller can modify
    the result without touching the dictionary.

  Raises:
    KeyError: if ``cluster_name`` is not a key of ``cluster_dct``.
  """
  return list(cluster_dct[cluster_name])

Creates dictionary containing Training objects for each cluster

In [None]:
def create_cluster_dict(clustering, training_list):
  """Group Training objects by the cluster label assigned to them.

  Keys have the form ``'cluster<label>'``; ``'cluster-1'`` holds the points
  labelled -1. As before, empty entries are pre-created for the labels
  ``-1 .. len(clustering.cluster_hierarchy_) - 2`` so that callers can look
  up clusters that received no members.

  A training whose label falls outside that pre-created range used to be
  dropped silently; it now gets its own entry, so no training is lost.

  Args:
    clustering: fitted clustering object exposing ``labels_`` and, optionally,
      ``cluster_hierarchy_`` (treated as empty when the attribute is absent).
    training_list: Training objects in the same order as
      ``clustering.labels_``.

  Returns:
    Dictionary mapping cluster names to lists of Training objects.
  """
  # The hierarchy is not guaranteed to exist on every fitted model, and its
  # length is only a rough estimate of how many flat labels are in use.
  hierarchy = getattr(clustering, 'cluster_hierarchy_', ())
  cluster_dct = {'cluster%s' % i: [] for i in range(-1, len(hierarchy) - 1)}

  for label, training in zip(clustering.labels_, training_list):
    cluster_dct.setdefault('cluster' + str(label), []).append(training)

  return cluster_dct

Creates a list that contains one list of Training objects for each requested cluster

In [None]:
def extract_training_data(clustering, training_list, cluster_names):
  """Collect the Training objects of the requested clusters.

  Args:
    clustering: fitted clustering object passed to ``create_cluster_dict``.
    training_list: Training objects in the same order as the cluster labels.
    cluster_names: cluster keys (as produced by ``create_cluster_dict``) to
      extract, in the desired output order.

  Returns:
    A list with one list of Training objects per entry of ``cluster_names``.
  """
  clusters_by_name = create_cluster_dict(clustering, training_list)
  return [get_cluster(clusters_by_name, name) for name in cluster_names]

Returns selected feature values from Training lists

In [None]:
def get_attribute_values(data, attribute_name):
  """Read one attribute from every Training object, cluster by cluster.

  Args:
    data: list of clusters, each a list of Training objects.
    attribute_name: name of the attribute to read with ``getattr``.

  Returns:
    A list with one list of attribute values per cluster, preserving the
    order of both the clusters and their members.
  """
  return [
      [getattr(training, attribute_name) for training in members]
      for members in data
  ]

## Utilities

Calculates median values for each cluster in the data

In [None]:
def get_medians(data):
  """Compute the median of every innermost list of values.

  Args:
    data: list of groups, each group being a list of value lists.

  Returns:
    A list with the same outer structure as ``data`` in which every value
    list is replaced by its ``statistics.median``.
  """
  return [list(map(statistics.median, group)) for group in data]

Returns frequency of each tool in a cluster

In [None]:
def get_command_freq(cluster):
  """Count how often each tool is used by every training of a cluster.

  Args:
    cluster: list of Training objects; each has ``commands`` whose items
      expose a ``program`` attribute.

  Returns:
    Dictionary mapping each tool seen anywhere in the cluster to a list of
    counts, one per training in cluster order (0 when a training never used
    the tool).
  """
  per_training = [
      Counter(cmd.program for cmd in training.commands)
      for training in cluster
  ]
  seen_tools = {tool for tally in per_training for tool in tally}
  return {
      tool: [tally[tool] for tally in per_training]
      for tool in seen_tools
  }

Scales each list by its maximum value

In [None]:
def scale_list(data):
  """Divide every inner list by its own maximum value.

  Args:
    data: list of lists of numbers.

  Returns:
    A new list of lists of floats. Each inner list is divided by its maximum,
    which is computed once per list. An empty inner list stays empty. An
    inner list whose maximum is 0 is returned as floats without scaling
    (instead of raising ZeroDivisionError), because no scaling factor exists.
  """
  scaled = []

  for cluster in data:
    values = list(cluster)
    peak = max(values) if values else 0
    if peak == 0:
      # Nothing to scale by; keep the values (all zeros for non-negative data).
      scaled.append([float(value) for value in values])
    else:
      scaled.append([float(value) / peak for value in values])

  return scaled

Prints locations of log files by cluster

In [None]:
def print_filelocs(training_data):
  """Print the log file name of every training, grouped by cluster.

  A separator line of seven dashes is printed before the members of each
  cluster, including empty clusters.

  Args:
    training_data: list of clusters, each a list of Training objects with a
      ``file_name`` attribute.
  """
  separator = '-------'
  for members in training_data:
    print(separator)
    for member in members:
      print(member.file_name)

Counts occurrences of each option for the given tools

In [None]:
def count_options(tools, cluster):
  """Tally the options used with each of the given tools in a cluster.

  Args:
    tools: collection of tool names to track.
    cluster: list of Training objects; each command exposes ``program`` and
      a string ``options``.

  Returns:
    Dictionary mapping every tool in ``tools`` to a ``collections.Counter``
    of its whitespace-separated option tokens. A command whose ``options``
    is exactly the empty string is counted as ``'NO_OPTION'``.
  """
  option_counts = {tool: collections.Counter() for tool in tools}

  for training in cluster:
    for cmd in training.commands:
      tally = option_counts.get(cmd.program)
      if tally is None:
        # Command belongs to a tool that is not being tracked.
        continue
      if cmd.options == '':
        tokens = ['NO_OPTION']
      else:
        tokens = cmd.options.split()
      tally.update(tokens)

  return option_counts

Returns a list of the most frequent tool and option combinations

In [None]:
def get_most_frequent_commands(training_data, tool_count, option_count):
  """List the most frequently used tools of each cluster with their top options.

  For every cluster the tools are ranked by the median number of uses per
  training (highest first). The first ``tool_count`` tools are reported
  together with their ``option_count`` most common options.

  A cluster with fewer than ``tool_count`` distinct tools contributes only
  the tools it has; previously this raised IndexError.

  Args:
    training_data: list of clusters, each a list of Training objects.
    tool_count: maximum number of tools reported per cluster.
    option_count: maximum number of options reported per tool.

  Returns:
    A flat list of ``((tool, median_frequency), [(option, count), ...])``
    tuples, cluster after cluster.
  """
  result_list = []

  for cluster in training_data:
    median_freq = {
        tool: statistics.median(counts)
        for tool, counts in get_command_freq(cluster).items()
    }
    ranked = sorted(median_freq.items(), key=lambda item: item[1], reverse=True)
    options = count_options([tool for tool, _ in ranked], cluster)

    # max(..., 0) keeps a negative tool_count meaning "no tools", as range() did.
    for entry in ranked[:max(tool_count, 0)]:
      result_list.append((entry, options[entry[0]].most_common(option_count)))

  return result_list

Creates dictionary containing average delays for each tool

In [None]:
def get_command_delays(training_data, max_delay=3600):
  """Compute the average delay that precedes each tool.

  For every pair of consecutive commands in a training, the time difference
  is attributed to the later command's program. Differences that are not
  below ``max_delay`` are ignored.

  Previously the bookkeeping ran outside the threshold check, so an ignored
  pair re-recorded the preceding pair's program and delay (or raised
  NameError when the very first pair was ignored). Now only pairs below the
  threshold are recorded.

  Args:
    training_data: iterable of Training objects whose ``commands`` expose
      ``program`` and ``time``.
    max_delay: exclusive upper bound for a delay to be counted, in the unit
      of ``time`` differences (presumably seconds -- confirm against the
      Training parser). Defaults to the previously hard-coded 3600.

  Returns:
    Dictionary mapping each program to the mean of its recorded delays.
    Programs without any recorded delay are absent.
  """
  delays = {}

  for training in training_data:
    commands = training.commands
    for index in range(1, len(commands)):
      gap = commands[index].time - commands[index - 1].time
      if gap < max_delay:
        delays.setdefault(commands[index].program, []).append(gap)

  return {program: sum(gaps) / len(gaps) for program, gaps in delays.items()}

## Visualizations

Creates reachability plot

In [None]:
def basic_cluster_visualization(clustering, training_count, cluster_labels):
  """Draw the reachability plot of a fitted OPTICS model.

  Points are plotted in cluster ordering, one marker style per cluster label.
  Labels -1 through 7 are always plotted (even when empty) so that
  ``cluster_labels`` keeps lining up with the plotted series in the legend.
  Labels above 7 used to be omitted silently; they are now plotted as well,
  reusing the marker styles cyclically.

  Args:
    clustering: fitted model exposing ``labels_``, ``reachability_`` and
      ``ordering_`` as NumPy arrays.
    training_count: number of clustered trainings; must equal the length of
      ``clustering.labels_`` for the boolean masks to match.
    cluster_labels: legend entries, in the order label -1, 0, 1, ...
  """
  positions = np.arange(training_count)
  ordered_labels = clustering.labels_[clustering.ordering_]
  ordered_reachability = clustering.reachability_[clustering.ordering_]

  plt.figure(figsize=(10, 8))
  markers = ['og', '^r', 'sb', 'py', 'Dc', '<k', '>m', '*y', '|r']

  # Always cover labels -1..7 as before; extend only if higher labels exist.
  highest_label = 7
  if len(ordered_labels):
    highest_label = max(highest_label, int(ordered_labels.max()))

  for cluster_label, marker in zip(range(-1, highest_label + 1),
                                   itertools.cycle(markers)):
      members = ordered_labels == cluster_label
      plt.plot(positions[members], ordered_reachability[members], marker)

  plt.grid(alpha=0.3)
  plt.legend(cluster_labels, loc=(0.75, 0.85), labelspacing=0.1)
  plt.ylabel('Reachability (epsilon distance)')
  plt.xlabel('Point ordering')
  plt.show()

Creates a single boxplot

In [None]:
def draw_boxplot(name, data, labels):
  """Draw one titled boxplot figure.

  Note that the last element of ``data`` is left out of the plot
  (``data[:-1]``).

  Args:
    name: title of the plot.
    data: sequence of value lists; all but the last one are plotted.
    labels: x tick labels for the plotted boxes.
  """
  figure = plt.figure(figsize=(10, 6))
  axes = figure.add_subplot(121)
  axes.set_title(name)
  axes.boxplot(data[:-1])
  axes.set_xticklabels(labels)
  figure.show()

Creates heatmap of correlation between attributes

In [None]:
def draw_cor_heatmap(attribute_labels, attribute_data):
  """Draw an annotated heatmap of pairwise Pearson correlations.

  The per-cluster value lists of every attribute are first flattened into a
  single list, then the correlation coefficient of every attribute pair is
  computed and rounded to three decimals.

  Args:
    attribute_labels: names of the attributes; used for both heatmap axes.
    attribute_data: one entry per attribute, each a list of per-cluster value
      lists. Only the first ``len(attribute_labels)`` entries are used.
  """
  def _rounded_correlation(first, second):
    coefficient, _p_value = pearsonr(first, second)
    return round(coefficient, 3)

  flattened = [
      list(itertools.chain.from_iterable(attribute_data[position]))
      for position in range(len(attribute_labels))
  ]
  correlation_rows = [
      [_rounded_correlation(row_values, column_values)
       for column_values in flattened]
      for row_values in flattened
  ]

  frame = pd.DataFrame(np.array(correlation_rows),
                       columns=attribute_labels, index=attribute_labels)
  sns.heatmap(frame, annot = True, square=True)

The following code is an example published on the Matplotlib website; we use it to draw the radar chart

Authors: John Hunter, Darren Dale, Eric Firing, Michael Droettboom and the Matplotlib development team

Accessed on: 23.12.2020

Title: Radar chart (aka spider or star chart)

Doc version: v3.1.0-4-g0078324e4

URL:https://matplotlib.org/3.1.0/gallery/specialty_plots/radar_chart.html

In [None]:
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D

def radar_factory(num_vars, frame='circle'):
    """Create a radar chart with `num_vars` axes.

    This function creates a RadarAxes projection and registers it.

    Parameters
    ----------
    num_vars : int
        Number of variables for radar chart.
    frame : {'circle' | 'polygon'}
        Shape of frame surrounding axes.

    Returns
    -------
    theta : numpy.ndarray
        The `num_vars` evenly spaced axis angles in radians, starting at 0
        and excluding 2*pi.

    Notes
    -----
    The projection class declares the name 'radar', which is the value to
    pass as ``projection=`` when creating the axes. An unsupported `frame`
    value is only rejected (ValueError) when the axes patch or spines are
    generated, not by this function itself.

    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

    # The class is defined inside the factory so that its methods can close
    # over `theta`, `num_vars` and `frame`.
    class RadarAxes(PolarAxes):

        name = 'radar'
        # use 1 line segment to connect specified points
        RESOLUTION = 1

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Override fill so that line is closed by default"""
            return super().fill(closed=closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that line is closed by default"""
            # NOTE: unlike the parent method, this override returns None
            # rather than the list of created lines.
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)

        def _close_line(self, line):
            # Append the first point to the end of the line data unless the
            # first and last x values already coincide.
            x, y = line.get_data()
            # FIXME: markers at x[0], y[0] get doubled-up
            if x[0] != x[-1]:
                x = np.concatenate((x, [x[0]]))
                y = np.concatenate((y, [y[0]]))
                line.set_data(x, y)

        def set_varlabels(self, labels):
            # Place one label at each of the precomputed axis angles.
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
            # in axes coordinates.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars,
                                      radius=.5, edgecolor="k")
            else:
                raise ValueError("unknown value for 'frame': %s" % frame)

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes=self,
                              spine_type='circle',
                              path=Path.unit_regular_polygon(num_vars))
                # unit_regular_polygon gives a polygon of radius 1 centered at
                # (0, 0) but we want a polygon of radius 0.5 centered at (0.5,
                # 0.5) in axes coordinates.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("unknown value for 'frame': %s" % frame)

    register_projection(RadarAxes)
    return theta

Draws radar chart showing median attribute values for each cluster

In [None]:
def draw_radar_chart(attribute_labels, attribute_data, cluster_labels):
  """Draw radar charts of the scaled median attribute values per cluster.

  One combined chart overlays all clusters and carries the legend; after
  that, a separate chart is drawn for every cluster.

  The number of radar axes is taken from ``len(attribute_labels)``. It was
  previously hard-coded to 5, which only worked for exactly five attributes.

  Args:
    attribute_labels: names of the attributes, one per radar axis.
    attribute_data: one entry per attribute, each a list of per-cluster value
      lists.
    cluster_labels: legend entries for the combined chart.
  """
  # Medians are scaled per attribute, then transposed to one row per cluster.
  scaled_medians = scale_list(get_medians(attribute_data))
  cluster_medians = [list(values) for values in zip(*scaled_medians)]
  theta = radar_factory(len(attribute_labels), frame='polygon')
  colors = ['r', 'b', 'y', 'c', 'k', 'm', 'g']

  # Combined chart with every cluster overlaid.
  fig = plt.figure()
  ax = fig.add_subplot(121, projection='radar')
  for medians, color in zip(cluster_medians, colors):
      ax.plot(theta, medians, color=color)
      ax.fill(theta, medians, facecolor=color, alpha=0.25)
  ax.set_varlabels(attribute_labels)
  ax.legend(cluster_labels, loc=(0.9, .95), labelspacing=0.1, fontsize='small')

  # One additional chart per cluster.
  for medians, color in zip(cluster_medians, colors):
    fig = plt.figure()
    ax = fig.add_subplot(121, projection='radar')
    ax.plot(theta, medians, color=color)
    ax.fill(theta, medians, facecolor=color, alpha=0.25)
    ax.set_varlabels(attribute_labels)
  plt.show()

Draws scatterplot for two given attributes

In [None]:
def scatter_plot(attribute_data, attribute_1, attribute_2, xlabel, ylabel, c_labels, dictClusterDPs):
  """Draw a scatter plot of two attributes, one marker style per cluster.

  Every point is annotated with the last four characters of its entry in
  ``dictClusterDPs``. Clusters without a matching entry in ``c_labels`` are
  not drawn, and at most seven clusters can be styled.

  Args:
    attribute_data: one entry per attribute, each a list of per-cluster value
      lists.
    attribute_1: index into ``attribute_data`` for the x values.
    attribute_2: index into ``attribute_data`` for the y values.
    xlabel: label of the x axis.
    ylabel: label of the y axis.
    c_labels: legend entries, one per drawn cluster.
    dictClusterDPs: per-cluster sequences of point identifiers, indexed as
      ``dictClusterDPs[cluster][point]``.
  """
  fig = plt.figure()
  ax = plt.gca()

  colors = ['r', 'b', 'y', 'c', 'k', 'm', 'g']
  shapes = ['^', 's', 'p', 'D', '<', '>', 'o']
  cluster_total = len(attribute_data[0])
  marker_area = (10*50./fig.dpi)**2

  for index, color in zip(range(cluster_total), colors):
    xs = attribute_data[attribute_1][index]
    ys = attribute_data[attribute_2][index]
    if index >= len(c_labels):
      continue
    for point in range(len(xs)):
      ax.text(xs[point], ys[point], dictClusterDPs[index][point][-4:])
    ax.scatter(xs, ys, marker=shapes[index], color=color, s=marker_area)

  ax.legend(c_labels, loc=(1, 1), labelspacing=0.1, fontsize='small')

  ax.set_xlabel(xlabel)
  ax.set_ylabel(ylabel)
  fig.set_size_inches(18, 10)
  plt.show()

Draws a single figure containing five boxplots, one per attribute, arranged in a 2×3 grid

In [None]:
def draw_boxplots(attribute_data, labels):
  """Draw the five attribute boxplots in a single 2x3 figure.

  The bottom-right cell of the grid is left empty.

  Args:
    attribute_data: per-attribute data, indexed 0-4 in the order command
      count, total time, time between commands, delay count, percent_cmd.
    labels: x tick labels applied to every boxplot.
  """
  # (grid position, title, index into attribute_data), in drawing order.
  panels = (
      ((0, 0), 'Number of typed Bash commmands', 0),
      ((0, 1), 'Total time', 1),
      ((1, 0), 'Time elapsed between two commands', 2),
      ((1, 1), 'Delay count', 3),
      ((0, 2), 'Percent_cmd', 4),
  )

  fig, axs = plt.subplots(2, 3)

  for (row, column), title, data_index in panels:
    panel = axs[row, column]
    panel.set_title(title)
    panel.boxplot(attribute_data[data_index])
    panel.set_xticklabels(labels)

  fig.subplots_adjust(left=0.08, right=1.4, bottom=0.05, top=2.5,
                    hspace=0.2, wspace=0.2)
  fig.show()

Prepares lists of values for y axis and bubble coloring

In [None]:
def prepare_bubble_plot_yaxis(data):
  """Build the y values and colour values for the bubble plot.

  Args:
    data: list of ``(command, options)`` pairs, where ``command`` is indexed
      as ``command[0]`` (y value) and ``command[1]`` (colour value).

  Returns:
    Tuple ``(y, opacity)`` of two lists. Each command contributes one entry
    per option it has, so commands without options contribute nothing.
  """
  repeated = [
      (command[0], command[1])
      for command, options in data
      for _ in range(len(options))
  ]
  y = [tool for tool, _ in repeated]
  opacity = [frequency for _, frequency in repeated]
  return y, opacity

Prepares lists of values for x axis and bubble size

In [None]:
def prepare_bubble_plot_xaxis(data):
  """Build the x values and bubble sizes for the bubble plot.

  Args:
    data: list of ``(command, options)`` pairs, where every option is
      indexed as ``option[0]`` (x value) and ``option[1]`` (count).

  Returns:
    Tuple ``(x, size)`` of two lists with one entry per option; each size is
    the option's count multiplied by 50.
  """
  flat_options = [option for _, options in data for option in options]
  x = [option[0] for option in flat_options]
  size = [option[1]*50 for option in flat_options]
  return x, size

Creates bubble plot showing frequent tools and options

In [None]:
def bubbleplot(x, y, opacity,size):
  """Draw a bubble plot of frequent tools and their options.

  Args:
    x: x value of every bubble.
    y: y value of every bubble.
    opacity: values mapped to the bubble colours and shown on the colour bar.
    size: marker size of every bubble.
  """
  plt.figure(figsize=(10, 5))
  plt.grid(linestyle='dashed')
  plt.xticks(rotation=90)
  bubbles = plt.scatter(x, y, s=size, c=opacity, alpha=0.8, edgecolors='black')
  plt.colorbar(bubbles, label='median tool frequency')
  plt.show()