# Log Parser
Log parser extracts training data from the command log files and saves them in a format suitable for clustering

In [None]:
import numpy as np

Mounting a google drive containing log files

In [None]:
from google.colab import drive 
drive.mount('/content/drive', force_remount=True)

Class representing a command

In [None]:
class Command:
  """A single logged command.

  Attributes:
    program: name of the tool that was run (first word of the command line).
    cmd: the full command line as it was logged.
    time: timestamp of the command, as read from the log entry.
  """

  def __init__(self, program, cmd, time):
    self.program, self.cmd, self.time = program, cmd, time

Class representing one training session

*option_changes* – represents consecutive uses of the same tool with different options

*requested_help* – represents the number of times the trainee used a built-in method to learn more about a tool they were using

In [None]:
class Training:
  """One training session, built from the commands of a single log file.

  Attributes:
    commands: the list of Command objects of the session.
    total_cmd: number of commands in the session.
    average_diff: average of the time gaps between consecutive commands.
    total_time: sum of the time gaps between consecutive commands.
    delay_count: number of gaps counted as delays by get_delay_count.
    percent_cmd: the `percentage` value passed in by the caller.
    file_name: the `name` value passed in by the caller.
  """

  def __init__(self, commands, percentage, name):
    # Compute the gaps once; both the average and the total derive from them.
    time_diffs = get_time_difference(commands)
    self.commands = commands
    self.total_cmd = len(commands)
    self.average_diff = np.average(time_diffs)
    self.total_time = sum(time_diffs)
    self.delay_count = get_delay_count(commands)
    self.percent_cmd = percentage
    self.file_name = name

  def get_command_count(self):
    """Return a dict mapping each tool used to the number of times it was run.

    The navigation commands ``cd``, ``pwd`` and ``ls`` are left out. Keys
    appear in the order in which each tool was first used.
    """
    from collections import Counter

    junk_cmd = {"cd", "pwd", "ls"}
    # Counter tallies in a single pass instead of calling list.count() once
    # per element, which was quadratic in the number of commands.
    counts = Counter(
      command.program
      for command in self.commands
      if command.program not in junk_cmd
    )
    return dict(counts)


The *get_command_count* method returns, for each tool the trainee used, the number of times it was run (the navigation commands `cd`, `pwd` and `ls` are not counted)

## Get time data
Another attribute that needs to be calculated is the time spent between executing two commands

Sometimes there are extremely long times (delays) between two typed commands

Time differences that are negative or too large are not used as-is: each difference is reduced modulo 3600, so it always falls in the range 0–3599

In [None]:
def get_time_difference(commands):
  """Return the time gaps between each pair of consecutive commands.

  Every gap is reduced modulo 3600, so negative or very large differences
  wrap into the range [0, 3600) instead of being dropped. The result has
  one element fewer than `commands` (and is empty for fewer than two).
  """
  return [
    (later.time - earlier.time) % 3600
    for earlier, later in zip(commands, commands[1:])
  ]

In [None]:
def get_delay_count(commands):
  """Count the gaps between consecutive commands that exceed 1200.

  The gaps are the values produced by get_time_difference; a gap is counted
  as a delay only when it is strictly greater than 1200.
  """
  return sum(1 for gap in get_time_difference(commands) if gap > 1200)

# Extract Training instances
Set of functions we use to get Training instances from a directory

Function that converts dictionary containing log details into Command object

In [None]:
def extract_command(command_dict):
  """Build a Command from one decoded log entry.

  Reads the 'cmd' and 'timestamp_ms' keys of `command_dict`. The program
  name is everything in 'cmd' before the first space (the whole string when
  there is no space).
  """
  full_cmd = command_dict['cmd']
  tool_name, _, _ = full_cmd.partition(" ")
  return Command(tool_name, full_cmd, command_dict['timestamp_ms'])

Each log in the file is parsed, resulting in a list of Command objects representing one training

In [None]:
import json

We limit the maximum length of a tool name because some logged commands contain malformed input

In [None]:
# Upper bound on the program-name length: parse_commands keeps a command only
# when len(program) is strictly below this value.
MAX_TOOL_LENGTH = 20
# One entry is appended per parsed log file: the file's base name with its
# last five characters removed (presumably the ".json" suffix, leaving an IP
# address -- confirm against the actual log file names).
ip_list = []

In [None]:
#130.127
#130.162

# Ordered stages of the expected solution. Each stage lists alternative
# patterns; a pattern is a space-separated set of keywords that must all occur
# (as substrings) in a command for that command to satisfy the stage.
cmd_pattern = [
  [
    "vol.py -f",
    "volatility -f",
    "--info",
    "strings grep",
    "linux_bash",
  ],
  [
    "strings grep",
    "linux_bash",
  ],
  [
    "echo aW5zZWNsYWJ7dzNsYzBtM190MF9MMW51WF9tM20wcllfZjByM25zMWM1fQ==",
    "base64",
  ],
]

def get_cmd_percentage(command, pos):
  """Advance the stage index when `command` satisfies the current stage.

  `pos` is the index of the stage in `cmd_pattern` that is checked. Returns
  `pos + 1` if any pattern of that stage has all of its keywords contained
  in `command`; otherwise returns `pos` unchanged. Once `pos` has reached
  the number of stages it is returned as-is.
  """
  if pos >= len(cmd_pattern):
    return pos
  stage_matched = any(
    all(keyword in command for keyword in pattern.split(' '))
    for pattern in cmd_pattern[pos]
  )
  return pos + 1 if stage_matched else pos

In [None]:
def parse_commands(file_name):
  """Parse one log file into its commands and a progress ratio.

  Each line of the file is expected to hold one JSON object, which is turned
  into a Command via extract_command. Commands whose program name is
  MAX_TOOL_LENGTH characters or longer are dropped.

  Side effect: appends the file's base name, minus its last five characters,
  to the module-level `ip_list`.

  Returns:
    A two-element list ``[commands, percentage]``: the kept commands sorted
    by time, and the fraction of `cmd_pattern` stages that were reached.
    Stage progress is tracked in file order, before the sort by time.
  """
  command_list = []
  pos = 0
  with open(file_name) as file:
    for line in file:
      # Some log files carry stray characters before the JSON object, so
      # decoding starts at the first opening brace.
      json_start = line.find('{')
      if json_start == -1:
        # Blank line or pure junk: there is no JSON object to decode.
        continue
      command_dict = json.loads(line[json_start:])
      command = extract_command(command_dict)
      if len(command.program) < MAX_TOOL_LENGTH:
        command_list.append(command)
        pos = get_cmd_percentage(command.cmd, pos)
  command_list.sort(key=lambda x: x.time)
  percentage = pos / len(cmd_pattern)
  ip_list.append(str(file_name).split("/")[-1][:-5])
  return [command_list, percentage]

This function returns a list of Training instances created from all valid log files in a directory

In [None]:
from pathlib import Path

In [None]:
def process_files(dir_path):
  """Build a Training instance for every file found under `dir_path`.

  The directory is searched recursively. Each file is parsed with
  parse_commands, and the resulting commands and percentage are wrapped in a
  Training whose `file_name` is the file's path.

  Returns:
    A list of Training objects, one per file, in the order the files were
    discovered.
  """
  training_list = []
  for path in Path(dir_path).rglob('*'):
    # rglob('*') also yields sub-directories, which cannot be opened as logs.
    if not path.is_file():
      continue
    commands, percentage = parse_commands(path)
    training_list.append(Training(commands, percentage, path))
  return training_list