## logzip empirical study

In [60]:
import sys
sys.path.append("../logzip")
from logzip.logzipper import Ziplog
import pandas as pd
import re
import pickle
import lzma
import os
import tarfile

In [33]:
# Directory layout shared by the experiment cells below.
logs_path = "../logs"  # root directory; each dataset lives in <logs_path>/<dataset>/
template_filepath = "../logs"  # module-level default; run_logzip builds per-dataset template paths itself
output_dir = "./zip_out"  # compressed artifacts are written below this directory
tmp_dir = os.path.join(output_dir, "tmp_dir")  # handed to Ziplog as its tmp_dir
result_path = "./results"  # pickled {dataset: compression ratio} dicts are stored here

# os.makedirs(..., exist_ok=True) is already a no-op for an existing directory,
# so a separate os.path.exists() pre-check is redundant (and racy).
for _directory in (output_dir, tmp_dir, result_path):
    os.makedirs(_directory, exist_ok=True)

In [34]:
# Per-dataset log line layouts passed to Ziplog as `logformat`.
# Keys must match the dataset directory names under `logs_path`, because
# run_logzip looks up logformat[ds] with ds taken from os.listdir(logs_path)
# (hence the "Andriod" spelling is kept as-is).
# Raw strings are used wherever the format contains regex-style backslash
# escapes: "\[" in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. Runtime values are unchanged.
logformat = {
    "HDFS": "<Date> <Time> <Pid> <Level> <Component>: <Content>",
    "Hadoop": r"<Date> <Time> <Level> \[<Process>\] <Component>: <Content>",
    "Spark": "<Date> <Time> <Level> <Component>: <Content>",
    "Zookeeper": r"<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>",
    "BGL": "<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>",
    "HPC": "<LogId> <Node> <Component> <State> <Time> <Flag> <Content>",
    "Thunderbird": r"<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>",
    "Windows": "<Date> <Time>, <Level>                  <Component>    <Content>",
    "Linux": r"<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>",
    "Andriod": "<Date> <Time>  <Pid>  <Tid> <Level> <Component>: <Content>",
    "HealthApp": r"<Time>\|<Component>\|<Pid>\|<Content>",
    "Apache": r"\[<Time>\] \[<Level>\] <Content>",
    "Proxifier": r"\[<Time>\] <Program> - <Content>",
    "OpenSSH": r"<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>",
    "OpenStack": r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>",
    "Mac": r"<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>",
}

In [35]:
def evaluate_compression_ratio(infile, outfile):
    """Return the on-disk size of ``infile`` divided by that of ``outfile``.

    Args:
        infile: path of the uncompressed file.
        outfile: path of the compressed file.

    Returns:
        float: ``size(infile) / size(outfile)``; values above 1 mean
        ``outfile`` is smaller than ``infile``.

    Raises:
        OSError: if either path cannot be stat'ed.
        ZeroDivisionError: if ``outfile`` is empty.
    """
    original_bytes = os.path.getsize(infile)
    compressed_bytes = os.path.getsize(outfile)
    return original_bytes / compressed_bytes

In [36]:
def run_logzip_(n_workers, logformat, out_dir, outname, kernel, tmp_dir, level, in_filepath, template_filepath):
    """Compress a single log file with logzip.

    Constructs a ``Ziplog`` from the given settings and calls its
    ``zip_file`` with the log path and the template path.

    Args:
        n_workers: accepted for signature compatibility; not used in this function.
        logformat: log line layout string forwarded to ``Ziplog``.
        out_dir: forwarded to ``Ziplog`` as ``outdir``.
        outname: forwarded to ``Ziplog`` as ``outname``.
        kernel: forwarded to ``Ziplog`` as ``kernel``.
        tmp_dir: forwarded to ``Ziplog`` as ``tmp_dir``.
        level: forwarded to ``Ziplog`` as ``level``.
        in_filepath: path of the log file to compress.
        template_filepath: path of the template file handed to ``zip_file``.

    Returns:
        None.
    """
    ziplog_settings = {
        "logformat": logformat,
        "outdir": out_dir,
        "outname": outname,
        "kernel": kernel,
        "tmp_dir": tmp_dir,
        "level": level,
    }
    compressor = Ziplog(**ziplog_settings)
    compressor.zip_file(in_filepath, template_filepath)
    

In [75]:
def run_logzip(mode="all", kernel="bz2"):
    """Run logzip on every dataset under ``logs_path`` and record compression ratios.

    Args:
        mode: only ``"all"`` triggers compression; with any other value no
            dataset is processed and an empty result dict is saved/returned.
        kernel: compression kernel forwarded to ``Ziplog``; also used as the
            suffix of the expected archive name and of the result pickle.

    Returns:
        dict: dataset name -> original size / compressed size.

    Side effects:
        Writes the result dict to ``<result_path>/logzip_<kernel>.pkl``.
    """
    # basic setting of logzip
    level = 1
    n_workers = 1
    results = dict()

    if mode == "all":
        for ds in os.listdir(logs_path):
            # Only dataset directories are valid inputs; stray files in
            # logs_path (e.g. OS/editor metadata) would otherwise crash the run.
            if not os.path.isdir(os.path.join(logs_path, ds)):
                continue

            out_dir = os.path.join(output_dir, ds)
            outname = ds + '.logzip'
            in_filepath = os.path.join(logs_path, ds, ds + '_2k.log')
            # Distinct local name so the module-level `template_filepath` is not shadowed.
            ds_template_path = os.path.join(logs_path, ds, ds + '_2k.log_templates.csv')

            # logformat[ds] raises KeyError for a dataset directory without a declared format.
            run_logzip_(n_workers, logformat[ds], out_dir, outname, kernel, tmp_dir, level, in_filepath, ds_template_path)

            # NOTE(review): assumes Ziplog writes <outname>.tar.<kernel> into out_dir — confirm against Ziplog.
            output_path = os.path.join(out_dir, outname + '.tar.' + kernel)
            results[ds] = evaluate_compression_ratio(in_filepath, output_path)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(os.path.join(result_path, "logzip_" + kernel + ".pkl"), 'wb') as result_file:
        pickle.dump(results, result_file)
    return results

In [69]:
def run_baselines(mode="all", kernel="gz"):
    """Compress every dataset's raw log with a baseline compressor and record the ratios.

    For ``"gz"`` and ``"bz2"`` the log is stored in a tar archive compressed
    with that algorithm; for ``"lzma"`` the raw file bytes are compressed with
    ``lzma.compress`` (no tar container).

    Args:
        mode: currently unused; kept for signature parity with ``run_logzip``.
        kernel: one of ``"gz"``, ``"bz2"`` or ``"lzma"``.

    Returns:
        dict: dataset name -> original size / compressed size.

    Raises:
        ValueError: if ``kernel`` is not a supported baseline.

    Side effects:
        Writes ``<output_dir>/<dataset>/<dataset>.<kernel>`` for every dataset
        and pickles the result dict to ``<result_path>/tar_<kernel>.pkl``.
    """
    supported_kernels = ("gz", "bz2", "lzma")
    if kernel not in supported_kernels:
        # Previously an unknown kernel wrote nothing and failed later (or
        # silently measured a stale file); fail fast with a clear message.
        raise ValueError("Unsupported kernel %r; expected one of %r" % (kernel, supported_kernels))

    results = dict()

    for ds in os.listdir(logs_path):
        # Only dataset directories are valid inputs; skip stray files in logs_path.
        if not os.path.isdir(os.path.join(logs_path, ds)):
            continue

        # The per-dataset output directory may not exist yet when the
        # baselines run before logzip has produced anything.
        ds_out_dir = os.path.join(output_dir, ds)
        os.makedirs(ds_out_dir, exist_ok=True)

        output_name = os.path.join(ds_out_dir, ds + "." + kernel)
        source_file = os.path.join(logs_path, ds, ds + '_2k.log')

        if kernel in ("gz", "bz2"):
            # Use the requested algorithm: "w:gz" or "w:bz2". (Hard-coding
            # "w:gz" made the bz2 baseline a second gzip run.)
            with tarfile.open(output_name, "w:" + kernel) as tar:
                tar.add(source_file, arcname=os.path.basename(source_file))
        else:  # kernel == "lzma"
            with open(source_file, 'rb') as f, open(output_name, 'wb') as out:
                out.write(lzma.compress(f.read()))

        results[ds] = evaluate_compression_ratio(source_file, output_name)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(os.path.join(result_path, "tar_" + kernel + ".pkl"), 'wb') as result_file:
        pickle.dump(results, result_file)
    return results
        

In [76]:
# Baseline runs, one per kernel, in the order gz -> bz2 -> lzma.
for baseline_kernel in ("gz", "bz2", "lzma"):
    run_baselines(mode="all", kernel=baseline_kernel)

# logzip runs; the final bare call stays last so its result dict is the cell output.
run_logzip(kernel="gz")
run_logzip(kernel="bz2")

Loading log messages to dataframe...
Total lines 2000
Worker 38452 processing.
Worker 38452 finish.
Loading 2000 messages done, loading rate: 100.0%, failed lines: 0
Time taken 0.02s
Processing log file: None...
Building match tree...
Matching event templates...
Worker 38452 start matching 2000 lines.
Matching done, matching rate: 0.0% [Time taken: 0:00:00.037853]
Loading log messages to dataframe...
Total lines 2000
Worker 38452 processing.
Worker 38452 finish.
Loading 2000 messages done, loading rate: 100.0%, failed lines: 0
Time taken 0.01s
Processing log file: None...
Building match tree...
Matching event templates...
Worker 38452 start matching 2000 lines.
Matching done, matching rate: 0.0% [Time taken: 0:00:00.033847]
Loading log messages to dataframe...
Total lines 2000
Worker 38452 processing.
Worker 38452 finish.
Loading 2000 messages done, loading rate: 100.0%, failed lines: 0
Time taken 0.02s
Processing log file: None...
Building match tree...
Matching event templates...
Wor

{'Spark': 2.825017813776957,
 'Apache': 2.528687544824289,
 'Mac': 3.334926822093109,
 'HealthApp': 2.6698289761603133,
 'Linux': 3.1671071866278813,
 'HDFS': 2.7842247265430955,
 'Hadoop': 5.177015316813346,
 'OpenStack': 7.7748502366064995,
 'Proxifier': 3.1823211839596035,
 'Andriod': 3.7678207185400745,
 'BGL': 3.4197927404915633,
 'OpenSSH': 3.038371491574334,
 'Zookeeper': 3.4987976078061065,
 'Thunderbird': 4.305451202941412,
 'HPC': 2.093550016840687,
 'Windows': 4.165819101090567}