## logzip empirical study

In [41]:
import sys
sys.path.append("../logzip")
from logzip.logzipper import Ziplog
import pandas as pd
import re
import pickle
import lzma
import os
import shutil
import tarfile

In [42]:
# Locations of the input logs and of everything this notebook writes.
from genericpath import exists


logs_path = "../logs"
template_filepath = "../logs"
output_dir = "./zip_out"
result_path = "./results"
tmp_dir = os.path.join(output_dir, "tmp_dir")

# Create the output and result directories when they are missing.
for _required_dir in (output_dir, result_path):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir, exist_ok=True)

# The scratch directory is wiped and recreated so every run starts empty.
if os.path.exists(tmp_dir):
    shutil.rmtree(tmp_dir)
os.makedirs(tmp_dir, exist_ok=True)

In [43]:
# Per-dataset header layout handed to Ziplog as its ``logformat`` argument.
# Keys are the dataset names used to index this mapping in run_logzip
# (``logformat[ds]``), so they must match the directory names under
# ``logs_path`` exactly (including the "Andriod" spelling).
#
# The values are raw strings on purpose: sequences such as ``\[`` and ``\|``
# are not valid Python string escapes and trigger a warning in ordinary
# literals. The raw form yields exactly the same characters.
# NOTE(review): the backslash escapes and ``(...)?`` groups suggest these are
# regex fragments interpreted by Ziplog -- confirm against the logzip docs.
logformat = {
    "HDFS": r"<Date> <Time> <Pid> <Level> <Component>: <Content>",
    "Hadoop": r"<Date> <Time> <Level> \[<Process>\] <Component>: <Content>",
    "Spark": r"<Date> <Time> <Level> <Component>: <Content>",
    "Zookeeper": r"<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>",
    "BGL": r"<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>",
    "HPC": r"<LogId> <Node> <Component> <State> <Time> <Flag> <Content>",
    "Thunderbird": r"<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>",
    "Windows": r"<Date> <Time>, <Level>                  <Component>    <Content>",
    "Linux": r"<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>",
    "Andriod": r"<Date> <Time>  <Pid>  <Tid> <Level> <Component>: <Content>",
    "HealthApp": r"<Time>\|<Component>\|<Pid>\|<Content>",
    "Apache": r"\[<Time>\] \[<Level>\] <Content>",
    "Proxifier": r"\[<Time>\] <Program> - <Content>",
    "OpenSSH": r"<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>",
    "OpenStack": r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>",
    "Mac": r"<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>",
}

In [44]:
def evaluate_compression_ratio(infile, outfile):
    """Return the size of ``infile`` divided by the size of ``outfile``.

    Both arguments are filesystem paths whose sizes are read with
    ``os.path.getsize``; a larger result means ``outfile`` is smaller
    relative to ``infile``.
    """
    original_size = os.path.getsize(infile)
    compressed_size = os.path.getsize(outfile)
    return original_size / compressed_size

In [45]:
def run_logzip_(n_workers, logformat, out_dir, outname, kernel, tmp_dir, level, in_filepath, template_filepath):
    """Compress a single log file with a freshly configured ``Ziplog``.

    ``logformat``, ``out_dir``, ``outname``, ``kernel``, ``tmp_dir`` and
    ``level`` are forwarded to the ``Ziplog`` constructor; ``in_filepath``
    and ``template_filepath`` are passed to its ``zip_file`` method.
    ``n_workers`` is accepted but not used by this function.
    """
    ziplog_settings = {
        "logformat": logformat,
        "outdir": out_dir,
        "outname": outname,
        "kernel": kernel,
        "tmp_dir": tmp_dir,
        "level": level,
    }
    Ziplog(**ziplog_settings).zip_file(in_filepath, template_filepath)
    

In [46]:
def run_logzip(mode="all", kernel="bz2"):
    """Compress the ``<ds>_2k.log`` sample of each dataset with logzip.

    Args:
        mode: ``"all"`` to process every entry found in ``logs_path``, a
            single dataset name, or an iterable of dataset names. Every
            name must be a key of ``logformat``.
        kernel: Compression kernel forwarded to ``run_logzip_``; it is also
            used as the suffix of the archive (``<ds>.logzip.tar.<kernel>``)
            and of the pickle file.

    Returns:
        dict mapping dataset name to the ratio computed by
        ``evaluate_compression_ratio`` (input size / archive size).

    Side effects:
        Creates ``output_dir/<ds>`` as needed, prints the input and output
        paths, and pickles the results to
        ``result_path/logzip_<kernel>.pkl``.
    """
    # basic setting of logzip
    level = 1
    n_workers = 1

    if mode == "all":
        dataset = os.listdir(logs_path)
    elif isinstance(mode, str):
        # A bare name would otherwise be iterated character by character.
        dataset = [mode]
    else:
        dataset = mode

    results = {}
    for ds in dataset:
        out_dir = os.path.join(output_dir, ds)
        # makedirs also copes with a missing output_dir and an existing out_dir.
        os.makedirs(out_dir, exist_ok=True)
        outname = ds + '.logzip'
        in_filepath = os.path.join(logs_path, ds, ds + '_2k.log')
        templates_path = os.path.join(logs_path, ds, ds + '_2k.log_templates.csv')
        # (n_workers, logformat, out_dir, outname, kernel, tmp_dir, level, in_filepath, template_filepath)
        run_logzip_(n_workers, logformat[ds], out_dir, outname, kernel, tmp_dir, level, in_filepath, templates_path)
        output_path = os.path.join(out_dir, outname + '.tar.' + kernel)
        print("input file ", in_filepath, output_path)
        results[ds] = evaluate_compression_ratio(in_filepath, output_path)

    # Use a context manager so the pickle file is flushed and closed.
    with open(os.path.join(result_path, "logzip_" + kernel + ".pkl"), 'wb') as fh:
        pickle.dump(results, fh)
    return results

In [47]:
# One-off logzip run on the HDFS sample, driving Ziplog directly
# (the helper equivalent would be: run_logzip(mode=["HDFS"], kernel="gz")).

kernel = "bz2"   # options: (1) gz  (2) bz2
level = 1
n_workers = 1

# NOTE: these rebind the module-level out_dir/tmp_dir names; run_logzip reads
# the global tmp_dir when it is called later.
out_dir = "./zip_out/"
tmp_dir = "./zip_out/tmp_dir"
outname = "HDFS_2k" + ".logzip"

zipper = Ziplog(
    logformat="<Date> <Time> <Pid> <Level> <Component>: <Content>",
    outdir=out_dir,
    outname=outname,
    kernel=kernel,
    tmp_dir=tmp_dir,
    level=level,
)
zipper.zip_file(
    "../logs/HDFS/HDFS_2k.log",
    "../logs/HDFS/HDFS_2k.log_templates.csv",
)

Loading log messages to dataframe...
Total lines 2000
Worker 70704 processing.
Worker 70704 finish.
Loading 2000 messages done, loading rate: 100.0%, failed lines: 0
Time taken 0.02s
Processing log file: None...
Building match tree...
Matching event templates...
Worker 70704 start matching 2000 lines.
Matching done, matching rate: 0.0% [Time taken: 0:00:00.065413]


In [48]:
def run_baselines(mode="all", kernel="gz"):
    """Compress each dataset's ``<ds>_2k.log`` with a general-purpose codec.

    Args:
        mode: ``"all"`` to process every entry found in ``logs_path``, a
            single dataset name, or an iterable of dataset names.
        kernel: ``"gz"`` or ``"bz2"`` (tar archive compressed with that
            codec) or ``"lzma"`` (raw ``lzma.compress`` of the file bytes).

    Returns:
        dict mapping dataset name to the ratio computed by
        ``evaluate_compression_ratio`` (input size / compressed size).

    Raises:
        ValueError: if ``kernel`` is not one of the supported codecs.

    Side effects:
        Writes ``output_dir/<ds>/<ds>.<kernel>`` for every dataset and
        pickles the results to ``result_path/tar_<kernel>.pkl``.
    """
    if kernel not in ("gz", "bz2", "lzma"):
        # Otherwise no output file would be written and the ratio below
        # would fail or silently measure a stale file.
        raise ValueError("unsupported kernel: %r" % (kernel,))

    if mode == "all":
        dataset = os.listdir(logs_path)
    elif isinstance(mode, str):
        # A bare name would otherwise be iterated character by character.
        dataset = [mode]
    else:
        dataset = mode

    results = {}
    for ds in dataset:
        out_dir = os.path.join(output_dir, ds)
        os.makedirs(out_dir, exist_ok=True)
        output_name = os.path.join(out_dir, ds + "." + kernel)
        source_file = os.path.join(logs_path, ds, ds + '_2k.log')

        if kernel == "lzma":
            with open(source_file, 'rb') as src, open(output_name, 'wb') as out:
                out.write(lzma.compress(src.read()))
        else:
            # The tar mode must follow the kernel ("w:gz" / "w:bz2"); a fixed
            # "w:gz" would make the bz2 baseline a gzip archive.
            with tarfile.open(output_name, "w:" + kernel) as tar:
                tar.add(source_file, arcname=os.path.basename(source_file))

        results[ds] = evaluate_compression_ratio(source_file, output_name)

    # Use a context manager so the pickle file is flushed and closed.
    with open(os.path.join(result_path, "tar_" + kernel + ".pkl"), 'wb') as fh:
        pickle.dump(results, fh)
    return results
        

In [49]:
# Run the general-purpose compression baselines once per kernel.
# Each call also pickles its results to result_path/tar_<kernel>.pkl.
run_baselines(mode="all", kernel="gz")
run_baselines(mode="all", kernel="bz2")
# NOTE(review): the dict printed below this cell is presumably the return
# value of this last (lzma) call -- confirm when re-running.
run_baselines(mode="all", kernel="lzma")


{'Spark': 19.591367486889876,
 'Apache': 23.337010479867622,
 'Mac': 9.151626110021912,
 'HealthApp': 13.885669362084457,
 'Linux': 18.395025728987992,
 'HDFS': 6.693078580125503,
 'Hadoop': 29.17039914686167,
 'OpenStack': 14.148854961832061,
 'Proxifier': 13.193875278396437,
 'Andriod': 16.50846043851287,
 'BGL': 7.927928154558261,
 'OpenSSH': 18.852787162162162,
 'Zookeeper': 15.904990842490843,
 'Thunderbird': 15.577067669172932,
 'HPC': 7.838272383354351,
 'Windows': 30.489888123924267}

In [50]:
# Run logzip on the HDFS dataset only, with the gz kernel; the results are
# also pickled to result_path/logzip_gz.pkl.
run_logzip(mode=["HDFS"], kernel="gz")
# Alternative: every dataset with the bz2 kernel.
# run_logzip(kernel="bz2")

Loading log messages to dataframe...
Total lines 2000
Worker 70704 processing.
Worker 70704 finish.
Loading 2000 messages done, loading rate: 100.0%, failed lines: 0
Time taken 0.03s
Processing log file: None...
Building match tree...
Matching event templates...
Worker 70704 start matching 2000 lines.
Matching done, matching rate: 0.0% [Time taken: 0:00:00.092982]
input file  ../logs/HDFS/HDFS_2k.log ./zip_out/HDFS/HDFS.logzip.tar.gz


{'HDFS': 5.433856097329151}