In [1]:
from random import choice, randint
import gensim
import numpy as np
from tqdm.notebook import tqdm

from gensim.corpora.dictionary import Dictionary
from pyemd import emd
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial

# Function to generate random logs

In [2]:
def generate_logs(sizelog1, sizelog2, averagesize, dictionarysize):
    """Generate two synthetic logs made of random activity traces.

    Every trace is a list of labels of the form 'Act<n>', with n drawn
    uniformly from 1..dictionarysize (inclusive). The trace length is drawn
    uniformly from averagesize-2..averagesize+2 (inclusive).

    Args:
        sizelog1: number of traces in the first log.
        sizelog2: number of traces in the second log.
        averagesize: centre of the range the trace length is drawn from.
        dictionarysize: number of distinct activity labels.

    Returns:
        Tuple (sentences1, sentences2), each a list of traces.
    """
    def random_trace():
        # The length is drawn before any label, so the sequence of calls to
        # the random generator is: length, label, label, ...
        length = randint(averagesize - 2, averagesize + 2)
        trace = []
        for _ in range(length):
            trace.append('Act{}'.format(randint(1, dictionarysize)))
        return trace

    # The first log is generated completely before the second one.
    sentences1 = []
    for _ in range(sizelog1):
        sentences1.append(random_trace())

    sentences2 = []
    for _ in range(sizelog2):
        sentences2.append(random_trace())

    return sentences1, sentences2

In [3]:
# Two random logs of 1000 traces each: about 20 events per trace, 10 distinct activities.
logA, logB = generate_logs(sizelog1=1000, sizelog2=1000, averagesize=20, dictionarysize=10)

# Define the WMD, ICT and t2v distance measures

In [4]:
class WmDistance(object):
    """Lazily computed earth mover's distances between the documents of two sets.

    ``obj[i, j]`` is the distance between ``docset1[i]`` and ``docset2[j]``.
    It is obtained by calling ``emd`` on the two normalised bag-of-words
    vectors with a matrix of Euclidean distances between token embeddings.
    Values are computed on first access and memoised in ``self.dists``.
    """

    def __init__(self, wv, docset1, docset2):
        """Store the inputs and precompute nBOW vectors and the token distance matrix.

        Args:
            wv: object indexed as ``wv[token]`` that yields the token's vector.
            docset1: list of documents (lists of tokens); rows of the result.
            docset2: list of documents (lists of tokens); columns of the result.
        """
        self.wv = wv
        self.docset1 = docset1
        self.docset2 = docset2
        # NaN marks "not computed yet"; see __getitem__.
        shape = (len(self.docset1), len(self.docset2))
        self.dists = np.full(shape, np.nan)
        # One shared vocabulary so that both sets use the same vector layout.
        self.dictionary = Dictionary(documents=self.docset1 + self.docset2)
        self.vocab_len = len(self.dictionary)
        self._cache_nbow()
        self._cache_dmatrix()

    def _cache_nbow(self):
        """Precompute the normalised bag-of-words vector of every document."""
        self.nbow1 = list(map(self._nbow, self.docset1))
        self.nbow2 = list(map(self._nbow, self.docset2))

    def _nbow(self, document):
        """Return the vocabulary-sized vector of relative token frequencies."""
        weights = np.zeros(self.vocab_len, dtype=np.double)
        total = float(len(document))
        # An empty document yields no (id, count) pairs, so total is never a divisor of 0 here.
        for token_id, count in self.dictionary.doc2bow(document):
            weights[token_id] = count / total
        return weights

    def _cache_dmatrix(self):
        """Fill ``self.distance_matrix`` with Euclidean distances between token vectors."""
        size = self.vocab_len
        self.distance_matrix = np.zeros((size, size), dtype=np.double)
        dmat = self.distance_matrix
        for row, token_a in self.dictionary.items():
            for col, token_b in self.dictionary.items():
                # A non-zero cell was already written through its mirrored pair.
                if dmat[row, col] == 0.0:
                    delta = self.wv[token_a] - self.wv[token_b]
                    value = np.sqrt(np.sum(delta ** 2))
                    dmat[row, col] = value
                    dmat[col, row] = value

    def __getitem__(self, ij):
        """Return the distance for the index pair ``ij``, computing it on first use."""
        row, col = ij[0], ij[1]
        if not np.isnan(self.dists[row, col]):
            return self.dists[row, col]
        self.dists[row, col] = emd(self.nbow1[row], self.nbow2[col], self.distance_matrix)
        return self.dists[row, col]

# p and q are given as nBOW vectors: arrays of vocabulary size holding count weights.
def ACT(p, q, C, k):  # C is passed in on every trace comparison; early stopping is not implemented yet
    """Approximate the cost of moving the weights of p onto q with capped transfers.

    For every entry ``i`` of ``p`` with non-zero weight, the weight is moved
    to the (at most) ``k`` entries of ``q`` that have non-zero weight and the
    smallest cost ``C[i, j]``, visited in order of increasing cost. Each
    transfer is capped by the target weight ``q[j]``. Whatever is left after
    the last candidate is sent to that last candidate without a cap.

    Args:
        p: 1-D sequence of source weights, one per vocabulary entry.
        q: 1-D sequence of target weights, same length as ``p``.
        C: 2-D cost matrix, indexed as ``C[i, j]``.
        k: maximum number of target entries considered per source entry.

    Returns:
        The accumulated cost. Source entries contribute nothing when ``q``
        has no non-zero entry at all.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))
    q = np.asarray(q)
    C = np.asarray(C)
    t = 0.0
    for i in range(len(p)):
        pi = p[i]  # weight of the i-th vocabulary entry in p
        if pi == 0.:  # entry does not occur in p
            continue
        # Targets ordered by increasing cost, restricted to entries present in q.
        # Only targets that really exist are kept, so the list may be shorter
        # than k; it is never padded with a placeholder index.
        by_cost = np.argsort(C[i])
        nearest = by_cost[q[by_cost] != 0.][:k]
        if nearest.size == 0:
            # q carries no weight anywhere: there is no target to move to.
            continue
        for j in nearest:
            if not pi > 0:
                break
            r = min(pi, q[j])  # transfer is capped by the target's weight
            pi = pi - r
            t = t + r * C[i, j]
        if pi != 0:
            # Remaining weight goes, uncapped, to the farthest selected target.
            t = t + pi * C[i, nearest[-1]]
    return t

class ICT(object):
    """Lazily computed ``ACT`` costs between the documents of two sets.

    ``obj[i, j]`` is ``ACT`` applied to the normalised bag-of-words vectors of
    ``docset1[i]`` and ``docset2[j]``, using Euclidean distances between token
    embeddings as costs and ``k`` as the neighbour limit. Results are
    memoised in ``self.dists``.
    """

    def __init__(self, wv, docset1, docset2, k):
        """Store the inputs and precompute nBOW vectors and the cost matrix.

        Args:
            wv: object indexed as ``wv[token]`` that yields the token's vector.
            docset1: list of documents (lists of tokens); rows of the result.
            docset2: list of documents (lists of tokens); columns of the result.
            k: value forwarded to ``ACT`` as its ``k`` argument.
        """
        self.wv, self.docset1, self.docset2, self.k = wv, docset1, docset2, k
        # NaN entries are the ones that have not been requested yet.
        n_rows, n_cols = len(self.docset1), len(self.docset2)
        self.dists = np.full((n_rows, n_cols), np.nan)
        # Vocabulary over both sets, so all vectors share one index space.
        self.dictionary = Dictionary(documents=self.docset1 + self.docset2)
        self.vocab_len = len(self.dictionary)
        self._cache_nbow()
        self._cache_dmatrix()

    def _cache_nbow(self):
        """Build the bag-of-words vectors for both document sets."""
        first, second = [], []
        for doc in self.docset1:
            first.append(self._nbow(doc))
        self.nbow1 = first
        for doc in self.docset2:
            second.append(self._nbow(doc))
        self.nbow2 = second

    def _nbow(self, document):
        """Return relative token frequencies as a vector of vocabulary size."""
        vec = np.zeros(self.vocab_len, dtype=np.double)
        n_tokens = float(len(document))
        for idx, freq in self.dictionary.doc2bow(document):
            vec[idx] = freq / n_tokens
        return vec

    def _cache_dmatrix(self):
        """Compute Euclidean distances between all pairs of token vectors."""
        self.distance_matrix = np.zeros((self.vocab_len,) * 2, dtype=np.double)
        costs = self.distance_matrix
        for i, t1 in self.dictionary.items():
            for j, t2 in self.dictionary.items():
                already_set = costs[i, j] != 0.0  # written earlier via the symmetric cell
                if already_set:
                    continue
                diff = self.wv[t1] - self.wv[t2]
                costs[i, j] = costs[j, i] = np.sqrt(np.sum(diff ** 2))

    def __getitem__(self, ij):
        """Return the memoised cost for index pair ``ij``, computing it if missing."""
        i, j = ij[0], ij[1]
        if np.isnan(self.dists[i, j]):
            p_vec, q_vec = self.nbow1[i], self.nbow2[j]
            self.dists[i, j] = ACT(p_vec, q_vec, self.distance_matrix, self.k)
        return self.dists[i, j]

In [5]:
def t2v(log1, log2):
    """Train a Doc2Vec model on both logs and compute all pairwise trace distances.

    Every trace is tagged with the concatenation of its activity labels
    (blanks removed), so identical traces share one document vector. After
    training, the cosine distance between the document vectors of every
    (log2 trace, log1 trace) pair is computed.

    Args:
        log1: list of traces (lists of activity labels); the "GT" log.
        log2: list of traces (lists of activity labels); the "perturbed" log.

    Returns:
        None. The distance matrix is computed but currently discarded; the
        function prints "Model training done" once training has finished.
    """
    def trace_tag(trace):
        # Tag of a trace: all of its labels, blanks stripped, glued together.
        return "".join(activity.replace(" ", "") for activity in trace)

    # Each entry is a one-element list holding the trace's tag.
    tags_GT_log = [[trace_tag(trace)] for trace in log1]
    tags_pert_log = [[trace_tag(trace)] for trace in log2]

    # Train on the traces of both logs. Every trace is included; traces with
    # the same activity sequence end up under the same tag.
    taggedlog = [TaggedDocument(trace, [trace_tag(trace)]) for trace in log1 + log2]

    # The model is created without a corpus so that it is trained exactly
    # once, by the explicit train() call below. Passing the corpus to the
    # constructor would already run a first training pass.
    model = gensim.models.Doc2Vec(alpha=0.025, vector_size=8, window=3, min_count=1, dm=0)
    model.build_vocab(taggedlog)
    model.train(taggedlog, total_examples=len(taggedlog), epochs=100)

    print("Model training done")

    def cosdis(trace1, trace2):
        # trace1 / trace2 are one-element tag lists; look up their document vectors.
        rep1 = model.docvecs[trace1[0]]
        rep2 = model.docvecs[trace2[0]]
        return spatial.distance.cosine(rep1, rep2)

    def distmatrix(GTlog, pertlog):
        # Each row is a trace of the perturbed log, each column a trace of the GT log.
        distances = np.full((len(pertlog), len(GTlog)), 100.0)
        for i in range(len(pertlog)):
            for j in range(len(GTlog)):
                distances[i][j] = cosdis(pertlog[i], GTlog[j])
        return distances

    disM = distmatrix(tags_GT_log, tags_pert_log)

    # Aggregates that can be derived from disM (currently disabled):
    #precision = np.average(np.amin(disM, axis=1)) #average of the minima of each row = compare pert to GT
    #fitness = np.average(np.amin(disM, axis=0)) #average of the minima of each column = compare GT to pert
    #return(precision, fitness)

# testing example

In [31]:
# Smaller logs for the timing runs: 100 traces each, about 20 events per trace, 10 activities.
logA, logB = generate_logs(sizelog1=100, sizelog2=100, averagesize=20, dictionarysize=10)

In [32]:
%%time
# Time Doc2Vec training plus the full pairwise distance computation.
t2v(log1=logA, log2=logB)

Model training done
Wall time: 1.03 s


In [6]:
%%time

# Train activity embeddings on the traces of both logs.
model = gensim.models.Word2Vec(
    sentences=logA + logB,
    size=8,
    window=3,
    min_count=1,
)

# Indexing the calculator with [i, j] computes (and memoises) one distance.
wmcalc = WmDistance(model.wv, logA, logB)

# Fill the complete logA x logB matrix; the progress bar advances per row.
dist_matrix2 = np.zeros(shape=(len(logA), len(logB)))
for i, first_sent in enumerate(tqdm(logA)):
    for j, second_sent in enumerate(logB):
        dist_matrix2[i, j] = wmcalc[i, j]

HBox(children=(IntProgress(value=0), HTML(value='')))


Wall time: 882 ms


In [7]:
%%time

# Train activity embeddings on the traces of both logs.
model = gensim.models.Word2Vec(
    sentences=logA + logB,
    size=8,
    window=3,
    min_count=1,
)

# Same interface as the WMD calculator, but distances come from ACT with k = 3.
wmcalc = ICT(model.wv, logA, logB, k=3)

# Fill the complete logA x logB matrix; the progress bar advances per row.
dist_matrix3 = np.zeros(shape=(len(logA), len(logB)))
for i, first_sent in enumerate(tqdm(logA)):
    for j, second_sent in enumerate(logB):
        dist_matrix3[i, j] = wmcalc[i, j]

HBox(children=(IntProgress(value=0), HTML(value='')))


Wall time: 1.08 s
