# Synthetic Dataset Generator

In [1]:
import random
import pandas as pd

class synthetic_logs:
    """Combine registered path generators into synthetic event traces.

    A trace is a list of ``(time, label)`` tuples.  Generators are objects
    exposing ``generate_trace()`` and ``describe()``; they are registered
    with :meth:`add`, which also gives them access to this log's two label
    streams: ``symbol`` (upper-case labels) and ``noisy`` (lower-case
    labels).  With no generator registered, a fixed five-event default trace
    is produced.
    """

    _symbols = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    _noisy   = list("abcdefghijklmnopqrstuvwxyz")

    def __init__(self):
        self._gens = []
        self._used_symbols = []
        # Start empty so show_instances() is safe to call before
        # generate_instances() has ever run.
        self.instances = []
        self.symbol = self.create_symbols(self._symbols)
        self.noisy = self.create_symbols(self._noisy)

    def create_symbols(self, symbols):
        """Return an endless generator of distinct labels built from *symbols*.

        All one-symbol labels are yielded first, in the given order, then
        every two-symbol concatenation, then three-symbol ones, and so on
        (``A .. Z, AA, AB, .. ZZ, AAA, ..`` for the upper-case alphabet).

        Raises:
            ValueError: on the first ``next()`` if *symbols* is empty.
        """
        alphabet = list(symbols)
        if not alphabet:
            # Without this guard the loop below would spin forever without
            # yielding anything.
            raise ValueError("symbols must not be empty")
        level = alphabet
        while True:
            yield from level
            # The next level appends every symbol to every label of the
            # level that has just been exhausted.
            level = [prefix + letter for prefix in level for letter in alphabet]

    def generate_instances(self, instances=5):
        """Generate *instances* traces, store them in ``self.instances`` and return them."""
        self.instances = [self.generate_trace() for _ in range(instances)]
        return self.instances

    def generate_trace(self):
        """Return one trace as a list of ``(time, label)`` tuples.

        With no registered generator the label stream is restarted and the
        first five labels are returned at times 0..4.  Otherwise the traces
        of all registered generators are merged and ordered by time.
        """
        if not self._gens:
            # Restarting the stream makes every default trace identical.
            self.symbol = self.create_symbols(self._symbols)
            return [(i, next(self.symbol)) for i in range(5)]

        trace = []
        for generator in self._gens:
            trace.extend(generator.generate_trace())
        # list.sort is stable, so events sharing a timestamp keep the order
        # in which their generators were registered.
        trace.sort(key=lambda event: event[0])
        return trace

    def show_instances(self, head=10):
        """Print the labels of at most *head* stored instances, one per line."""
        for number, instance in enumerate(self.instances[:max(head, 0)], start=1):
            labels = [label for _, label in instance]
            print("%3d : [%s]" % (number, " ".join(labels)))

    def add(self, generator):
        """Register *generator* and give it a back-reference to this log as ``generator.log``."""
        generator.log = self
        self._gens.append(generator)

    def describe(self):
        """Summarise the registered generators.

        Returns an empty list when nothing is registered, otherwise a dict
        whose ``'generators'`` entry lists each generator's ``describe()``
        result in registration order.
        """
        if not self._gens:
            return []
        return {'generators': [gen.describe() for gen in self._gens]}
        
class noisy_path:
    """Generator of evenly spaced events carrying randomly chosen noise labels.

    Parameters:
        every: time added between consecutive events.
        num_symbols: size of the label pool drawn from the log's ``noisy``
            stream on the first call to :meth:`generate_trace`.
        count: number of events in each trace.

    The ``log`` attribute must be assigned before :meth:`generate_trace` is
    called; ``synthetic_logs.add`` does this when the generator is
    registered.
    """

    def __init__(self, every, num_symbols, count):
        self.every = every
        self.num_symbols = num_symbols
        self.count = count
        # Label pool, filled lazily.  Kept as a list (not False) so that
        # describe() can report its size even before any trace exists.
        self.noise = []
        self.last = []

    def generate_trace(self):
        """Return ``count`` events ``(time, label)`` starting at time 0.

        The result is also remembered in ``self.last``.

        Raises:
            ValueError: if events are requested but the label pool is empty
                (``num_symbols`` of zero).
        """
        if not self.noise:
            self.noise = [next(self.log.noisy) for _ in range(self.num_symbols)]
        if self.count > 0 and not self.noise:
            raise ValueError("noisy_path needs num_symbols >= 1 to generate events")

        t = 0
        trace = []
        for _ in range(self.count):
            trace.append((t, random.choice(self.noise)))
            t += self.every

        self.last = trace
        return self.last

    def describe(self):
        """Return a summary dict: class name, pool size and up to ten labels of the last trace."""
        return {
            'class': self.__class__.__name__,
            '#symbols': len(self.noise),
            'example': " ".join(label for _, label in self.last[:10]),
        }
    
class serial_path:
    """Generator of a fixed label sequence emitted at jittered intervals.

    Parameters:
        size: number of labels in the sequence, drawn from the log's
            ``symbol`` stream on the first call to :meth:`generate_trace`.
        every: nominal time between consecutive events.
        error: maximum integer jitter; the start time is drawn from
            ``0..error`` and every gap is ``every`` plus a value from
            ``-error..error``.
        probability: chance that a generated trace is returned rather than
            replaced by an empty list.

    The ``log`` attribute must be assigned before :meth:`generate_trace` is
    called; ``synthetic_logs.add`` does this when the generator is
    registered.
    """

    def __init__(self, size, every, error=0, probability=1):
        self.every = every
        self.size = size
        self.error = error
        self.probability = probability
        # Label sequence, filled lazily.  Kept as a list (not False) so that
        # describe() can report its length even before any trace exists.
        self.path = []
        self.last = []

    def generate_trace(self):
        """Return the path as ``(time, label)`` events, or ``[]`` when suppressed.

        The timed path is always built and stored in ``self.last``; only the
        return value depends on ``probability``.
        """
        if not self.path:
            self.path = [next(self.log.symbol) for _ in range(self.size)]

        t = random.randint(0, self.error)
        trace = []
        for label in self.path:
            trace.append((t, label))
            jitter = random.randint(-self.error, self.error)
            # max() keeps time from running backwards when the jitter is
            # larger than the nominal gap.
            t = max(t, t + self.every + jitter)
        self.last = trace

        if random.random() < self.probability:
            return self.last
        return []

    def describe(self):
        """Return a summary dict: class name, path length, up to ten labels of the last trace, error and probability."""
        return {
            'class': self.__class__.__name__,
            '#symbols': len(self.path),
            'example': " ".join(label for _, label in self.last[:10]),
            'error': self.error,
            'probability': self.probability,
        }

### Default values

In [2]:
logs = synthetic_logs()

In [3]:
logs.generate_trace()

[(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')]

In [4]:
logs.generate_instances()

[[(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')]]

In [5]:
logs.show_instances()

  1 : [A B C D E]
  2 : [A B C D E]
  3 : [A B C D E]
  4 : [A B C D E]
  5 : [A B C D E]


In [6]:
logs.describe()

[]

## Noise

In [7]:
# Noise only: every trace holds 5 events spaced 10 time units apart, each
# label picked at random from a pool of 20 lower-case symbols.
logs = synthetic_logs()
logs.add( noisy_path( every=10, num_symbols=20, count=5)  )

In [8]:
logs.generate_trace()

[(0, 'c'), (10, 'i'), (20, 'r'), (30, 'a'), (40, 'j')]

In [9]:
logs.generate_instances()
logs.show_instances()

  1 : [i t d t s]
  2 : [d o c b d]
  3 : [q t n k d]
  4 : [i f a g r]
  5 : [k b n s i]


In [10]:
logs.describe()

{'generators': [{'class': 'noisy_path',
   '#symbols': 20,
   'example': 'k b n s i'}]}

## Serial Paths

In [11]:
# One serial path of 5 labels, nominally 10 time units apart; the start time
# is drawn from 0..3 and every gap is jittered by an integer in -3..3.
logs = synthetic_logs()
logs.add( serial_path( size=5, every=10, error=3)  )

In [12]:
logs.generate_trace()

[(1, 'A'), (10, 'B'), (23, 'C'), (31, 'D'), (42, 'E')]

In [13]:
logs.generate_instances()
logs.show_instances()

  1 : [A B C D E]
  2 : [A B C D E]
  3 : [A B C D E]
  4 : [A B C D E]
  5 : [A B C D E]


In [14]:
logs.describe()

{'generators': [{'class': 'serial_path',
   '#symbols': 5,
   'example': 'A B C D E',
   'error': 3,
   'probability': 1}]}

## Noise + Serial

In [15]:
# Background noise mixed with two serial paths:
#  - 30 noise events, 10 time units apart, from a pool of 50 symbols;
#  - a 7-label path, 25 units apart, no jitter, returned on every call;
#  - a 5-label path, 25 units apart, jitter up to 5, returned with
#    probability 0.5 (otherwise it contributes no events to the trace).
logs = synthetic_logs()
logs.add( noisy_path( every=10, num_symbols=50, count=30)  )
logs.add( serial_path( size=7, every=25, error=0, probability=1)  )
logs.add( serial_path( size=5, every=25, error=5, probability=0.5)  )

In [16]:
logs.generate_trace()[:7]

[(0, 'y'), (0, 'A'), (0, 'H'), (10, 'au'), (20, 'w'), (23, 'I'), (25, 'B')]

In [17]:
logs.generate_instances(10)
logs.show_instances()

  1 : [au A H a q I B g af J d C g aq K D h h L ap E am ae F ad j g G p c ah as aa k aq w af b ae aw ax i]
  2 : [ag A f af B v u m C q n D g an k E q r F d ak ad G e al j c o ak e ar m p as ac l u]
  3 : [k A l i B j ai aq C p f D ax as au E ak q F ah c g G v b q k ax aw b ab h m am j av au]
  4 : [aa A aw ak B j ah at C d z D ai aa aj E av n F w l f G b af p am s av ag ad at au x j p v]
  5 : [z A ag z B ax ai at C l i D g y l E t i F ap z x G am k p f ap am ah k av al af l a au]
  6 : [l A H aj ai B d I ah as C J v a D k K h a E aq L i F ae p c G g t u ac ai an ab e am j p ai w b]
  7 : [x A as u B as aa aa C n av D t am v E aw ae F aj z z G am aq aw ah ag ai as y i ao ar q ai o]
  8 : [n A m ae B q q ar C j y D ab aa ah E w af F r b ai G l aa h k as al au i af ao ag aw g ak]
  9 : [v A H k am B n I av al C J b r D k K ao d E L ac g F ak i i G ar k af n x ak x at o x o as i b]
 10 : [ab A ai aj B ac al an C v aq D aa an q E at ad F s t s G n o ac g y ag l g j aa aa z i y]


In [18]:
pd.DataFrame(logs.describe()['generators'])

Unnamed: 0,class,#symbols,example,error,probability
0,noisy_path,50,ab ai aj ac al an v aq aa an,,
1,serial_path,7,A B C D E F G,0.0,1.0
2,serial_path,5,H I J K L,5.0,0.5


## Three serials

In [43]:
# Three interleaved serial paths sharing one label stream:
#  - two 7-label paths, 25 units apart, with jitter up to 5 and up to 10,
#    both returned on every call;
#  - a 10-label path, 10 units apart, jitter up to 5, returned with
#    probability 0.5.
logs = synthetic_logs()
logs.add( serial_path( size=7, every=25, error=5, probability=1)  )
logs.add( serial_path( size=7, every=25, error=10, probability=1)  )
logs.add( serial_path( size=10, every=10, error=5, probability=0.5)  )

In [44]:
logs.generate_instances(100)
logs.show_instances(20)

  1 : [A H B I C J K D E L M F G N]
  2 : [H A O P Q I B R S T J U C V W K X D L E M F N G]
  3 : [A H B I C J D K E L F M G N]
  4 : [A H B I J C K D L E M F G N]
  5 : [A O H P B Q R I S C T U V D J W K X E L F M G N]
  6 : [A H B I C J D K E L F G M N]
  7 : [A H B I C J D K E L F M G N]
  8 : [A H B I C J K D L E M F N G]
  9 : [A O H P Q R S T I B U V J W C X D K L E F M N G]
 10 : [O A H P I Q B R S J T C U V K W D X L E M F N G]
 11 : [H A B I C J D K E L F G M N]
 12 : [A H B I C J K D E L F M G N]
 13 : [A H B I C J K D L E M F N G]
 14 : [A O H P Q B I R S C J T U K V D L W X E M N F G]
 15 : [A H I B C J K D L E F M G N]
 16 : [A O H P Q R B I S T C J U V K D W X L E M F N G]
 17 : [H A O P I Q B R J S C T K U V D W L X E M F N G]
 18 : [A H B I C J D E K F L G M N]
 19 : [A O H P Q B I R S C J T U V D K W X E L M F N G]
 20 : [A H B I J C K D L E M F N G]


In [42]:
pd.DataFrame(logs.describe()['generators'])

Unnamed: 0,class,#symbols,example,error,probability
0,serial_path,7,A B C D E F G,0,1.0
1,serial_path,7,H I J K L M N,10,1.0
2,serial_path,10,O P Q R S T U V W X,5,0.5
