# Synthetic Dataset Generator

In [372]:
import random
import pandas as pd

class synthetic_logs:
    """Build synthetic event logs by merging traces from pluggable generators.

    A *trace* is a list of ``(timestamp, symbol)`` tuples.  Generators
    registered through :meth:`add` must provide ``generate_trace()`` and
    ``describe()``.  On registration each generator receives a ``log``
    attribute pointing back at this object, so it can draw fresh labels from
    ``log.symbol`` (upper-case alphabet) or ``log.noisy`` (lower-case
    alphabet).
    """

    _symbols = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    _noisy   = list("abcdefghijklmnopqrstuvwxyz")
    # Number of events in the fallback trace produced when no generator
    # has been registered.
    _default_trace_length = 5

    def __init__(self):
        self._gens = []
        self._used_symbols = []
        # Initialised here so show_instances() is safe to call before
        # generate_instances() has ever run.
        self.instances = []
        self.symbol = self.create_symbols(self._symbols)
        self.noisy = self.create_symbols(self._noisy)

    def create_symbols(self, symbols):
        """Yield an endless stream of unique labels built from *symbols*.

        All one-element labels come first, in the given order, then every
        two-element concatenation, then three, and so on
        (``A .. Z, AA, AB, .. ZZ, AAA, ..``).

        Raises:
            ValueError: on the first ``next()`` if *symbols* is empty, since
                no label could ever be produced.
        """
        alphabet = list(symbols)
        if not alphabet:
            raise ValueError("symbols must contain at least one label")
        level = alphabet
        while True:
            yield from level
            # Extend every label of the exhausted level by one more symbol.
            level = [prefix + suffix for prefix in level for suffix in alphabet]

    def generate_instances(self, instances=5):
        """Generate *instances* traces, store them on ``self.instances`` and return them."""
        self.instances = [self.generate_trace() for _ in range(instances)]
        return self.instances

    def generate_trace(self):
        """Return one trace as a list of ``(timestamp, symbol)`` tuples.

        Without registered generators the upper-case symbol stream is
        restarted and its first few labels are returned with timestamps
        ``0, 1, 2, ..``.  Otherwise the traces of all generators are
        concatenated and sorted by timestamp; the sort is stable, so events
        with equal timestamps keep generator registration order.
        """
        if not self._gens:
            self.symbol = self.create_symbols(self._symbols)
            return [(i, next(self.symbol))
                    for i in range(self._default_trace_length)]

        merged = []
        for generator in self._gens:
            merged.extend(generator.generate_trace())
        return sorted(merged, key=lambda event: event[0])

    def show_instances(self, head=10):
        """Print the symbols of at most *head* stored instances, one per line."""
        # max(..., 0) keeps a negative *head* printing nothing instead of
        # turning into a "drop the tail" slice.
        for number, instance in enumerate(self.instances[:max(head, 0)], start=1):
            labels = " ".join(label for _, label in instance)
            print("%3d : [%s]" % (number, labels))

    def add(self, generator):
        """Register *generator* and give it a back-reference via ``generator.log``."""
        generator.log = self
        self._gens.append(generator)

    def describe(self):
        """Summarise the registered generators.

        Returns ``{'generators': [...]}`` holding each generator's own
        ``describe()`` result, or an empty *list* when nothing is registered
        (that return shape is kept for backward compatibility).
        """
        if not self._gens:
            return []
        return {'generators': [gen.describe() for gen in self._gens]}
        
class noisy_path:
    """Generator of evenly spaced events whose symbols are picked at random.

    The pool of ``num_symbols`` noise labels is drawn lazily from
    ``self.log.noisy`` on the first call to :meth:`generate_trace`.  The
    ``log`` attribute is assigned by ``synthetic_logs.add``.

    Args:
        every: time step between consecutive events.
        num_symbols: size of the pool of distinct noise labels.
        count: number of events per trace.
    """

    def __init__(self, every, num_symbols, count):
        self.every = every
        self.num_symbols = num_symbols
        self.count = count
        # Empty until the first trace is generated; an empty list (rather
        # than False) keeps describe() usable before that point.
        self.noise = []
        self.last = []

    def generate_trace(self):
        """Return ``count`` events at times ``0, every, 2*every, ..``.

        Each event carries a symbol chosen uniformly at random from the
        pool.  The trace is also stored on ``self.last``.

        Raises:
            ValueError: if events are requested but the symbol pool is empty.
        """
        if not self.noise:
            self.noise = [next(self.log.noisy) for _ in range(self.num_symbols)]
        if self.count > 0 and not self.noise:
            raise ValueError("noisy_path needs num_symbols >= 1 to emit events")

        t = 0
        trace = []
        for _ in range(self.count):
            trace.append((t, random.choice(self.noise)))
            t += self.every

        self.last = trace
        return self.last

    def describe(self):
        """Return a summary dict: class name, pool size, and up to 10 symbols of the last trace."""
        return {
            'class': self.__class__.__name__,
            '#symbols': len(self.noise),
            'example': " ".join([symbol for (_, symbol) in self.last[:10]]),
        }
    
class serial_path:
    """Generator of a fixed sequence of symbols with jittered timestamps.

    The ``size`` symbols of the path are drawn lazily from
    ``self.log.symbol`` on the first call to :meth:`generate_trace` and are
    reused, in the same order, for every later trace.  The ``log`` attribute
    is assigned by ``synthetic_logs.add``.

    Args:
        size: number of symbols in the path.
        every: nominal time step between consecutive symbols.
        error: maximum integer jitter; the start time is drawn from
            ``[0, error]`` and every step from ``[-error, error]``.
        probability: chance that a generated trace is actually returned.
    """

    def __init__(self, size, every, error=0, probability=1):
        self.every = every
        self.size = size
        self.error = error
        self.probability = probability
        # Empty until the first trace is generated; an empty list (rather
        # than False) keeps describe() usable before that point.
        self.path = []
        self.last = []

    def generate_trace(self):
        """Return the path as ``(timestamp, symbol)`` tuples, or ``[]``.

        Timestamps never decrease, even when the jitter exceeds ``every``.
        The generated trace is always stored on ``self.last``; with
        probability ``1 - probability`` an empty list is returned instead.
        """
        if not self.path:
            self.path = [next(self.log.symbol) for _ in range(self.size)]

        t = random.randint(0, self.error)
        trace = []
        for symbol in self.path:
            trace.append((t, symbol))
            jitter = random.randint(-self.error, self.error)
            # Clamp so a large negative jitter cannot move time backwards.
            t = max(t, t + self.every + jitter)

        self.last = trace
        if random.random() < self.probability:
            return self.last
        return []

    def describe(self):
        """Return a summary dict: class name, path length, example symbols, error and probability."""
        return {
            'class': self.__class__.__name__,
            '#symbols': len(self.path),
            'example': " ".join([symbol for (_, symbol) in self.last[:10]]),
            'error': self.error,
            'probability': self.probability
        }

### Default values

In [350]:
# A log with no generators attached: traces fall back to the built-in default.
logs = synthetic_logs()

In [351]:
# With no generators, the trace is the first five upper-case symbols at times 0..4.
logs.generate_trace()

[(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')]

In [352]:
# Five traces by default (instances=5); they are also stored on logs.instances.
logs.generate_instances()

[[(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')],
 [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D'), (4, 'E')]]

In [353]:
# Prints only the symbols (timestamps omitted) of up to head=10 stored instances.
logs.show_instances()

  1 : [A B C D E]
  2 : [A B C D E]
  3 : [A B C D E]
  4 : [A B C D E]
  5 : [A B C D E]


In [354]:
# Returns an empty list when no generators are attached.
logs.describe()

[]

## Noise

In [355]:
# One noise generator: 5 events per trace, 10 time units apart,
# each symbol picked at random from a pool of 20 lower-case labels.
logs = synthetic_logs()
logs.add( noisy_path( every=10, num_symbols=20, count=5)  )

In [356]:
# Timestamps are fixed (0, 10, 20, ...); the symbols are re-drawn at random on each call.
logs.generate_trace()

[(0, 'o'), (10, 'h'), (20, 'm'), (30, 'n'), (40, 'm')]

In [357]:
# Generate the default five instances, then print their symbol sequences.
logs.generate_instances()
logs.show_instances()

  1 : [o a f j d]
  2 : [i m j l h]
  3 : [o o i m a]
  4 : [o g e s b]
  5 : [k e a g k]


In [358]:
# 'example' shows up to 10 symbols of the most recently generated noise trace.
logs.describe()

{'generators': [{'class': 'noisy_path',
   '#symbols': 20,
   'example': 'k e a g k'}]}

## Serial Paths

In [359]:
# One serial path: 5 symbols in fixed order, nominally 10 time units apart,
# with an integer jitter of up to +/-3 applied to every step.
logs = synthetic_logs()
logs.add( serial_path( size=5, every=10, error=3)  )

In [360]:
# The symbol order is fixed; only the timestamps vary from call to call.
logs.generate_trace()

[(2, 'A'), (11, 'B'), (21, 'C'), (28, 'D'), (36, 'E')]

In [361]:
# show_instances prints symbols only, so every instance of the serial path looks identical.
logs.generate_instances()
logs.show_instances()

  1 : [A B C D E]
  2 : [A B C D E]
  3 : [A B C D E]
  4 : [A B C D E]
  5 : [A B C D E]


In [362]:
# Summary of the single serial_path generator, including its error and probability settings.
logs.describe()

{'generators': [{'class': 'serial_path',
   '#symbols': 5,
   'example': 'A B C D E',
   'error': 3,
   'probability': 1}]}

## Noise + Serial

In [373]:
# Combine three generators in one log:
#   - noise: 30 events, 10 time units apart, from a pool of 50 lower-case labels
#   - a 7-symbol serial path with no jitter that is always emitted
#   - a 5-symbol serial path with jitter up to +/-5 that is emitted about half the time
logs = synthetic_logs()
logs.add( noisy_path( every=10, num_symbols=50, count=30)  )
logs.add( serial_path( size=7, every=25, error=0, probability=1)  )
logs.add( serial_path( size=5, every=25, error=5, probability=0.5)  )

In [374]:
# Events from all generators are merged and sorted by timestamp; show the first 7.
logs.generate_trace()[:7]

[(0, 'l'), (0, 'A'), (10, 'c'), (20, 'u'), (25, 'B'), (30, 'b'), (40, 'x')]

In [375]:
# Ten merged instances; the second serial path (H..L) appears only in some of them.
logs.generate_instances(10)
logs.show_instances()

  1 : [o A H c j B m I at au C m J e D n K s ag E p L ax F j v ab G a k ae an z ar w j f ae am al al av]
  2 : [p A H ar z B af I al aa C J r e K D w o L y E ab as F aj s u G n af ak h aj aq ao ae aw j v o aw as]
  3 : [b A b e B af ab y C au aa D ai z d E au ai F b f g G r o u t af ao z y t r au ak aw b]
  4 : [ak A b ag B ae ag h C h h D n ag n E t ap F aj c av G b ai ac p m i ax ae n ap w av ad ar]
  5 : [o A H i t I B af t J z C ap p K D a o L s E t av F o s q G q h w af m t d g ar aa ae aq e z]
  6 : [s A H c f B as I f l C J j ai D K ab at i E L ar r F w r u G o o ag as j ao ah af ab r aq aw a ag]
  7 : [al A H x n B I ar al at C J k g K D ab av L o E g ar F w au n G q f am aq aj p u aw al aw b ab j k]
  8 : [z A H aa a B I av aq J aq C h K ah D ak a L am E aj h F p e v G c ar z as ad x e an ax aw x g ah ac]
  9 : [x A z s B ad x an C g l D ae av al E aj ar F av av av G aj d f ab p x y m av f aw r ak i]
 10 : [d A v s B ao x n C s am D p c t E au y F ap ar aq G i an f b al ap ab 

In [376]:
# One row per generator; keys a generator does not report
# (error/probability for noisy_path) are left as missing values.
pd.DataFrame(logs.describe()['generators'])

Unnamed: 0,class,#symbols,example,error,probability
0,noisy_path,50,d v s ao x n s am p c,,
1,serial_path,7,A B C D E F G,0.0,1.0
2,serial_path,5,H I J K L,5.0,0.5
