## Synthetic Data Generator

This is the documentation for syntetic_data_generator.py, which provides a class that can be used to generate data for Machine Learning tests.

dataGenerator = syntetic_data_generator.vector(defaultSeed=None, noiseSeed=None, permutationSeed=None)




The object can be initialized without any arguments, and its defaultSeed, noiseSeed and permutationSeed can be changed afterwards. If these parameters are not given, values and noise are drawn without a fixed seed (so results differ between runs), and no permutation is applied to generated vectors by default.

The object's default values can be changed using the functions:

setSeed(newSeed)

setNoiseSeed(noiseSeed)

setPermutation(newPermutationSeed)


The object has the following features:

* Generate vector with specific size, mean, variance, seed, add noise to it (or not), change specific mean and variance for the noise and make permutation.

* Generate a matrix with a specific number of rows and columns, mean, variance and seed.

* Generate a sparse matrix with a specific number of rows/columns, sparsity percentage and L1 norm.

* Generate a discrete matrix with values pre-determined with specific number of rows/columns, a values and probabilities array (pytorch tensors).

* Generate a vector for autoregression with specific vector input, matrix (will generate if =None) and noise (will generate if =None). Will generate vector, add noise to it and multiply with matrix. The output vector supports permutation as well.

* Generate a vector with specific mean and variance and append to it a noise vector with its own mean and variance. Takes the size of the vector and the size of the noise as fundamental parameters.

* Permutate a vector

* Add noise to a vector

So it is possible to define specific seeds both at object creation and in each function call, while mean and variance are passed to the generating functions.

Notice that using a specific seed will result in deterministic/replicable generations, while not using one will result in random generations.

# Full Code + example usage

In [1]:
import torch
import random

class syntetic_data_generator:
    """Namespace grouping synthetic-data generators for machine-learning tests.

    Two generators are exposed as nested classes:

    * ``vector`` -- random vectors, optional additive noise and permutation.
    * ``matrix`` -- dense, sparse and discrete-valued random matrices.

    Passing a seed (at construction or per call) makes a generation
    reproducible; leaving every seed as ``None`` uses torch's default
    random state.
    """

    @staticmethod
    def _sample_normal(shape, mean, variance, seed):
        """Draw a tensor of the given ``shape`` from a normal distribution.

        NOTE: ``variance`` is forwarded to ``torch.normal`` as its ``std``
        argument, i.e. it is used as the standard deviation.

        Args:
            shape: Tuple with the size of the tensor to generate.
            mean: Mean of the distribution.
            variance: Spread of the distribution (see note above).
            seed: Integer seed for a dedicated generator, or ``None`` to
                draw from torch's default random state.

        Returns:
            The sampled tensor.
        """
        generator = None
        if seed is not None:
            generator = torch.Generator().manual_seed(seed)
        return torch.normal(
            mean=torch.tensor(mean, dtype=torch.float32),
            std=torch.tensor(variance),
            size=shape,
            generator=generator,
        )

    class vector:
        """Generator of random vectors with optional noise and permutation."""

        def __init__(self, defaultSeed=None, noiseSeed=None, permutationSeed=None):
            """Store the default seeds used when a call does not pass its own.

            Args:
                defaultSeed: Default seed for the generated values.
                noiseSeed: Default seed for the generated noise.
                permutationSeed: Default seed for permutations; ``None``
                    disables permutation by default.
            """
            self.seed = defaultSeed
            self.noiseSeed = noiseSeed
            self.permutationSeed = permutationSeed

        def setSeed(self, newSeed):
            """Replace the default value seed and return ``self``."""
            self.seed = newSeed
            return self

        def setNoiseSeed(self, noiseSeed):
            """Replace the default noise seed and return ``self``."""
            self.noiseSeed = noiseSeed
            return self

        def setPermutation(self, newPermutationSeed):
            """Replace the default permutation seed and return ``self``."""
            self.permutationSeed = newPermutationSeed
            return self

        def generateVector(self, size, mean=0, variance=1, random_generator_seed=None, add_Noise=False, noise_mean=0, noise_variance = 1, noise_generator_seed=None, permutation_seed=None):
            """Generate a random vector, optionally noisy and permuted.

            Args:
                size: Number of elements of the vector.
                mean: Mean of the values.
                variance: Spread of the values (passed to ``torch.normal``
                    as ``std``).
                random_generator_seed: Seed for the values; falls back to
                    the instance default seed when ``None``.
                add_Noise: When true, noise is added to the vector.
                noise_mean: Mean of the noise.
                noise_variance: Spread of the noise (passed as ``std``).
                noise_generator_seed: Seed for the noise; falls back to the
                    instance noise seed when ``None``.
                permutation_seed: Seed for the final permutation; falls back
                    to the instance permutation seed when ``None``.

            Returns:
                A 1-D tensor with ``size`` elements.
            """
            if random_generator_seed is None:
                random_generator_seed = self.seed

            # Sampled as (1, size) and unwrapped, as the vector has always been.
            vector = syntetic_data_generator._sample_normal(
                (1, size), mean, variance, random_generator_seed
            )[0]

            if not add_Noise:
                return self.permutate_vector(vector, permutation_seed)
            return self.add_noise_to_vector(vector, noise_mean, noise_variance, noise_generator_seed, permutation_seed)

        def add_noise_to_vector(self, vector, noise_mean=0, noise_variance=1, noise_generator_seed=None, permutation_seed=None):
            """Return ``vector`` plus normal noise, permuted if a seed applies.

            Args:
                vector: 1-D tensor to perturb (it is not modified).
                noise_mean: Mean of the noise.
                noise_variance: Spread of the noise (passed as ``std``).
                noise_generator_seed: Seed for the noise; falls back to the
                    instance noise seed when ``None``.
                permutation_seed: Seed for the permutation; falls back to the
                    instance permutation seed when ``None``.

            Returns:
                A new 1-D tensor with the same length as ``vector``.
            """
            if noise_generator_seed is None:
                noise_generator_seed = self.noiseSeed

            noise = syntetic_data_generator._sample_normal(
                (1, len(vector)), noise_mean, noise_variance, noise_generator_seed
            )[0]
            return self.permutate_vector(vector + noise, permutation_seed)

        def generate_for_auto_regression(self, basePoint_vector, noise=None, matrix=None, auto_generated_matrix_size = None, permutation_seed=None):
            """Compute ``(basePoint_vector + noise) @ matrix`` for autoregression data.

            Args:
                basePoint_vector: 1-D input tensor (it is not modified).
                noise: Tensor added to the input; generated with
                    ``add_noise_to_vector`` when ``None``.
                matrix: Matrix the noisy vector is multiplied with; generated
                    when ``None``.
                auto_generated_matrix_size: Length of the output when the
                    matrix is generated; defaults to the input length.
                permutation_seed: Seed for the permutation of the result;
                    falls back to the instance permutation seed when ``None``.

            Returns:
                A 1-D tensor holding the (optionally permuted) product.
            """
            if noise is None:
                # add_noise_to_vector also applies the instance's default
                # permutation (if any) before the matrix product.
                vector = self.add_noise_to_vector(basePoint_vector)
            else:
                # Out-of-place addition: the caller's tensor is left untouched.
                vector = basePoint_vector + noise

            if matrix is None:
                if auto_generated_matrix_size is None:
                    auto_generated_matrix_size = len(vector)
                # Rows must match the vector length for ``vector @ matrix``;
                # the columns give the size of the output.
                matrix = self.generateMatrix(len(vector), auto_generated_matrix_size)

            return self.permutate_vector(vector @ matrix, permutation_seed)

        def generate_augmented_random_vector(self, size_vector, size_noise, vector_mean=0, vector_variance=1, noise_mean=0, noise_variance=1, random_generator_seed=None, noise_generator_seed=None, permutation_seed=None):
            """Generate a random vector followed by a block of noise values.

            Args:
                size_vector: Number of elements of the leading vector.
                size_noise: Number of noise elements appended after it.
                vector_mean: Mean of the leading vector.
                vector_variance: Spread of the leading vector (passed as ``std``).
                noise_mean: Mean of the noise block.
                noise_variance: Spread of the noise block (passed as ``std``).
                random_generator_seed: Seed for the leading vector.
                noise_generator_seed: Seed for the noise block; falls back to
                    the instance noise seed when ``None``.
                permutation_seed: Seed for the permutation of the result;
                    falls back to the instance permutation seed when ``None``.

            Returns:
                A 1-D tensor with ``size_vector + size_noise`` elements.
            """
            if noise_generator_seed is None:
                # Prefer the dedicated noise seed over the value seed that
                # generateVector would otherwise fall back to.
                noise_generator_seed = self.noiseSeed

            vector = self.generateVector(size_vector, vector_mean, vector_variance, random_generator_seed)
            noise = self.generateVector(size_noise, noise_mean, noise_variance, noise_generator_seed)

            return self.permutate_vector(torch.cat((vector, noise), dim=0), permutation_seed)

        def permutate_vector(self, vector, permutation_seed=None):
            """Return ``vector`` reordered by a seeded random permutation.

            Args:
                vector: 1-D tensor to permute.
                permutation_seed: Seed for the permutation; falls back to the
                    instance permutation seed when ``None``.

            Returns:
                The permuted tensor, or ``vector`` itself when no seed is
                available.
            """
            if permutation_seed is None:
                permutation_seed = self.permutationSeed
            if permutation_seed is None:
                return vector

            generator = torch.Generator().manual_seed(permutation_seed)
            perm = torch.randperm(len(vector), generator=generator)
            return vector[perm]

        def generateMatrix(self, rows, columns, mean=0, variance=1, random_generator_seed=None):
            """Generate a ``rows`` x ``columns`` matrix of normal values.

            Args:
                rows: Number of rows.
                columns: Number of columns.
                mean: Mean of the values.
                variance: Spread of the values (passed as ``std``).
                random_generator_seed: Seed for the values; falls back to the
                    instance default seed when ``None``.

            Returns:
                A 2-D tensor of shape ``(rows, columns)``.
            """
            if random_generator_seed is None:
                random_generator_seed = self.seed
            return syntetic_data_generator._sample_normal(
                (rows, columns), mean, variance, random_generator_seed
            )

    class matrix:
        """Generator of dense, sparse and discrete-valued random matrices."""

        def __init__(self, defaultSeed=None):
            """Store the default seed used when a call does not pass its own."""
            self.seed = defaultSeed

        def setSeed(self, newSeed):
            """Replace the default seed and return ``self``."""
            self.seed = newSeed
            return self

        def generateMatrix(self, rows, columns, mean=0, variance=1, random_generator_seed=None):
            """Generate a ``rows`` x ``columns`` matrix of normal values.

            Args:
                rows: Number of rows.
                columns: Number of columns.
                mean: Mean of the values.
                variance: Spread of the values (passed as ``std``).
                random_generator_seed: Seed for the values; falls back to the
                    instance default seed when ``None``.

            Returns:
                A 2-D tensor of shape ``(rows, columns)``.
            """
            if random_generator_seed is None:
                random_generator_seed = self.seed
            return syntetic_data_generator._sample_normal(
                (rows, columns), mean, variance, random_generator_seed
            )

        def generate_Sparse_Matrix(self, rows, columns, sparsityPercentage, random_generator_seed=None, l1_norm =None):
            """Generate a normal matrix with a fraction of its entries zeroed.

            Args:
                rows: Number of rows.
                columns: Number of columns.
                sparsityPercentage: Fraction (0 to 1) of entries set to zero.
                random_generator_seed: Seed for both the values and the
                    positions of the zeros; falls back to the instance
                    default seed when ``None``.
                l1_norm: When given, the matrix is rescaled so that the sum
                    of the absolute values of its entries equals this value.

            Returns:
                A 2-D tensor of shape ``(rows, columns)``.
            """
            if random_generator_seed is None:
                random_generator_seed = self.seed

            generatedMatrix = self.generateMatrix(rows, columns, random_generator_seed=random_generator_seed)

            # A private Random instance keeps the zero pattern reproducible
            # without reseeding the process-wide ``random`` state.
            rng = random.Random(random_generator_seed)
            total = rows * columns
            indices = rng.sample(range(total), int(sparsityPercentage * total))
            if indices:
                generatedMatrix.view(-1)[indices] = 0

            if l1_norm is not None:
                current_l1_norm = generatedMatrix.norm(p=1)
                # An all-zero matrix cannot be rescaled; skip to avoid NaNs.
                if current_l1_norm > 0:
                    generatedMatrix = generatedMatrix * (l1_norm / current_l1_norm)

            return generatedMatrix

        def generate_discrete_matrix(self, rows, columns, probabilities_tensor, values_tensor, random_generator_seed=None):
            """Generate a matrix whose entries are drawn from ``values_tensor``.

            Args:
                rows: Number of rows.
                columns: Number of columns.
                probabilities_tensor: 1-D tensor with the weight of each value.
                values_tensor: 1-D tensor with the candidate values.
                random_generator_seed: Seed for the sampling; falls back to
                    the instance default seed when ``None``.

            Returns:
                A 2-D tensor of shape ``(rows, columns)``.

            Raises:
                ValueError: If the two tensors have different lengths.
            """
            if len(probabilities_tensor) != len(values_tensor):
                raise ValueError("Each element from values_tensor has a probability defined in probabilities_tensor, so their sizes must match.")

            if random_generator_seed is None:
                random_generator_seed = self.seed

            generator = None
            if random_generator_seed is not None:
                generator = torch.Generator().manual_seed(random_generator_seed)

            # The leading empty block keeps the result shape (0, columns) when
            # rows == 0 and the same dtype promotion as row-by-row stacking.
            sampled_rows = [torch.empty((0, columns))]
            for _ in range(rows):
                samples = torch.multinomial(probabilities_tensor, num_samples=columns, replacement=True, generator=generator)
                sampled_rows.append(values_tensor[samples])
            return torch.vstack(sampled_rows)


# Example usage

# Vector generator without default seeds: every call is random unless a seed
# is passed explicitly.
dataGenerator = syntetic_data_generator.vector()
print("\nRandom vector without seed: ")
print(dataGenerator.generateVector(10))
print("\nRandom vector with defined seed:")
print(dataGenerator.generateVector(10, random_generator_seed=10))
print("\nRandom vector with defined seed AND permutated:")
print(dataGenerator.generateVector(10, random_generator_seed=10, permutation_seed=10))

# The same result can be obtained by setting the generator's default seeds:
dataGenerator = syntetic_data_generator.vector(defaultSeed=10, permutationSeed=10)
print("\nSame vector as before:")
vector = dataGenerator.generateVector(10)
print(vector)

matrixGenerator = syntetic_data_generator.matrix()
print("\nGenerate matrix:")
matrix = matrixGenerator.generateMatrix(5, 5)
print(matrix)

print("\nGenerate for auto regression:")
print(dataGenerator.generate_for_auto_regression(vector))

print("\nSparse matrix:")
print(matrixGenerator.generate_Sparse_Matrix(5, 5, 0.4))

print("\nMatrix from a set:")

probabilities = torch.tensor([0.1, 0.2, 0.1, 0.4, 0.4, 0.3, 0.8])
inputs = torch.tensor([3, 4, 5, 6, 7, 8, 1])
# Each entry of `probabilities` is the weight of the value at the same
# position in `inputs`: 3 has weight 0.1, 4 has weight 0.2, 5 has weight 0.1,
# and so on.
# The weights do not need to add up to 1: torch.multinomial treats them as
# relative weights, so the program still works.
print(matrixGenerator.generate_discrete_matrix(5, 5, probabilities, inputs))


Random vector without seed: 
tensor([ 0.7546, -0.4530,  0.3910,  0.1031, -0.8767,  0.7235, -1.3748,  1.0166,
         1.0166,  0.8083])

Random vector with defined seed:
tensor([-0.6014, -1.0122, -0.3023, -1.2277,  0.9198, -0.3485, -0.8692, -0.9582,
        -1.1920,  1.9050])

Random vector with defined seed AND permutated:
tensor([-0.9582,  0.9198, -0.8692, -1.0122, -1.2277, -0.3485,  1.9050, -1.1920,
        -0.3023, -0.6014])

Same vector as before:
tensor([-0.9582,  0.9198, -0.8692, -1.0122, -1.2277, -0.3485,  1.9050, -1.1920,
        -0.3023, -0.6014])

Generate matrix:
tensor([[-0.5844,  0.6254, -0.5640, -0.4025,  1.3613],
        [ 2.2634,  0.1119, -0.2122, -1.4166,  0.6963],
        [ 0.8214, -0.9992,  0.6348, -1.9531, -0.1019],
        [ 0.9466, -1.0381,  1.0316, -0.3503, -1.3526],
        [ 1.6424,  0.3041,  0.6766,  1.6267,  0.2969]])
\Generate for auto regression:
tensor([-8.6192,  8.7185, -2.6730, -4.9850,  1.1519,  6.6910, -5.0082, -1.2127,
         0.7645, -0.4532])

Sp