## Synthetic Data Generator

This is the documentation for syntetic_data_generator.py, which defines a class that can be used to generate data for Machine Learning tests.

dataGenerator = syntetic_data_generator(defaultSeed=None, noiseSeed=None, permutationSeed=None)




The object can be initialized without any arguments, and its defaultSeed, noiseSeed, and permutationSeed can be changed later. If a seed is not provided, values are drawn from PyTorch's global random number generator, so results differ between runs; if no permutationSeed is provided, generated vectors are not permuted.

The object's default values can be changed using the functions:

changeSeed(newSeed)

changeNoiseSeed(noiseSeed)

changePermutationSeed(newPermutationSeed)

removeSeed()

removeNoiseSeed()

removePermutation()

The object has the following features:

* Generate a vector with a specific size, mean, variance and seed, optionally adding noise to it (with its own mean and variance) and applying a permutation.

* Generate a matrix with a specific number of rows and columns, mean, variance and seed.

* Generate a sparse matrix with a specific number of rows/columns, sparsity percentage and L1 norm.

* Generate a discrete matrix with a specific number of rows/columns whose entries are drawn from a predetermined array of values according to an array of probabilities (both PyTorch tensors).

* Generate a vector for autoregression from a given input vector, matrix (generated if None) and noise (generated if None). The noise is added to the input vector and the result is multiplied by the matrix. The output vector supports permutation as well.

* Generate a vector with a specific mean and variance and append to it a noise segment with its own mean and variance. Takes the size of the vector and the size of the noise as fundamental parameters.

* Permute a vector

* Add noise to a vector

So it is possible to set the seeds both at object creation and in each function call, while mean and variance are set in each function call.

Notice that using a specific seed will result in deterministic/replicable generations, while not using one will result in random generations.

# Full code + example usage

In [3]:
import torch
import random

class syntetic_data_generator:
    """Generate synthetic tensors (vectors and matrices) for ML tests.

    Three optional default seeds are stored on the instance and are used
    whenever a method is called without an explicit seed:

    * ``seed`` -- sampling of vectors and matrices.
    * ``noiseSeed`` -- sampling of additive noise.
    * ``permutationSeed`` -- shuffling of returned vectors; while it is
      ``None`` vectors are returned unshuffled.

    When no seed is available, values are drawn from PyTorch's global random
    number generator.

    Note:
        Every ``variance`` / ``noise_variance`` argument is forwarded to
        ``torch.normal`` as ``std``, i.e. it is used as the standard
        deviation of the distribution. This is kept as-is so that existing
        seeded results do not change.
    """

    def __init__(self, defaultSeed=None, noiseSeed=None, permutationSeed=None):
        """Store the default seeds (each may be ``None``)."""
        self.seed = defaultSeed
        self.noiseSeed = noiseSeed
        self.permutationSeed = permutationSeed

    def changeSeed(self, newSeed):
        """Set the default sampling seed and return ``self`` for chaining."""
        self.seed = newSeed
        return self

    def changeNoiseSeed(self, noiseSeed):
        """Set the default noise seed and return ``self`` for chaining."""
        self.noiseSeed = noiseSeed
        return self

    def removeSeed(self):
        """Clear the default sampling seed and return ``self`` for chaining."""
        self.seed = None
        return self

    def removeNoiseSeed(self):
        """Clear the default noise seed and return ``self`` for chaining."""
        self.noiseSeed = None
        return self

    def changePermutationSeed(self, newPermutationSeed):
        """Set the default permutation seed and return ``self`` for chaining."""
        self.permutationSeed = newPermutationSeed
        return self

    def removePermutation(self):
        """Clear the default permutation seed and return ``self`` for chaining."""
        self.permutationSeed = None
        return self

    @staticmethod
    def _sample_normal(shape, mean, std, seed):
        """Draw a tensor of the given shape from a normal distribution.

        A dedicated ``torch.Generator`` seeded with ``seed`` is used when
        ``seed`` is not ``None``; otherwise PyTorch's global generator is used.
        """
        generator = None
        if seed is not None:
            generator = torch.Generator().manual_seed(seed)
        return torch.normal(
            mean=torch.tensor(mean, dtype=torch.float32),
            std=torch.tensor(std),
            size=shape,
            generator=generator,
        )

    def generateVector(
        self,
        size,
        mean=0,
        variance=1,
        random_generator_seed=None,
        add_Noise=False,
        noise_mean=0,
        noise_variance=1,
        noise_generator_seed=None,
        permutation_seed=None,
    ):
        """Return a 1-D tensor of ``size`` normally distributed values.

        Args:
            size: number of elements.
            mean: mean of the distribution.
            variance: spread of the distribution (passed to ``torch.normal``
                as ``std``).
            random_generator_seed: sampling seed; falls back to ``self.seed``.
            add_Noise: when truthy, noise is added via ``add_noise_to_vector``.
            noise_mean, noise_variance, noise_generator_seed: forwarded to
                ``add_noise_to_vector`` when ``add_Noise`` is truthy.
            permutation_seed: shuffle seed; falls back to
                ``self.permutationSeed``.

        Returns:
            The (optionally noisy, optionally permuted) vector.
        """
        if random_generator_seed is None:
            random_generator_seed = self.seed

        # Sampled as a (1, size) tensor and unwrapped, exactly as before, so
        # that seeded results do not change.
        vector = self._sample_normal((1, size), mean, variance, random_generator_seed)[0]

        if not add_Noise:
            return self.permutate_vector(vector, permutation_seed)
        return self.add_noise_to_vector(
            vector, noise_mean, noise_variance, noise_generator_seed, permutation_seed
        )

    def add_noise_to_vector(
        self,
        vector,
        noise_mean=0,
        noise_variance=1,
        noise_generator_seed=None,
        permutation_seed=None,
    ):
        """Return ``vector`` plus normally distributed noise of the same length.

        Args:
            vector: 1-D tensor to perturb (it is not modified in place).
            noise_mean: mean of the noise.
            noise_variance: spread of the noise (passed to ``torch.normal``
                as ``std``).
            noise_generator_seed: noise seed; falls back to ``self.noiseSeed``.
            permutation_seed: shuffle seed; falls back to
                ``self.permutationSeed``.
        """
        if noise_generator_seed is None:
            noise_generator_seed = self.noiseSeed

        noise = self._sample_normal(
            (1, len(vector)), noise_mean, noise_variance, noise_generator_seed
        )[0]
        return self.permutate_vector(vector + noise, permutation_seed)

    def generateMatrix(self, rows, columns, mean=0, variance=1, random_generator_seed=None):
        """Return a ``rows`` x ``columns`` tensor of normally distributed values.

        ``variance`` is passed to ``torch.normal`` as ``std``;
        ``random_generator_seed`` falls back to ``self.seed``.
        """
        if random_generator_seed is None:
            random_generator_seed = self.seed
        return self._sample_normal((rows, columns), mean, variance, random_generator_seed)

    def generate_Sparse_Matrix(self, rows, columns, sparsityPercentage, random_generator_seed=None, l1_norm=None):
        """Return a random matrix with a fraction of its entries set to zero.

        Args:
            rows, columns: shape of the matrix.
            sparsityPercentage: fraction (0-1) of entries to zero out.
            random_generator_seed: seed for both the values and the choice of
                zeroed positions; falls back to ``self.seed``.
            l1_norm: when given, the matrix is rescaled so that the sum of the
                absolute values of its entries equals this number.

        Raises:
            ValueError: if ``sparsityPercentage`` is outside ``[0, 1]``.
        """
        if not 0 <= sparsityPercentage <= 1:
            raise ValueError("sparsityPercentage must be between 0 and 1.")

        # Resolve the seed once so the zero mask is as reproducible as the values.
        if random_generator_seed is None:
            random_generator_seed = self.seed

        generatedMatrix = self.generateMatrix(rows, columns, random_generator_seed=random_generator_seed)

        # A private Random instance yields the same sample as seeding the
        # module-level generator, without clobbering global random state.
        total_entries = rows * columns
        rng = random.Random(random_generator_seed)
        indices = rng.sample(range(total_entries), int(sparsityPercentage * total_entries))
        if indices:
            generatedMatrix.view(-1)[indices] = 0

        if l1_norm is not None:
            current_l1_norm = generatedMatrix.norm(p=1)
            # An all-zero matrix cannot be rescaled; dividing would give NaN.
            if current_l1_norm > 0:
                generatedMatrix = generatedMatrix * (l1_norm / current_l1_norm)

        return generatedMatrix

    def generate_discrete_matrix(self, rows, columns, probabilities_tensor, values_tensor, random_generator_seed=None):
        """Return a ``rows`` x ``columns`` matrix of entries drawn from ``values_tensor``.

        Each entry is ``values_tensor[i]`` where ``i`` is sampled with
        replacement according to ``probabilities_tensor``.

        Args:
            rows, columns: shape of the matrix.
            probabilities_tensor: 1-D tensor of sampling weights.
            values_tensor: 1-D tensor of candidate values, same length.
            random_generator_seed: sampling seed; falls back to ``self.seed``.

        Raises:
            ValueError: if the two tensors differ in length.
        """
        if len(probabilities_tensor) != len(values_tensor):
            raise ValueError("Each element from values_tensor has a probability defined in probabilities_tensor, so their sizes must match.")

        if random_generator_seed is None:
            random_generator_seed = self.seed

        generator = None
        if random_generator_seed is not None:
            # Seed with the resolved seed, not unconditionally with self.seed.
            generator = torch.Generator().manual_seed(random_generator_seed)

        # The leading empty float tensor keeps the shape for rows == 0 and the
        # dtype promotion of the previous row-by-row vstack; stacking once
        # avoids re-copying the accumulated result on every row.
        sampled_rows = [torch.empty((0, columns))]
        for _ in range(rows):
            samples = torch.multinomial(
                probabilities_tensor, num_samples=columns, replacement=True, generator=generator
            )
            sampled_rows.append(values_tensor[samples])
        return torch.vstack(sampled_rows)

    def generate_for_auto_regression(
        self,
        basePoint_vector,
        noise=None,
        matrix=None,
        auto_generated_matrix_size=None,
        pemrutation_seed=None,
        permutation_seed=None,
    ):
        """Return ``(basePoint_vector + noise) @ matrix``, optionally permuted.

        Args:
            basePoint_vector: 1-D input tensor (it is not modified in place).
            noise: tensor added to the input; when ``None`` standard normal
                noise is sampled using ``self.noiseSeed``.
            matrix: matrix to multiply by; when ``None`` a
                ``len(basePoint_vector)`` x ``auto_generated_matrix_size``
                matrix is generated with ``generateMatrix``.
            auto_generated_matrix_size: output length when the matrix is
                generated; defaults to ``len(basePoint_vector)``.
            pemrutation_seed: misspelled legacy name of ``permutation_seed``,
                kept so existing keyword/positional callers keep working.
            permutation_seed: shuffle seed; falls back to ``pemrutation_seed``
                and then to ``self.permutationSeed``.
        """
        if permutation_seed is None:
            permutation_seed = pemrutation_seed

        if auto_generated_matrix_size is None:
            auto_generated_matrix_size = len(basePoint_vector)

        if noise is None:
            noise = self._sample_normal((1, len(basePoint_vector)), 0, 1, self.noiseSeed)[0]
        vector = basePoint_vector + noise

        if matrix is None:
            # (n, m) so that a length-n vector times the matrix has length m.
            matrix = self.generateMatrix(len(vector), auto_generated_matrix_size)

        return self.permutate_vector(vector @ matrix, permutation_seed)

    def generate_augmented_random_vector(
        self,
        size_vector,
        size_noise,
        vector_mean=0,
        vector_variance=1,
        noise_mean=0,
        noise_variance=1,
        random_generator_seed=None,
        noise_generator_seed=None,
        permutation_seed=None,
    ):
        """Return a random vector with a random noise segment appended to it.

        Both parts are produced by ``generateVector`` and concatenated
        (vector first, noise second); the result has
        ``size_vector + size_noise`` elements.

        Args:
            size_vector, size_noise: lengths of the two parts.
            vector_mean, vector_variance: distribution of the first part.
            noise_mean, noise_variance: distribution of the noise part.
            random_generator_seed: seed of the first part; falls back to
                ``self.seed``.
            noise_generator_seed: seed of the noise part; falls back to
                ``self.noiseSeed`` and, if that is also ``None``, to
                ``self.seed`` inside ``generateVector``.
            permutation_seed: shuffle seed for the concatenated result; falls
                back to ``self.permutationSeed``.
        """
        # Prefer the noise seed so the noise is not a replay of the vector's
        # own random stream when only default seeds are in use.
        if noise_generator_seed is None:
            noise_generator_seed = self.noiseSeed

        vector = self.generateVector(size_vector, vector_mean, vector_variance, random_generator_seed)
        noise = self.generateVector(size_noise, noise_mean, noise_variance, noise_generator_seed)

        return self.permutate_vector(torch.cat((vector, noise), dim=0), permutation_seed)

    def permutate_vector(self, vector, permutation_seed=None):
        """Return ``vector`` shuffled with a seeded random permutation.

        ``permutation_seed`` falls back to ``self.permutationSeed``; when both
        are ``None`` the vector is returned unchanged.
        """
        if permutation_seed is None:
            permutation_seed = self.permutationSeed
        if permutation_seed is None:
            return vector

        generator = torch.Generator().manual_seed(permutation_seed)
        perm = torch.randperm(len(vector), generator=generator)
        return vector[perm]

# Example usage

dataGenerator = syntetic_data_generator(defaultSeed=10, noiseSeed=3, permutationSeed=4)
# Create an instance of syntetic_data_generator with a default seed, a noise seed and a permutation seed.

# Generate a 4x4 matrix and a 10-element vector:
matrix = dataGenerator.generateMatrix(4, 4)
vector = dataGenerator.generateVector(10)

# Generate a vector with an explicit seed without altering the seed stored in dataGenerator:
vector = dataGenerator.generateVector(10, random_generator_seed=10)

# Change the default seed of the dataGenerator:
dataGenerator.changeSeed(5)

# Remove the default seed; later calls without an explicit seed are no longer seeded by the object:
dataGenerator.removeSeed()

# Add noise to a vector:
vector = dataGenerator.add_noise_to_vector(vector)

# Remove the default permutation (removePermutation returns the same object):
dataGenerator = dataGenerator.removePermutation()

# Generate a matrix with a specific mean and variance:
matrix = dataGenerator.generateMatrix(2, 2, mean=2, variance=3)


# Generate a sparse matrix with a given sparsity (fraction between 0 and 1) and a given L1 norm:
sparseMatrix = dataGenerator.generate_Sparse_Matrix(5,5, sparsityPercentage=0.6, l1_norm=1)

print(sparseMatrix)
print(vector)
print(matrix)

tensor([[-0.0048,  0.1311,  0.0000,  0.0000, -0.1402],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0097,  0.0000,  0.0923,  0.0000,  0.0000],
        [ 0.0000, -0.1627, -0.0461,  0.0000, -0.1233],
        [ 0.0000,  0.0000,  0.0256,  0.2641,  0.0000]])
tensor([ 0.2019, -0.9120,  0.6290,  0.0352, -0.1809, -1.4829,  1.9940, -2.4474,
         1.0947, -2.5959])
tensor([[-4.8181,  3.6741],
        [ 2.6711, -2.1031]])
