# Generating Artificial Data for Simulations

This notebook generates artificial data for simulations. It defines the data generation functions, covering a basic simulation example and scenarios in which parameters are varied. The generated data is saved to disk; because of its size, only an example of the generated data is included, in the `artificial_example` folder.

## Packages

In [2]:
import os
import numpy as np
from tqdm import tqdm
import time
import dill

from generate_artificial_data_fun import *

## Simulation functions

### Basic simulation function

In [3]:
def basic_simulations_scenario():
    """Return the parameter set of the baseline simulation scenario.

    Returns:
        dict with three entries:
          * 'simulations_sample_fa_params'  -- settings of the FA part
            (K, N, D, M, sparsity fractions and per-view 'var_weights');
          * 'simulations_sample_ctm_params' -- settings of the CTM part
            (N, G, L, sentence/word counts, mu0, Sigma0, topics_params);
          * 'which_ctm' -- per-view flags; only the last view is flagged True.
    """
    n_obs = 250                 # number of observations
    n_views = 3                 # number of views
    n_factors = 5               # number of hidden factors
    view_dims = [10, 10, 10]    # number of features in views
    n_words = 100               # number of distinct words
    # the last view is not observed directly (we observe words), so its
    # feature count doubles as the number of topics
    n_topics = view_dims[2]

    # CTM covariance: ones on the diagonal, 0.5 on the first sub- and
    # super-diagonal, zeros elsewhere
    cov = np.eye(n_topics) + 0.5*np.eye(n_topics, k=-1) + 0.5*np.eye(n_topics, k=1)
    mean = np.zeros(n_topics)

    # FA data variances: a (D_m x K) matrix of ones per view, with the
    # columns listed below zeroed out
    zeroed_columns = ([1, 4], [2], slice(3, None))
    view_weights = []
    for dim, cols in zip(view_dims, zeroed_columns):
        weights = np.ones((dim, n_factors))
        weights[:, cols] = 0
        view_weights.append(weights)

    fa_settings = {
        'K': n_factors,
        'N': n_obs,
        'D': view_dims,
        'M': n_views,
        'sparsity_factor_fraction': 0,
        'sparsity_feature_fraction': 0.1,
        'var_weights': view_weights,
    }
    ctm_settings = {
        'N': n_obs,
        'G': n_words,
        'L': n_topics,
        'sentences_per_observation': 'constant',
        'sentences_per_observation_params': {'const': 100},
        'words_per_sentence': 10,
        'FA': True,
        'mu0': mean,
        'Sigma0': 5*cov,
        'topics_params': np.ones(n_words),
    }

    return {
        'simulations_sample_fa_params': fa_settings,
        'simulations_sample_ctm_params': ctm_settings,
        'which_ctm': [False, False, True],
    }

In [4]:
def sparsity_simulations_scenario():
    """Return the parameter set of the baseline sparsity scenario.

    Same layout as the basic scenario, but the first two views have 500
    features each (the last view keeps 10).

    Returns:
        dict with the keys 'simulations_sample_fa_params',
        'simulations_sample_ctm_params' and 'which_ctm'.
    """
    N, M, K, G = 250, 3, 5, 100   # observations, views, hidden factors, distinct words
    D = [500, 500, 10]            # number of features in views
    L = D[2]                      # number of topics (last view is observed through words)

    # covariance matrix in CTM: unit diagonal, 0.5 on the neighbouring diagonals
    Sigma0 = np.eye(L) + 0.5*np.eye(L, k=-1) + 0.5*np.eye(L, k=1)

    # data variances in FA: start from all-ones and switch off selected factors
    w_view0 = np.ones((D[0], K))
    w_view0[:, [1, 4]] = 0
    w_view1 = np.ones((D[1], K))
    w_view1[:, 2] = 0
    w_view2 = np.ones((D[2], K))
    w_view2[:, 3:] = 0

    fa_params = dict([
        ('K', K),
        ('N', N),
        ('D', D),
        ('M', M),
        ('sparsity_factor_fraction', 0),
        ('sparsity_feature_fraction', 0.1),
        ('var_weights', [w_view0, w_view1, w_view2]),
    ])
    ctm_params = dict([
        ('N', N),
        ('G', G),
        ('L', L),
        ('sentences_per_observation', 'constant'),
        ('sentences_per_observation_params', {'const': 100}),
        ('words_per_sentence', 10),
        ('FA', True),
        ('mu0', np.zeros(L)),
        ('Sigma0', 5*Sigma0),
        ('topics_params', np.ones(G)),
    ])

    result = {}
    result['simulations_sample_fa_params'] = fa_params
    result['simulations_sample_ctm_params'] = ctm_params
    result['which_ctm'] = [False, False, True]
    return result

### Simulation scenarios - varying parameters

In [5]:
def sim_scenario_scaling_weights(param):
    """Basic scenario with a per-view 'muFA_scale' vector added to the FA settings.

    Args:
        param: value stored in the last entry of 'muFA_scale'; every other
            view keeps a scale of 1.

    Returns:
        The basic scenario dict with
        ['simulations_sample_fa_params']['muFA_scale'] set.
    """
    params = basic_simulations_scenario()
    fa_settings = params['simulations_sample_fa_params']

    # one scale per view, all ones except the last view
    scales = np.ones((fa_settings['M']))
    scales[fa_settings['M'] - 1] = param
    fa_settings['muFA_scale'] = scales

    return params

In [6]:
def sim_scenario_scaling_Sigma0(param):
    """Basic scenario with the CTM covariance 'Sigma0' multiplied by `param`.

    Args:
        param: multiplicative factor applied to the basic scenario's 'Sigma0'.

    Returns:
        The basic scenario dict with the rescaled 'Sigma0'.
    """
    params = basic_simulations_scenario()
    ctm_settings = params['simulations_sample_ctm_params']
    ctm_settings['Sigma0'] = param*ctm_settings['Sigma0']
    return params

In [7]:
def sim_scenario_scaling_topics_param(param):
    """Basic scenario with the CTM 'topics_params' vector multiplied by `param`.

    Args:
        param: multiplicative factor applied to the basic scenario's
            'topics_params'.

    Returns:
        The basic scenario dict with the rescaled 'topics_params'.
    """
    params = basic_simulations_scenario()
    ctm_settings = params['simulations_sample_ctm_params']
    ctm_settings['topics_params'] = param*ctm_settings['topics_params']
    return params

In [8]:
def sim_scenario_scaling_mu0(param):
    """Basic scenario with a non-zero CTM mean 'mu0'.

    A reference mean vector `mu0_max` is built as the centred log of
    normalised, linearly increasing weights (from 1 to 3), and 'mu0' is set
    to ``(1 - param) * mu0_max``. Hence ``param = 1`` gives a zero mean and
    ``param = 0`` gives the full reference vector.

    Args:
        param: interpolation parameter described above.

    Returns:
        The basic scenario dict with 'mu0' replaced.
    """
    params = basic_simulations_scenario()
    ctm_settings = params['simulations_sample_ctm_params']

    # mu0 must have one entry per topic; take the length from the parameter
    # set rather than hard-coding it (it is 10 in the basic scenario).
    n_topics = ctm_settings['L']

    weights = np.linspace(1, 3, n_topics)
    mu0_max = np.log(weights/np.sum(weights))
    mu0_max = mu0_max - np.mean(mu0_max)  # centre around zero

    ctm_settings['mu0'] = (1 - param)*mu0_max

    return params

In [9]:
def sim_scenario_scaling_D_topics(param):
    """Basic scenario with the size of the last view (number of topics) rescaled.

    The last entry of 'D' becomes ``int(param * D[2])``; 'L', 'mu0', 'Sigma0'
    and 'var_weights' are rebuilt for the new size in the same way as in the
    basic scenario.

    Args:
        param: multiplicative factor applied to the last view's feature count.

    Returns:
        The basic scenario dict with the resized settings.
    """
    params = basic_simulations_scenario()
    fa_settings = params['simulations_sample_fa_params']
    ctm_settings = params['simulations_sample_ctm_params']

    n_factors = fa_settings['K']
    view_dims = fa_settings['D']
    view_dims[2] = int(param*view_dims[2])
    n_topics = view_dims[2]

    # covariance: unit diagonal, 0.5 on the first sub- and super-diagonal
    cov = np.eye(n_topics) + 0.5*np.eye(n_topics, k=-1) + 0.5*np.eye(n_topics, k=1)
    mean = np.zeros(n_topics)

    # per-view (D_m x K) weight matrices with selected factor columns zeroed
    zeroed_columns = ([1, 4], [2], slice(3, None))
    view_weights = []
    for dim, cols in zip(view_dims, zeroed_columns):
        weights = np.ones((dim, n_factors))
        weights[:, cols] = 0
        view_weights.append(weights)

    fa_settings['D'] = view_dims
    fa_settings['var_weights'] = view_weights

    ctm_settings['L'] = n_topics
    ctm_settings['mu0'] = mean
    ctm_settings['Sigma0'] = 5*cov

    return params

In [10]:
def sim_scenario_scaling_sparsity(param):
    """Sparsity scenario with 'sparsity_feature_fraction' interpolated by `param`.

    The fraction is ``0.1 + (0.7 - 0.1) * (1 - param)``, i.e. 0.1 for
    ``param = 1`` and 0.7 for ``param = 0``.

    Args:
        param: interpolation parameter described above.

    Returns:
        The sparsity scenario dict with the updated fraction.
    """
    params = sparsity_simulations_scenario()
    fa_settings = params['simulations_sample_fa_params']
    fa_settings['sparsity_feature_fraction'] = 0.1 + (0.7 - 0.1)*(1 - param)
    return params

### Generation

In [11]:
def sim_scenario(name, param=None):
    """Return the simulation parameter set for the scenario called `name`.

    Args:
        name: one of 'basic', 'scenario1' ... 'scenario5', 'basic_sparsity'
            or 'scenario6_sparsity'.
        param: scenario parameter forwarded to the scenario function; not
            used by 'basic' and 'basic_sparsity'.

    Returns:
        The parameter dict built by the matching scenario function.

    Raises:
        ValueError: if `name` is not a known scenario.
    """
    if name == 'basic':
        return basic_simulations_scenario()
    if name == 'scenario1':
        return sim_scenario_scaling_weights(param)
    if name == 'scenario2':
        return sim_scenario_scaling_topics_param(param)
    if name == 'scenario3':
        return sim_scenario_scaling_D_topics(param)
    if name == 'scenario4':
        return sim_scenario_scaling_mu0(param)
    if name == 'scenario5':
        return sim_scenario_scaling_Sigma0(param)
    if name == 'basic_sparsity':
        # The sparsity baseline takes no parameter; the scaling variant
        # requires one and cannot be called without it.
        return sparsity_simulations_scenario()
    if name == 'scenario6_sparsity':
        return sim_scenario_scaling_sparsity(param)

    raise ValueError(f"Unknown simulation scenario: {name!r}")

In [12]:
def generate_datasets(file_path, simulation_scenario, simulation_param, seed):
    """Generate one artificial dataset and store it as a dill pickle.

    The file is written to
    ``<file_path>/<simulation_scenario>_<simulation_param>_<seed>.pkl`` and
    contains the list
    ``[data_simulations, data_fa_info, data_ctm_info, params_sim]``.

    Args:
        file_path: output directory; created if it does not exist yet.
        simulation_scenario: scenario name understood by `sim_scenario`.
        simulation_param: scenario parameter (may be None).
        seed: seed forwarded to `simulations_sample_factm`.
    """
    params_sim = sim_scenario(simulation_scenario, simulation_param)

    data_simulations, data_fa_info, data_ctm_info = simulations_sample_factm(**params_sim, seed=seed)

    file_name = simulation_scenario + "_" + str(simulation_param) + '_' + str(seed) + '.pkl'

    # Make sure the target directory exists, so that a finished simulation is
    # not lost to a FileNotFoundError on open(). An empty path means the
    # current directory, which os.makedirs would reject.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    with open(os.path.join(file_path, file_name), 'wb') as file:
        dill.dump([data_simulations, data_fa_info, data_ctm_info, params_sim], file)

## Saving

In [13]:
# Output directory (relative path) handed to generate_datasets in the cells below.
file_path_scenario = os.path.join('artificial_datasets')

In [14]:
# Ten consecutive seeds (123, ..., 132); one dataset is generated per seed.
seed_seq = range(123, 123+10)

### Basic

In [15]:
# Baseline datasets: the scenario parameter is None, so the files are named
# 'basic_None_<seed>.pkl'.
for seed in seed_seq:
    generate_datasets(file_path_scenario, 'basic', None, seed)

In [8]:
# Re-declares the same seed range as the earlier cell.
seed_seq = range(123, 123+10)
# NOTE(review): the first par_seq value is overwritten on the next line, and
# every scenario cell below assigns its own par_seq before using it.
par_seq = [0.0, 0.5, 1.5, 1.0]
par_seq = [2.0]

### Main scenarios - 1-5

In [9]:
# Scenario 1 (dispatched to sim_scenario_scaling_weights): `par` becomes the
# last entry of 'muFA_scale'. One file per (seed, par) pair.
par_seq = [0.0, 0.5, 1.5, 2.0]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario1', par, seed)

In [27]:
# Scenario 2 (dispatched to sim_scenario_scaling_topics_param): `par`
# multiplies 'topics_params'. One file per (seed, par) pair.
par_seq = [5, 10]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario2', par, seed)

In [24]:
# Scenario 3 (dispatched to sim_scenario_scaling_D_topics): `par` rescales
# the size of the last view, i.e. the number of topics.
par_seq = [0.5, 1.5]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario3', par, seed)

In [18]:
# Scenario 4 (dispatched to sim_scenario_scaling_mu0): 'mu0' is set to
# (1 - par) times a fixed reference vector.
par_seq = [0.0, 0.25, 0.5, 0.75]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario4', par, seed)

In [23]:
# Scenario 5 (dispatched to sim_scenario_scaling_Sigma0): `par` multiplies
# the CTM covariance 'Sigma0'.
par_seq = [0.2, 0.6]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario5', par, seed)

### Additional scenario 6

In [48]:
# Sparsity baseline: requested with the scenario parameter set to None.
for seed in seed_seq:
    generate_datasets(file_path_scenario, 'basic_sparsity', None, seed)

# Scenario 6 (dispatched to sim_scenario_scaling_sparsity):
# 'sparsity_feature_fraction' is set to 0.1 + (0.7 - 0.1)*(1 - par).
par_seq = [0.0, 0.25, 0.5, 0.75]
for seed in seed_seq:
    for par in par_seq:
        generate_datasets(file_path_scenario, 'scenario6_sparsity', par, seed)