In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_process import ArmaProcess

In [2]:
def mean_model(X, v, b0=0, c0=0, c1=0):
    """Return the conditional mean exp(linear predictor) for every row of X.

    The linear predictor is
        c0*cos(x0) + c0*cos(x1) + c0*cos(x2) + c1/(x3^2+1) + c1/(x4^2+1) + b0 + v,
    so X must have at least five columns; only columns 0-4 are read.

    Parameters
    ----------
    X : 2-D array, one row per observation.
    v : random effect on the log scale, added element-wise to the predictor
        (the caller passes the per-subject effect already repeated to one
        value per row of X).
    b0 : intercept.
    c0 : coefficient shared by the three cosine terms.
    c1 : coefficient shared by the two 1/(x^2+1) terms.

    Returns
    -------
    Array with one mean per row of X.
    """
    # Accumulate the terms one at a time, left to right.
    log_mean = c0 * np.cos(X[:, 0])
    for col in (1, 2):
        log_mean = log_mean + c0 * np.cos(X[:, col])
    for col in (3, 4):
        log_mean = log_mean + c1 / (X[:, col]**2 + 1)
    log_mean = log_mean + b0
    log_mean = log_mean + v
    return np.exp(log_mean)

In [3]:
def make_params_lognormal(mean, variance):
    """Convert log-normal moments into the parameters of the underlying normal.

    Parameters
    ----------
    mean : mean of the log-normal variable.
    variance : variance of the log-normal variable.

    Returns
    -------
    (mu, sig) : mean and standard deviation (not variance) of the logarithm,
        i.e. of the normal distribution whose exponential has the requested
        mean and variance.
    """
    # log(1 + variance/mean^2) is the variance of the underlying normal.
    log_var_term = np.log(variance/mean**2+1)
    normal_mean = np.log(mean) - 0.5 * log_var_term
    normal_sd = np.sqrt(log_var_term)
    return (normal_mean, normal_sd)

In [4]:
def _validate_random_effect_spec(rand_dist, lam):
    """Raise ValueError for an unknown rand_dist or a lam its sampler cannot use."""
    if rand_dist == 'constant':
        return  # lam is not used when there is no random effect
    if rand_dist == 'gamma':
        if lam <= 0:
            raise ValueError("gamma random effects need lam > 0, got {!r}".format(lam))
    elif rand_dist == 'lognorm':
        if lam < 0:
            raise ValueError("lognorm random effects need lam >= 0, got {!r}".format(lam))
    elif rand_dist == 'mixture':
        # The component variances are proportional to (4*lam - 1).
        if lam < 0.25:
            raise ValueError("mixture random effects need lam >= 0.25, got {!r}".format(lam))
    else:
        raise ValueError(
            "rand_dist is wrong !!! got {!r}; expected one of "
            "'constant', 'gamma', 'lognorm', 'mixture'".format(rand_dist))


def _draw_random_effects(rand_dist, lam, n_sub):
    """Draw one random effect per subject; return (u, v) with v = log(u)."""
    if rand_dist == 'constant':
        # No heterogeneity: u = 1 and v = 0 for every subject.
        return np.repeat(1, n_sub), np.repeat(0, n_sub)
    if rand_dist == 'gamma':
        # shape 1/lam and scale lam give E(u) = 1 and Var(u) = lam
        u = np.random.gamma(1/lam, lam, n_sub)
        return u, np.log(u)
    if rand_dist == 'lognorm':
        # v ~ Normal(0, sd = sqrt(lam)), u = exp(v)
        v = np.random.normal(0, np.sqrt(lam), n_sub)
        return np.exp(v), v
    if rand_dist == 'mixture':
        # Equal-weight mixture of two log-normals with means 0.5 and 1.5,
        # chosen to have E(u)=1 and Var(u)=lam.
        params1 = make_params_lognormal(0.5, (4*lam-1)/20)
        params2 = make_params_lognormal(1.5, 9*(4*lam-1)/20)
        # Draw exactly n_sub values so that np.repeat(v, n_num) lines up with
        # the N = n_sub * n_num rows of X. The draw order (indicator, v1, v2)
        # is kept so seeded results are reproducible.
        indicator = np.random.binomial(1, 0.5, n_sub)
        v1 = np.random.normal(params1[0], params1[1], n_sub)
        v2 = np.random.normal(params2[0], params2[1], n_sub)
        v = v1*indicator + v2*(1-indicator)
        return np.exp(v), v
    raise ValueError("rand_dist is wrong !!! got {!r}".format(rand_dist))


def generate_data(data_type, dir_name, b0=0.2, c0=0.2, c1=0.2, p=5, ar=[1,-0.5], ma=1, n_simul=100):
    """Simulate clustered Poisson count data and write it to CSV files.

    Parameters
    ----------
    data_type : str
        '<n_sub>-<n_num>-<lam>-<rand_dist>': number of subjects, number of
        observations per subject, the parameter lam of the random-effect
        distribution, and the distribution name ('constant', 'gamma',
        'lognorm' or 'mixture').
    dir_name : str
        Prefix prepended to every output file name (normally a directory
        path ending in a separator). Its directory part is created if missing.
    b0, c0, c1 : float
        Coefficients passed on to mean_model.
    p : int
        Number of covariate columns; mean_model reads columns 0-4.
    ar, ma :
        Lag polynomials handed to statsmodels' ArmaProcess.
    n_simul : int
        Number of replicate data sets; replicate r is seeded with
        np.random.seed(r).

    Side effects
    ------------
    Writes '<dir_name>simul-settings-<data_type>.csv' once and
    '<dir_name>simul-data-<data_type>-<r>.csv' for each replicate r, with
    columns x0..x{p-1}, y, u, mu, sub, num.

    Raises
    ------
    ValueError
        If data_type does not have four '-'-separated fields, rand_dist is
        unknown, or lam is outside the range the chosen distribution supports.
        Raised before any file is written.
    """
    n_sub, n_num, lam, rand_dist = data_type.split('-')
    n_sub, n_num, lam = int(n_sub), int(n_num), float(lam)
    # Fail fast instead of printing a message and crashing later on an
    # undefined u / v.
    _validate_random_effect_spec(rand_dist, lam)

    N = n_sub * n_num
    n_new = n_sub
    # With the defaults ar=[1, -0.5], ma=1 this is an AR(1) process with
    # coefficient 0.5 (statsmodels lag-polynomial sign convention).
    arma = ArmaProcess(ar, ma)

    # Make sure the target directory exists; to_csv cannot create it.
    settings_path = dir_name + 'simul-settings-' + data_type + '.csv'
    out_dir = os.path.dirname(settings_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    settings = np.column_stack([n_simul, n_sub, n_num, n_new, p, b0, lam])
    colnames = ['n_simul','n_sub', 'n_num', 'n_new','p', 'b0', 'lam']
    settings = pd.DataFrame(settings, columns=colnames)
    settings.to_csv(settings_path, index=False)

    for repeat in range(n_simul):

        # Generate data: the seed is the replicate index, so every replicate
        # is reproducible on its own.
        np.random.seed(repeat)
        # axis=1 makes axis 1 the time axis: each row of X is one ARMA series
        # of length p, i.e. the p covariates of an observation are correlated.
        X = arma.generate_sample(nsample=(N,p), axis=1)
        u, v = _draw_random_effects(rand_dist, lam, n_sub)

        # Expand the per-subject effects to one value per observation.
        u_rep = np.repeat(u, n_num)
        v_rep = np.repeat(v, n_num)

        mu = mean_model(X, v_rep, b0, c0, c1)
        y = np.random.poisson(mu)

        data = pd.DataFrame(X, columns=[('x'+str(i)) for i in range(p)])
        data['y'] = y
        data['u'] = u_rep
        data['mu'] = mu
        data['sub'] = np.repeat(np.arange(n_sub), n_num)
        data['num'] = np.tile(np.arange(n_num), n_sub)

        # save data
        file_name = dir_name + 'simul-data-' + data_type + '-' + str(repeat)
        data.to_csv(file_name+'.csv', index=False)

In [5]:
# Output folder for all simulated data sets, under the current working directory.
dir_name = os.getcwd() + '/simulation_data/'
# generate_data writes its CSV files straight into this folder and to_csv
# cannot write into a directory that does not exist, so create it up front.
os.makedirs(dir_name, exist_ok=True)

# Each entry is '<n_sub>-<n_num>-<lam>-<rand_dist>', the format generate_data parses.
data_type_list = ['1000-10-0-constant',
                  '1000-10-0.5-gamma',   '1000-10-1-gamma', 
                  '1000-10-0.5-lognorm', '1000-10-1-lognorm', 
                  '1000-10-0.5-mixture', '1000-10-1-mixture']
for data_type in data_type_list:
    generate_data(data_type, dir_name, p = 5, ar = [1, -0.5], ma = 1, n_simul = 100)