In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import rbf_kernel
from statsmodels.tsa.arima_process import ArmaProcess

In [None]:
def mean_model(X):
    """Evaluate the fixed (marginal) mean for every row of ``X``.

    With ``s`` the sum of a row, the value is
    ``s * cos(s) + 2 * X[:, 0] * X[:, 1]``.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, j]``; at least two columns are read.

    Returns
    -------
    1-D array with one mean value per row of ``X``.
    """
    row_total = np.sum(X, axis=1)
    # Nonlinear term driven by the sum over all covariates.
    wave_term = row_total * np.cos(row_total)
    # Interaction between the first two covariates only.
    pair_term = 2*X[:, 0]*X[:, 1]
    return wave_term + pair_term

In [None]:
def generate_data(
    dir_name,
    N, p, qL, gam, sig2,
    n_simul
):
    """Simulate ``n_simul`` datasets with a spatial random effect and save them as CSV.

    Each replicate ``r`` (seeded with ``np.random.seed(r)``) draws:
      * ``qL`` 2-D locations uniformly on [-10, 10]^2,
      * a random-effect vector ``vL`` ~ N(0, DL) with
        ``DL_ij = gam[0] * exp(-gam[1] * |loc_i - loc_j|^2)``,
      * a location index ``zL`` for each of the ``N`` observations,
      * Gaussian noise ``epsilon`` with variance ``sig2``,
      * covariates ``X`` uniformly on [-1, 1]^p,
    and sets ``y = mean_model(X) + epsilon + vL[zL]``.

    Parameters
    ----------
    dir_name : str
        Prefix prepended verbatim to every file name (normally a directory
        path ending with a separator).
    N : int
        Number of observations per dataset.
    p : int
        Number of covariate columns (``mean_model`` reads columns 0 and 1).
    qL : int
        Number of locations / random-effect levels.
    gam : sequence of two floats
        ``gam[0]`` scales the RBF kernel, ``gam[1]`` is its ``gamma``.
    sig2 : float
        Variance of the noise term.
    n_simul : int
        Number of replicated datasets.

    Files written
    -------------
    ``settings.csv`` once, then per replicate ``data-<r>.csv``
    (columns ``x0..x{p-1}``, ``zL``, ``epsilon``, ``y``), ``loc-<r>.csv``
    and ``vL-<r>.csv``.
    """
    # to_csv does not create missing folders, so make sure the target exists.
    out_dir = os.path.dirname(dir_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    settings = np.column_stack([N, p, qL, gam[0], gam[1], sig2, n_simul])
    colnames = ['N', 'p', 'qL', 'gam0', 'gam1', 'sig2', 'n_simul']
    settings = pd.DataFrame(settings, columns=colnames)
    settings.to_csv(dir_name + 'settings.csv', index=False)

    for repeat in range(n_simul):

        # One seed per replicate; the order of the draws below is kept fixed
        # so that a given seed always yields the same dataset.
        np.random.seed(repeat)

        loc = np.random.uniform(-10, 10, (qL, 2))
        DL = gam[0] * rbf_kernel(loc, gamma = gam[1]) # d_ij = gam0 * exp(- gam1 |x_i - x_j|^2)
        vL = np.random.multivariate_normal(np.zeros(qL), DL, 1)[0]
        zL = np.random.choice(range(qL), size=N, replace=True)

        epsilon = np.random.normal(0, np.sqrt(sig2), N)

        X = np.random.uniform(-1, 1, (N,p))

        mu_mar = mean_model(X)

        # Look the random effect up by location index. A one-hot matrix from
        # pd.get_dummies(zL) only has columns for levels that were actually
        # drawn, so multiplying it by the length-qL vector vL breaks whenever
        # a level is absent, and the dense N x qL matrix is needlessly large.
        y = mu_mar + epsilon + vL[zL]

        data = pd.DataFrame(X, columns=[('x'+str(i)) for i in range(p)])
        data['zL'] = zL
        data['epsilon'] = epsilon
        data['y'] = y

        data.to_csv(dir_name+'data-'+str(repeat)+'.csv', index=False)

        pd.DataFrame(loc, columns=['loc1', 'loc2']).to_csv(dir_name+'loc-'+str(repeat)+'.csv', index=False)
        pd.DataFrame(vL).to_csv(dir_name+'vL-'+str(repeat)+'.csv', index=False)

In [None]:
# Output location: a "data" folder under the current working directory.
dir_name = f"{os.getcwd()}/data/"

# Simulation size.
n_simul = 100   # number of replicated datasets
N = 100000      # observations per dataset
p = 10          # number of covariates
qL = 5000       # number of spatial locations

# Variance parameters.
# Spatial RBF kernel: d_ij = gam[0] * exp(-gam[1] * |x_i - x_j|^2).
# Original author's note: gam1 = 1/2sig2 (presumably the kernel's own
# bandwidth rather than the noise variance below; confirm).
gam = [0.5, 0.5]
sig2 = 1.0      # variance of the noise term

In [None]:
# Run the simulation; the generator defined in this notebook is `generate_data`.
generate_data(dir_name, N, p, qL, gam, sig2, n_simul)