In [None]:
import os, sys, time, gc, random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without a usable GPU instead of failing later when the
# model is moved to a non-existent device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Make the project's `source` directory (resolved as `../source` relative to the
# current working directory) importable, then load the model class, the seeding
# helper and the metric function from it.
sys.path.append(os.path.abspath(os.path.join('..', 'source')))
from HMixNetTorch import HMixNet, seed_everything
from metrics import compute_metrics

In [None]:
# Input and output folders, resolved against the current working directory.
path_data, path_res = (os.path.abspath(folder) for folder in ('data', 'results'))

# Simulation design: number of repeats per setting, number of clusters, and
# rows per cluster.
n_repeats = 10
n_clusters = 10000
cluster_size = 12

# How the rows of each cluster are divided between the three splits.
cluster_size_train = 8
cluster_size_valid = cluster_size_test = 2

In [None]:
# Labels of the input (X) settings; they appear verbatim in the data file names.
input_distributions = ['IND', 'AR1', 'AR2', 'SYM', 'MIX']

# Random-effects settings, written as '<distribution>-<variance>'.
# 'X-0.0' comes first, followed by every distribution/variance combination.
random_effects_types = ['X-0.0'] + [
    f'{distribution_code}-{variance:.1f}'
    for distribution_code in ('G', 'N', 'M')  # gamma, log-normal, mixture
    for variance in (1.0, 2.0)
]

# Names of the evaluation metrics requested from `compute_metrics`.
metrics = ['RMSP', 'RMD', 'R2']

In [None]:
# Model inputs / architecture (all three are passed to the HMixNet constructor).
n_features = 50              # also the number of x0..x49 columns read from each CSV
num_nodes = [50, 25, 12]     # presumably the hidden-layer widths - confirm in HMixNetTorch
activation = torch.nn.LeakyReLU()

# Optimisation settings forwarded to the training routines.
optimizer, learning_rate = 'Adam', 0.01
batch_size = 1000
max_epochs, patience = 100, 5

# Starting values forwarded to training: sig2e_init is used by every model,
# sig2u_init only by the h-likelihood fit.
sig2u_init = sig2e_init = 0.8

### Gaussian-NN(m)

In [None]:
# ---------------------------------
model_name = 'GNN-M'
# ---------------------------------
# Fits HMixNet with `train_clik(..., is_conditional=False)` on every simulated
# data set and scores `predict(X_test)` (no cluster design matrix is supplied at
# prediction time). One CSV per metric is written to `path_res` at the end.
is_conditional = False

# Row labels: one per (input distribution, repeat). Columns of every result
# table are the random-effects settings.
experiments = [f'{input_distribution}-{simulation_number}' for input_distribution in input_distributions for simulation_number in range(n_repeats)]
results = {metric: np.zeros((len(experiments), len(random_effects_types))) for metric in metrics+['time']}

# Values of the `number` column that select each split.
split_numbers = {
    'train': range(cluster_size_train),
    'valid': range(cluster_size_train, cluster_size_train+cluster_size_valid),
    'test' : range(cluster_size_train+cluster_size_valid, cluster_size),
}
feature_columns = [f'x{i}' for i in range(n_features)]

# Create the output folder up front: otherwise a missing directory is only
# discovered by `to_csv` after the whole sweep has been trained.
os.makedirs(path_res, exist_ok=True)

for k_re, random_effects_type in enumerate(random_effects_types):
    for k_in, input_distribution in enumerate(input_distributions):
        for simulation_number in tqdm(range(n_repeats), desc=f'X: {input_distribution}, u: {random_effects_type}'):
            row = k_in*n_repeats + simulation_number  # same ordering as `experiments`
            frame = pd.read_csv(f'{path_data}/data-{input_distribution}-{random_effects_type}-{simulation_number}.csv')

            # Build (y, X, Z) per split; Z is the one-hot cluster membership matrix.
            arrays = {}
            for subset_name, numbers in split_numbers.items():
                subset = frame[frame['number'].isin(numbers)]
                # Cluster ids are used as column indices, so they must lie in [0, n_clusters).
                z = np.array(subset['cluster'], dtype=np.int32)
                one_hot = np.zeros((len(z), n_clusters), dtype=np.float32)
                one_hot[np.arange(len(z)), z] = 1
                arrays[subset_name] = (
                    np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                    np.array(subset[feature_columns], dtype=np.float32),
                    one_hot,
                )
            y_train, X_train, Z_train = arrays['train']
            y_valid, X_valid, Z_valid = arrays['valid']
            y_test,  X_test,  Z_test  = arrays['test']
            # Drop the pandas objects and temporaries so the collection below can
            # actually release them before training starts.
            del frame, subset, one_hot, arrays
            torch.cuda.empty_cache(); gc.collect()

            seed_everything()
            M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
            # Only the training call is timed.
            start_time = time.time()
            M.train_clik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid], is_conditional,
                    optimizer, learning_rate, batch_size, max_epochs, patience, sig2e_init)
            computing_time = time.time() - start_time

            # NOTE(review): training uses log(y+0.5) while the metrics compare y_pred with
            # the untransformed y_test - presumably predict() returns the original scale; confirm.
            y_pred = M.predict(X_test)
            temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
            for metric in metrics:
                results[metric][row, k_re] = temp_results[metric]
            results['time'][row, k_re] = computing_time
            del M; gc.collect()

for metric in metrics+['time']:
    pd.DataFrame(results[metric], columns=random_effects_types, index=experiments).to_csv(f'{path_res}/{model_name}-{metric}.csv', index=True)

### Gaussian-NN(c)

In [None]:
# ---------------------------------
model_name = 'GNN-C'
# ---------------------------------
# Fits HMixNet with `train_clik(..., is_conditional=True)` on every simulated
# data set and scores `predict(X_test, Z_test)`, i.e. predictions also receive
# the cluster design matrix. One CSV per metric is written to `path_res` at the end.
is_conditional = True

# Row labels: one per (input distribution, repeat). Columns of every result
# table are the random-effects settings.
experiments = [f'{input_distribution}-{simulation_number}' for input_distribution in input_distributions for simulation_number in range(n_repeats)]
results = {metric: np.zeros((len(experiments), len(random_effects_types))) for metric in metrics+['time']}

# Values of the `number` column that select each split.
split_numbers = {
    'train': range(cluster_size_train),
    'valid': range(cluster_size_train, cluster_size_train+cluster_size_valid),
    'test' : range(cluster_size_train+cluster_size_valid, cluster_size),
}
feature_columns = [f'x{i}' for i in range(n_features)]

# Create the output folder up front: otherwise a missing directory is only
# discovered by `to_csv` after the whole sweep has been trained.
os.makedirs(path_res, exist_ok=True)

for k_re, random_effects_type in enumerate(random_effects_types):
    for k_in, input_distribution in enumerate(input_distributions):
        for simulation_number in tqdm(range(n_repeats), desc=f'X: {input_distribution}, u: {random_effects_type}'):
            row = k_in*n_repeats + simulation_number  # same ordering as `experiments`
            frame = pd.read_csv(f'{path_data}/data-{input_distribution}-{random_effects_type}-{simulation_number}.csv')

            # Build (y, X, Z) per split; Z is the one-hot cluster membership matrix.
            arrays = {}
            for subset_name, numbers in split_numbers.items():
                subset = frame[frame['number'].isin(numbers)]
                # Cluster ids are used as column indices, so they must lie in [0, n_clusters).
                z = np.array(subset['cluster'], dtype=np.int32)
                one_hot = np.zeros((len(z), n_clusters), dtype=np.float32)
                one_hot[np.arange(len(z)), z] = 1
                arrays[subset_name] = (
                    np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                    np.array(subset[feature_columns], dtype=np.float32),
                    one_hot,
                )
            y_train, X_train, Z_train = arrays['train']
            y_valid, X_valid, Z_valid = arrays['valid']
            y_test,  X_test,  Z_test  = arrays['test']
            # Drop the pandas objects and temporaries so the collection below can
            # actually release them before training starts.
            del frame, subset, one_hot, arrays
            torch.cuda.empty_cache(); gc.collect()

            seed_everything()
            M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
            # Only the training call is timed.
            start_time = time.time()
            M.train_clik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid], is_conditional,
                    optimizer, learning_rate, batch_size, max_epochs, patience, sig2e_init)
            computing_time = time.time() - start_time

            # NOTE(review): training uses log(y+0.5) while the metrics compare y_pred with
            # the untransformed y_test - presumably predict() returns the original scale; confirm.
            y_pred = M.predict(X_test, Z_test)
            temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
            for metric in metrics:
                results[metric][row, k_re] = temp_results[metric]
            results['time'][row, k_re] = computing_time
            del M; gc.collect()

for metric in metrics+['time']:
    pd.DataFrame(results[metric], columns=random_effects_types, index=experiments).to_csv(f'{path_res}/{model_name}-{metric}.csv', index=True)

### H-LMMNN

In [None]:
# ---------------------------------
model_name = 'HLMMNN'
# ---------------------------------
# Fits HMixNet with `train_hlik` (which takes both sig2e_init and sig2u_init) on
# every simulated data set and scores `predict(X_test, Z_test)`.
# One CSV per metric is written to `path_res` at the end.

# Row labels: one per (input distribution, repeat). Columns of every result
# table are the random-effects settings.
experiments = [f'{input_distribution}-{simulation_number}' for input_distribution in input_distributions for simulation_number in range(n_repeats)]
results = {metric: np.zeros((len(experiments), len(random_effects_types))) for metric in metrics+['time']}

# Values of the `number` column that select each split.
split_numbers = {
    'train': range(cluster_size_train),
    'valid': range(cluster_size_train, cluster_size_train+cluster_size_valid),
    'test' : range(cluster_size_train+cluster_size_valid, cluster_size),
}
feature_columns = [f'x{i}' for i in range(n_features)]

# Create the output folder up front: otherwise a missing directory is only
# discovered by `to_csv` after the whole sweep has been trained.
os.makedirs(path_res, exist_ok=True)

for k_re, random_effects_type in enumerate(random_effects_types):
    for k_in, input_distribution in enumerate(input_distributions):
        for simulation_number in tqdm(range(n_repeats), desc=f'X: {input_distribution}, u: {random_effects_type}'):
            row = k_in*n_repeats + simulation_number  # same ordering as `experiments`
            frame = pd.read_csv(f'{path_data}/data-{input_distribution}-{random_effects_type}-{simulation_number}.csv')

            # Build (y, X, Z) per split; Z is the one-hot cluster membership matrix.
            arrays = {}
            for subset_name, numbers in split_numbers.items():
                subset = frame[frame['number'].isin(numbers)]
                # Cluster ids are used as column indices, so they must lie in [0, n_clusters).
                z = np.array(subset['cluster'], dtype=np.int32)
                one_hot = np.zeros((len(z), n_clusters), dtype=np.float32)
                one_hot[np.arange(len(z)), z] = 1
                arrays[subset_name] = (
                    np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                    np.array(subset[feature_columns], dtype=np.float32),
                    one_hot,
                )
            y_train, X_train, Z_train = arrays['train']
            y_valid, X_valid, Z_valid = arrays['valid']
            y_test,  X_test,  Z_test  = arrays['test']
            # Drop the pandas objects and temporaries so the collection below can
            # actually release them before training starts.
            del frame, subset, one_hot, arrays
            torch.cuda.empty_cache(); gc.collect()

            seed_everything()
            M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
            # Only the training call is timed.
            start_time = time.time()
            M.train_hlik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid],
                    optimizer, learning_rate, batch_size, max_epochs, patience,
                    sig2e_init, sig2u_init)
            computing_time = time.time() - start_time

            # NOTE(review): training uses log(y+0.5) while the metrics compare y_pred with
            # the untransformed y_test - presumably predict() returns the original scale; confirm.
            y_pred = M.predict(X_test, Z_test)
            temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
            for metric in metrics:
                results[metric][row, k_re] = temp_results[metric]
            results['time'][row, k_re] = computing_time
            del M; gc.collect()

for metric in metrics+['time']:
    pd.DataFrame(results[metric], columns=random_effects_types, index=experiments).to_csv(f'{path_res}/{model_name}-{metric}.csv', index=True)