In [None]:
import os, sys, time, gc, random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU instead of failing at model creation.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Make the `source` folder next to the current working directory's parent importable,
# so the project modules imported below can be found.
sys.path.append(os.path.abspath(os.path.join(os.pardir, 'source')))
from HMixNetTorch import HMixNet, seed_everything
from metrics import compute_metrics

In [None]:
# Input data and output results live in sub-folders of the current working directory.
path_data, path_res = (os.path.abspath(folder) for folder in ('data', 'results'))
# Names of the evaluation metrics passed to compute_metrics and used as result keys.
metrics = ['RMSP', 'RMD', 'R2']
# Number of cross-validation folds; each fold is used once as the test fold.
n_folds = 10

In [None]:
# Datasets to evaluate; each one is read from '<name>-prep.csv' inside path_data.
data_names = ['epilepsy', 'cd4', 'bolus', 'owls', 'fruits']

# Hyperparameter grid: every combination of the three lists below is run.
learning_rates = [1e-2, 1e-3]
batch_sizes = [32, 16]
# One [w, w] entry per configuration, handed to HMixNet as `num_nodes`
# (presumably the hidden-layer widths -- confirm against HMixNet).
num_nodes_list = [[width, width] for width in (8, 4)]

# Settings shared by every run.
optimizer = 'Adam'
activation = torch.nn.LeakyReLU()
max_epochs, patience = 200, 10
# Starting values forwarded to the training routines as sig2u_init / sig2e_init
# (presumably initial variance components -- confirm against HMixNet).
sig2u_init = sig2e_init = 0.8

### Gaussian-NN(m)

In [None]:
# ---------------------------------
model_name = 'GNN-M'
# ---------------------------------
# Cross-validated grid search for the GNN-M configuration: HMixNet fitted with
# train_clik(is_conditional=False) and scored with predict(X_test) only.
# For every (learning rate, batch size, num_nodes) combination this writes one CSV per
# metric (plus 'time') with rows = test folds and columns = datasets.
is_conditional = False
# Create the output folder up front so the final to_csv cannot fail on a missing
# directory after all folds have been trained.
os.makedirs(path_res, exist_ok=True)
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        for num_nodes in num_nodes_list:
            experiment = f'lr{learning_rate}-batch{batch_size}-nodes{num_nodes[0]}'
            results = {metric: np.zeros((n_folds, len(data_names))) for metric in metrics+['time']}
            for data_number, data_name in enumerate(data_names):
                data = pd.read_csv(f'{path_data}/{data_name}-prep.csv', dtype='float32', engine='pyarrow')
                n_clusters = np.unique(data['id']).shape[0]
                # Feature columns are named x0, x1, ...; the largest index gives the count.
                n_features = 1+max([int(col_name[1:]) if 'x' in col_name else 0 for col_name in data.columns])
                feature_cols = [f'x{i}' for i in range(n_features)]
                for test_fold in tqdm(range(n_folds), desc=data_name):
                    # Validate on the fold that follows the test fold, wrapping around at the
                    # end, so validation and test folds are always distinct. (Floor division
                    # here made fold 0 serve as both when test_fold == 0.)
                    valid_fold = (test_fold+1) % n_folds
                    held_out = data['fold'].isin([valid_fold, test_fold])
                    data_dict = {
                        'train': data[~held_out],
                        'valid': data[data['fold'] == valid_fold],
                        'test' : data[data['fold'] == test_fold],
                    }
                    # Build (y, X, Z) for each subset. Z is a one-hot cluster indicator, so
                    # the ids must be valid column indices in [0, n_clusters).
                    arrays = {}
                    for subset_name, subset in data_dict.items():
                        z = np.array(subset['id'], dtype=np.int32)
                        Z = np.zeros((len(z), n_clusters), dtype=np.float32)
                        Z[np.arange(len(z)), z] = 1
                        arrays[subset_name] = (
                            np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                            np.array(subset[feature_cols], dtype=np.float32),
                            Z,
                        )
                    y_train, X_train, Z_train = arrays['train']
                    y_valid, X_valid, Z_valid = arrays['valid']
                    y_test, X_test, Z_test = arrays['test']
                    torch.cuda.empty_cache(); gc.collect()
                    seed_everything()
                    M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
                    start_time = time.time()
                    # The model is fitted on log(y + 0.5).
                    M.train_clik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid], is_conditional,
                            optimizer, learning_rate, batch_size, max_epochs, patience, sig2e_init)
                    computing_time = time.time() - start_time
                    y_pred = M.predict(X_test)
                    if data_name == 'owls':
                        # Scale by the per-row offset. The offset is reshaped to each array's
                        # own shape so the product stays element-wise; multiplying an (n, 1)
                        # column by a flat (n,) vector would broadcast to an (n, n) matrix.
                        offset_test = np.array(data_dict['test']['offset'])
                        offset_train = np.array(data_dict['train']['offset'])
                        y_pred = y_pred * offset_test.reshape(np.shape(y_pred))
                        y_test = y_test * offset_test.reshape(y_test.shape)
                        y_train = y_train * offset_train.reshape(y_train.shape)
                    temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
                    for metric in metrics:
                        results[metric][test_fold, data_number] = temp_results[metric]
                    results['time'][test_fold, data_number] = computing_time
                    del M; gc.collect()
            for metric in metrics+['time']:
                pd.DataFrame(results[metric], columns=data_names).to_csv(f'{path_res}/{model_name}-{metric}-{experiment}.csv', index=False)

### Gaussian-NN(c)

In [None]:
# ---------------------------------
model_name = 'GNN-C'
# ---------------------------------
# Cross-validated grid search for the GNN-C configuration: HMixNet fitted with
# train_clik(is_conditional=True) and scored with predict(X_test, Z_test).
# For every (learning rate, batch size, num_nodes) combination this writes one CSV per
# metric (plus 'time') with rows = test folds and columns = datasets.
is_conditional = True
# Create the output folder up front so the final to_csv cannot fail on a missing
# directory after all folds have been trained.
os.makedirs(path_res, exist_ok=True)
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        for num_nodes in num_nodes_list:
            experiment = f'lr{learning_rate}-batch{batch_size}-nodes{num_nodes[0]}'
            results = {metric: np.zeros((n_folds, len(data_names))) for metric in metrics+['time']}
            for data_number, data_name in enumerate(data_names):
                data = pd.read_csv(f'{path_data}/{data_name}-prep.csv', dtype='float32', engine='pyarrow')
                n_clusters = np.unique(data['id']).shape[0]
                # Feature columns are named x0, x1, ...; the largest index gives the count.
                n_features = 1+max([int(col_name[1:]) if 'x' in col_name else 0 for col_name in data.columns])
                feature_cols = [f'x{i}' for i in range(n_features)]
                for test_fold in tqdm(range(n_folds), desc=data_name):
                    # Validate on the fold that follows the test fold, wrapping around at the
                    # end, so validation and test folds are always distinct. (Floor division
                    # here made fold 0 serve as both when test_fold == 0.)
                    valid_fold = (test_fold+1) % n_folds
                    held_out = data['fold'].isin([valid_fold, test_fold])
                    data_dict = {
                        'train': data[~held_out],
                        'valid': data[data['fold'] == valid_fold],
                        'test' : data[data['fold'] == test_fold],
                    }
                    # Build (y, X, Z) for each subset. Z is a one-hot cluster indicator, so
                    # the ids must be valid column indices in [0, n_clusters).
                    arrays = {}
                    for subset_name, subset in data_dict.items():
                        z = np.array(subset['id'], dtype=np.int32)
                        Z = np.zeros((len(z), n_clusters), dtype=np.float32)
                        Z[np.arange(len(z)), z] = 1
                        arrays[subset_name] = (
                            np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                            np.array(subset[feature_cols], dtype=np.float32),
                            Z,
                        )
                    y_train, X_train, Z_train = arrays['train']
                    y_valid, X_valid, Z_valid = arrays['valid']
                    y_test, X_test, Z_test = arrays['test']
                    torch.cuda.empty_cache(); gc.collect()
                    seed_everything()
                    M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
                    start_time = time.time()
                    # The model is fitted on log(y + 0.5).
                    M.train_clik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid], is_conditional,
                            optimizer, learning_rate, batch_size, max_epochs, patience, sig2e_init)
                    computing_time = time.time() - start_time
                    y_pred = M.predict(X_test, Z_test)
                    if data_name == 'owls':
                        # Scale by the per-row offset. The offset is reshaped to each array's
                        # own shape so the product stays element-wise; multiplying an (n, 1)
                        # column by a flat (n,) vector would broadcast to an (n, n) matrix.
                        offset_test = np.array(data_dict['test']['offset'])
                        offset_train = np.array(data_dict['train']['offset'])
                        y_pred = y_pred * offset_test.reshape(np.shape(y_pred))
                        y_test = y_test * offset_test.reshape(y_test.shape)
                        y_train = y_train * offset_train.reshape(y_train.shape)
                    temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
                    for metric in metrics:
                        results[metric][test_fold, data_number] = temp_results[metric]
                    results['time'][test_fold, data_number] = computing_time
                    del M; gc.collect()
            for metric in metrics+['time']:
                pd.DataFrame(results[metric], columns=data_names).to_csv(f'{path_res}/{model_name}-{metric}-{experiment}.csv', index=False)

### H-LMMNN

In [None]:
# ---------------------------------
model_name = 'HLMMNN'
# ---------------------------------
# Cross-validated grid search for the HLMMNN configuration: HMixNet fitted with
# train_hlik (which additionally receives sig2u_init) and scored with
# predict(X_test, Z_test).
# For every (learning rate, batch size, num_nodes) combination this writes one CSV per
# metric (plus 'time') with rows = test folds and columns = datasets.
# Create the output folder up front so the final to_csv cannot fail on a missing
# directory after all folds have been trained.
os.makedirs(path_res, exist_ok=True)
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        for num_nodes in num_nodes_list:
            experiment = f'lr{learning_rate}-batch{batch_size}-nodes{num_nodes[0]}'
            results = {metric: np.zeros((n_folds, len(data_names))) for metric in metrics+['time']}
            for data_number, data_name in enumerate(data_names):
                data = pd.read_csv(f'{path_data}/{data_name}-prep.csv', dtype='float32', engine='pyarrow')
                n_clusters = np.unique(data['id']).shape[0]
                # Feature columns are named x0, x1, ...; the largest index gives the count.
                n_features = 1+max([int(col_name[1:]) if 'x' in col_name else 0 for col_name in data.columns])
                feature_cols = [f'x{i}' for i in range(n_features)]
                for test_fold in tqdm(range(n_folds), desc=data_name):
                    # Validate on the fold that follows the test fold, wrapping around at the
                    # end, so validation and test folds are always distinct. (Floor division
                    # here made fold 0 serve as both when test_fold == 0.)
                    valid_fold = (test_fold+1) % n_folds
                    held_out = data['fold'].isin([valid_fold, test_fold])
                    data_dict = {
                        'train': data[~held_out],
                        'valid': data[data['fold'] == valid_fold],
                        'test' : data[data['fold'] == test_fold],
                    }
                    # Build (y, X, Z) for each subset. Z is a one-hot cluster indicator, so
                    # the ids must be valid column indices in [0, n_clusters).
                    arrays = {}
                    for subset_name, subset in data_dict.items():
                        z = np.array(subset['id'], dtype=np.int32)
                        Z = np.zeros((len(z), n_clusters), dtype=np.float32)
                        Z[np.arange(len(z)), z] = 1
                        arrays[subset_name] = (
                            np.array(subset['y'], dtype=np.float32).reshape(-1,1),
                            np.array(subset[feature_cols], dtype=np.float32),
                            Z,
                        )
                    y_train, X_train, Z_train = arrays['train']
                    y_valid, X_valid, Z_valid = arrays['valid']
                    y_test, X_test, Z_test = arrays['test']
                    torch.cuda.empty_cache(); gc.collect()
                    seed_everything()
                    M = HMixNet(n_clusters, n_features, num_nodes, activation, device=device)
                    start_time = time.time()
                    # The model is fitted on log(y + 0.5).
                    M.train_hlik([np.log(y_train+0.5), X_train, Z_train], [np.log(y_valid+0.5), X_valid, Z_valid],
                            optimizer, learning_rate, batch_size, max_epochs, patience, sig2e_init, sig2u_init)
                    computing_time = time.time() - start_time
                    y_pred = M.predict(X_test, Z_test)
                    if data_name == 'owls':
                        # Scale by the per-row offset. The offset is reshaped to each array's
                        # own shape so the product stays element-wise; multiplying an (n, 1)
                        # column by a flat (n,) vector would broadcast to an (n, n) matrix.
                        offset_test = np.array(data_dict['test']['offset'])
                        offset_train = np.array(data_dict['train']['offset'])
                        y_pred = y_pred * offset_test.reshape(np.shape(y_pred))
                        y_test = y_test * offset_test.reshape(y_test.shape)
                        y_train = y_train * offset_train.reshape(y_train.shape)
                    temp_results = compute_metrics(y_test, y_pred, np.mean(y_train), metrics)
                    for metric in metrics:
                        results[metric][test_fold, data_number] = temp_results[metric]
                    results['time'][test_fold, data_number] = computing_time
                    del M; gc.collect()
            for metric in metrics+['time']:
                pd.DataFrame(results[metric], columns=data_names).to_csv(f'{path_res}/{model_name}-{metric}-{experiment}.csv', index=False)