In [1]:
import pandas as pd
import torch.nn as nn
 
# Root folder from which the results directory (args.save_dir) is derived.
parent_dir = './'

# Both CSV files are read without a header row, so pandas labels columns 0..n-1.
XRD_descriptor = pd.read_csv(
    './data/XRD_dense_descriptor_new_pi_4.csv', header=None, low_memory=False)
physical_para = pd.read_csv(
    './data/data_no_header.csv', header=None, low_memory=False)

# Column 11 of the physical-parameter table is appended as the last column of
# the descriptor matrix.
# NOTE(review): presumably this is the regression target - confirm which
# physical quantity column 11 holds.
target_column = physical_para.iloc[:, 11]
data = pd.concat([XRD_descriptor, target_column], axis=1)

# Seeds for the ensemble members; run_training picks seeds[model_idx].
seeds = [
    199228, 302675, 257057, 320858,
    844620, 298933, 681403, 690678,
]
import argparse
# Experiment configuration. The parser is evaluated with an empty argv
# (parse_args([])), so every option takes its default and the notebook-specific
# values are then assigned directly on `args` at the end of this block.
parser = argparse.ArgumentParser()

parser.add_argument('--num_folds', type=int, default=1,
                help='Number of folds when performing cross validation')
parser.add_argument('--ensemble_size', type=int, default=3,
                        help='Number of models in ensemble')
parser.add_argument('--show_individual_scores', action='store_true', default=False,
                help='Show all scores for individual targets, not just average, at the end')
parser.add_argument('--task_inds', type=int, default=[], nargs='+',
                help='Indices of tasks you want to train on.')
# Active Learning Arguments
parser.add_argument('--al_init_ratio', type=float, default=0.1,
                    help='Percent of training data to use on first active learning iteration')
parser.add_argument('--al_end_ratio', type=float, default=None,
                    help='Fraction of total data To stop active learning early. By default, explore full train data')
parser.add_argument('--num_al_loops', type=int, default=8,
                    help='Number of active learning loops to add new data')
parser.add_argument('--al_topk', type=int, default=25,
                    help='Top-K acquired molecules to consider during active learning')
parser.add_argument('--al_std_mult', type=float, default=1,
                    help='Multiplier for std in lcb acquisition')
parser.add_argument('--al_step_scale', type=str, default="linear",
                    help='scale of spacing for active learning steps (log, linear)')
parser.add_argument('--acquire_min', action='store_true',
                    help='if we should acquire min or max score molecules')
# The choices list covers every strategy name the active-learning loop
# handles, including the explor_* variants that are assigned below; previously
# those names were missing here and could not be passed on a command line.
parser.add_argument('--al_strategy', type=str, nargs='+',
                    choices=["random",
                            "explorative_greedy", "explorative_sample",
                            "explor_ale", "explor_total", "explor_total_2",
                            "score_greedy", "score_sample",
                            "exploit", "exploit_ucb", "exploit_lcb", "exploit_ts"],
                    default=["explorative_greedy"],
                    help='Strategy for active learning regime')
parser.add_argument('--use_std', action='store_true', default=False,
                    help='Use std for evidence during active learning')
args = parser.parse_args([])

# Notebook-level overrides (these are not command-line options).
args.save_dir =  parent_dir + 'results/m13/'   # a timestamped sub-folder is appended later
args.epochs = 1000                             # training epochs per ensemble member
args.al_strategy = ['explor_total','explor_total_2','score_greedy','random','exploit_ucb','exploit_lcb','exploit_ts']
args.LR = 0.001                                # Adam learning rate
args.mb_size = 20                              # mini-batch size
# Weight of the heteroscedastic (log-variance) term that is added to the MSE loss.
heteroscedastic_loss_coefficient = 1e-3
# grid_point[0] * 64 is the position at which UA_CNN.forward splits its input.
grid_point = [12,52]
import torch
# Keep using the second GPU when the machine has one. On a single-GPU machine
# "cuda:1" is not a valid device, so fall back to "cuda:0", and to the CPU when
# CUDA is unavailable.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [2]:
def heteroscedastic_loss(true, mean, log_var):
    """
    Heteroscedastic regression loss, averaged over all elements.

    Each element contributes ``exp(-log_var) * (true - mean)**2 + log_var``:
    the squared residual is scaled by the predicted inverse variance and the
    log-variance itself is added as a penalty.

    :param true: Tensor of target values.
    :param mean: Tensor of predicted means.
    :param log_var: Tensor of predicted log-variances.
    :return: Scalar tensor holding the mean loss.
    """
    squared_residual = torch.square(true - mean)
    inverse_variance = torch.exp(torch.neg(log_var))
    per_element = inverse_variance * squared_residual + log_var
    return torch.mean(per_element)


In [3]:
from datetime import datetime
import os
from utils import makedirs
        
# Give this run its own results folder: <save_dir>/<timestamp>_<dataset>.
# Note that args.save_dir is overwritten with the new path, so running this
# cell a second time nests another timestamped folder inside the first.
if args.save_dir is not None:
    dataset = 'XRD'
    timestamp = datetime.now().strftime("%y%m%d-%H%M%S%f")
    log_path = f"{timestamp}_{dataset}"
    args.save_dir = os.path.join(args.save_dir, log_path)
    if os.path.exists(args.save_dir):
        # Folder name already taken: append the first free numeric suffix.
        num_ctr = 0
        while os.path.exists("{}_{}".format(args.save_dir, num_ctr)):
            num_ctr += 1
        args.save_dir = "{}_{}".format(args.save_dir, num_ctr)
    makedirs(args.save_dir)

In [None]:
from sklearn.model_selection import train_test_split
import torch
# The first 4096 columns of `data` are the model inputs; column 4096 is the target.
descriptor_columns = data.iloc[:, :4096]
target_values = data.iloc[:, 4096]

# Fixed 70/30 split (random_state pinned so the split is the same on every run).
x_train, x_test, y_train, y_test = train_test_split(
    descriptor_columns, target_values, test_size=0.3, random_state=500)

print("Train data:", len(x_train))
print("Test data:", len(x_test))


In [5]:
class UA_CNN(nn.Module):
    """1-D CNN with two heads: a prediction and a second output used as log-variance.

    The input signal is cut into two parts along its last axis. Each part runs
    through its own stack of four Conv1d(kernel 3, padding 1) -> ReLU ->
    MaxPool1d(2, 2) stages with channel widths 1 -> 2 -> 4 -> 2 -> 1, so every
    stage halves the length and the stack shortens a part by a factor of 16.
    The two pooled parts are concatenated and passed to two independent
    fully connected heads (256 -> 64 -> 32 -> 8 -> 1).

    Because the first linear layers expect 256 features, the two pooled parts
    must add up to length 256 (e.g. a length-4096 input whose split point is a
    multiple of 16).
    """

    @staticmethod
    def _conv_partition():
        """Build one convolutional stack (4 x Conv1d/ReLU/MaxPool1d) as a ModuleList."""
        channel_plan = [(1, 2), (2, 4), (4, 2), (2, 1)]
        stages = []
        for n_in, n_out in channel_plan:
            stages.append(nn.Conv1d(in_channels=n_in, out_channels=n_out,
                                    kernel_size=3, padding=1))  # keeps the length
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool1d(2, 2))                    # halves the length
        return nn.ModuleList(stages)

    def __init__(self):
        super().__init__()

        # One convolutional stack per input partition (weights are not shared).
        self.partition_1 = self._conv_partition()
        self.partition_2 = self._conv_partition()

        # Head 1: point prediction.
        self.fc1 = nn.Linear(256, 64)
        self.relu5 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu6 = nn.ReLU()
        self.fc3 = nn.Linear(32, 8)
        self.relu7 = nn.ReLU()
        self.fc4 = nn.Linear(8, 1)

        # Head 2: same layout; its output is consumed as a log-variance by the
        # training code in this notebook.
        self.fc1_var = nn.Linear(256, 64)
        self.relu5_var = nn.ReLU()
        self.fc2_var = nn.Linear(64, 32)
        self.relu6_var = nn.ReLU()
        self.fc3_var = nn.Linear(32, 8)
        self.relu7_var = nn.ReLU()
        self.fc4_var = nn.Linear(8, 1)

    def forward(self, x, gp):
        """Run both partitions and both heads.

        :param x: Input of shape (batch, 1, length).
        :param gp: Sequence whose first entry times 64 is the split position
            along the last axis; only ``gp[0]`` is read.
        :return: ``(out, x, out_var)`` - the prediction head output, the
            concatenated pooled features, and the second head's output. Both
            head outputs keep the channel axis, i.e. shape (batch, 1, 1).
        """
        split_at = gp[0] * 64
        left = x[:, :, :split_at]
        right = x[:, :, split_at:]

        for layer in self.partition_1:
            left = layer(left)
        for layer in self.partition_2:
            right = layer(right)

        # Join the two pooled parts along the length axis.
        x = torch.cat([left, right], dim=2)

        hidden = self.relu5(self.fc1(x))
        hidden = self.relu6(self.fc2(hidden))
        hidden = self.relu7(self.fc3(hidden))
        out = self.fc4(hidden)

        hidden_var = self.relu5_var(self.fc1_var(x))
        hidden_var = self.relu6_var(self.fc2_var(hidden_var))
        hidden_var = self.relu7_var(self.fc3_var(hidden_var))
        out_var = self.fc4_var(hidden_var)
        return out, x, out_var

In [6]:
def random_mini_batches(X_train, Y_train, mini_batch_size = 10, shuffle = False):
    """Split inputs and targets into aligned mini-batches along the first axis.

    Despite its name, the function has always returned the samples in their
    original order. That stays the default so existing callers get identical
    batches; pass ``shuffle=True`` to draw a fresh random order first.

    :param X_train: Input tensor; samples along dimension 0.
    :param Y_train: Target tensor with the same number of samples.
    :param mini_batch_size: Maximum number of samples per batch (the last
        batch may be smaller).
    :param shuffle: When True, apply one random permutation to both tensors
        before splitting (uses PyTorch's global random generator).
    :return: List of ``(x_batch, y_batch)`` tuples.
    """
    if shuffle:
        # A single permutation shared by both tensors keeps every (x, y) pair together.
        permutation = torch.randperm(X_train.shape[0], device=X_train.device)
        X_train = X_train[permutation]
        Y_train = Y_train[permutation.to(Y_train.device)]

    x_chunks = torch.split(X_train, mini_batch_size)
    y_chunks = torch.split(Y_train, mini_batch_size)
    return [(x_chunks[i], y_chunks[i]) for i in range(len(x_chunks))]
def setup_seed(seed):
    """Seed PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``.

    Also switches cuDNN to its deterministic mode.

    :param seed: Integer seed applied to every generator.
    """
    seeders = (
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed_all,  # every CUDA device
        np.random.seed,              # NumPy global generator
        random.seed,                 # Python standard library generator
    )
    for seed_generator in seeders:
        seed_generator(seed)
    torch.backends.cudnn.deterministic = True


In [7]:
import torch
import numpy as np
import random
import math
import os 
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from utils import makedirs

def run_training(x_train,y_train,x_test,y_test,args,loss_func):
    """Train ``args.ensemble_size`` models on the given data and return them.

    The four data arguments are read through ``.values`` and
    ``torch.from_numpy`` (so pandas objects are expected), cast to float32 and
    moved to the notebook-level ``device``. The inputs additionally receive a
    singleton axis at position 1. Member ``k`` is trained with ``seeds[k]``
    and writes its files to ``<args.save_dir>/model_<k>``.

    :return: List with one trained model per ensemble member.
    """
    print("Train data:", len(x_train))
    print("Test data:", len(x_test))

    def as_device_tensor(table):
        return torch.from_numpy(table.values).float().to(device)

    # (n_samples, n_features) -> (n_samples, 1, n_features)
    inputs_train = torch.unsqueeze(as_device_tensor(x_train), 1)
    inputs_test = torch.unsqueeze(as_device_tensor(x_test), 1)
    targets_train = as_device_tensor(y_train)
    targets_test = as_device_tensor(y_test)

    ensemble_models = []
    for member in range(args.ensemble_size):
        member_seed = seeds[member]
        member_dir = os.path.join(args.save_dir, f'model_{member}')
        makedirs(member_dir)
        trained_model = run_training_single_model(
            inputs_train, targets_train, inputs_test, targets_test,
            member_seed, args, member_dir, member, loss_func)
        ensemble_models.append(trained_model)
    return ensemble_models

def run_training_single_model(x_train,y_train,x_test,y_test,seed,args,model_idx_result_dir,model_idx,loss_func):
    """Train one UA_CNN and return a copy restored from its best checkpoint.

    Samples whose target equals 0 are left out of every loss and metric
    (NOTE(review): presumably 0 marks a missing label - confirm). The filter is
    a boolean mask over flattened tensors, so mini-batches with a single
    sample are handled correctly and mini-batches without any usable target
    are skipped instead of producing an empty (NaN) loss.

    After each epoch the model is scored on the test tensors and its weights
    are written to ``best_test_model.pth`` whenever R2 improves.
    NOTE(review): checkpoint selection therefore uses the same data that is
    later reported as test performance, which biases those scores optimistically.

    :param x_train: Training inputs, shape (n, 1, length), already on ``device``.
    :param y_train: Training targets, one per sample.
    :param x_test: Test inputs, same layout as ``x_train``.
    :param y_test: Test targets.
    :param seed: Seed passed to ``setup_seed`` before the model is created.
    :param args: Namespace providing ``epochs``, ``LR`` and ``mb_size``.
    :param model_idx_result_dir: Directory that receives the checkpoint file.
    :param model_idx: Index of this ensemble member (used for logging only).
    :param loss_func: Callable ``loss_func(prediction, target)`` for the main loss term.
    :return: A fresh ``UA_CNN`` loaded with the best checkpoint's weights.
    """
    setup_seed(seed)
    model = UA_CNN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.LR)
    checkpoint_path = os.path.join(model_idx_result_dir, 'best_test_model.pth')
    r2_best = -math.inf
    checkpoint_written = False

    # The test targets never change, so filter them once.
    test_mask = y_test.reshape(-1) != 0
    y_test_valid = y_test.reshape(-1)[test_mask]
    y_test_valid_np = y_test_valid.cpu().numpy()

    for epoch in range(args.epochs):
        model.train()
        for batch_x, batch_y in random_mini_batches(x_train, y_train, args.mb_size):
            batch_mask = batch_y.reshape(-1) != 0
            if not bool(batch_mask.any()):
                # Nothing to learn from in this batch.
                continue

            batch_pred, _, batch_log_var = model(batch_x, grid_point)
            target = batch_y.reshape(-1)[batch_mask]
            pred = batch_pred.reshape(-1)[batch_mask]
            log_var = batch_log_var.reshape(-1)[batch_mask]

            mse_loss = loss_func(pred, target)
            h_loss = heteroscedastic_loss(target, pred, log_var)
            loss = mse_loss + heteroscedastic_loss_coefficient * h_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            y_test_pre, _, y_test_pre_var = model(x_test, grid_point)
            test_pred = y_test_pre.reshape(-1)[test_mask]
            test_log_var = y_test_pre_var.reshape(-1)[test_mask]

            mse_loss_test = loss_func(test_pred, y_test_valid)
            h_loss_test = heteroscedastic_loss(y_test_valid, test_pred, test_log_var)
            loss_test = mse_loss_test + heteroscedastic_loss_coefficient * h_loss_test

            test_pred_np = test_pred.cpu().numpy()
            MAE2 = mean_absolute_error(y_test_valid_np, test_pred_np)
            MSE = mean_squared_error(y_test_valid_np, test_pred_np)
            r2_score_v = r2_score(y_test_valid_np, test_pred_np)

            if r2_best < r2_score_v:
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_written = True
                r2_best = r2_score_v
            if (epoch+1)%100==0:
                print('Iter-{}; Total loss: {:.4}; MAE2: {:.4}; MSE: {:.4} r2_score_v: {:.4}'.format(epoch, loss_test.item(), MAE2, MSE, r2_score_v))

    if not checkpoint_written:
        # No epoch produced a comparable R2 (e.g. zero epochs or NaN scores).
        # Save the final weights so the load below cannot pick up a stale file
        # left by an earlier call that used the same directory.
        torch.save(model.state_dict(), checkpoint_path)

    model_test = UA_CNN().to(device)
    model_test.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f"Done with model {model_idx}")
    return model_test
        

In [8]:
def evaluate_models(models,loss_func,x_test,y_test,test=True):
    """Score an ensemble and derive per-sample uncertainty estimates.

    Samples whose target equals 0 are excluded from predictions, metrics and
    uncertainties. All result buffers are sized by the number of samples that
    remain and by ``len(models)``, so the function no longer fails with a
    shape mismatch when some targets are 0, and it no longer depends on
    ``args.ensemble_size`` matching the number of models passed in.

    :param models: Sequence of trained models (at least one).
    :param loss_func: Callable ``loss_func(prediction, target)``.
    :param x_test: Inputs; read through ``.values`` (pandas object expected).
    :param y_test: Targets; read through ``.values``.
    :param test: Only selects the wording ("test" / "train") of the summary line.
    :return: Tuple ``(mae, mse, r2, ensemble_predictions, epi_std, ale_std,
        total_std, total_std2)``. ``ensemble_predictions`` is a 1-D array with
        the mean prediction per kept sample. The four uncertainty entries are
        nested lists of shape (n_kept, 1): the standard deviation across
        members, the square root of the mean ``exp(log_var)``, their sum, and
        the square root of the summed variances.
    :raises ValueError: If ``models`` is empty.
    """
    n_models = len(models)
    if n_models == 0:
        raise ValueError("evaluate_models needs at least one model")

    x_test = torch.from_numpy(x_test.values).float().to(device)
    y_test = torch.from_numpy(y_test.values).float().to(device)
    x_test = torch.unsqueeze(x_test, 1)

    # Boolean mask of the samples that take part in the evaluation.
    valid_mask = y_test.reshape(-1) != 0
    y_valid = y_test.reshape(-1)[valid_mask]
    y_valid_np = y_valid.cpu().numpy()
    n_valid = int(valid_mask.sum().item())

    predictions = []
    sum_ale_uncs = np.zeros((n_valid, 1))
    all_preds = np.zeros((n_valid, 1, n_models))

    for model_idx, model in enumerate(models):
        with torch.no_grad():
            y_test_pre, _, y_test_pre_var = model(x_test, grid_point)
        member_pred = y_test_pre.reshape(-1)[valid_mask]
        member_log_var = y_test_pre_var.reshape(-1)[valid_mask]
        predictions.append(member_pred.tolist())

        member_pred_np = member_pred.cpu().numpy()
        all_preds[:, :, model_idx] = member_pred_np.reshape(-1, 1)
        # The second head predicts a log-variance; accumulate the variance itself.
        sum_ale_uncs += np.exp(member_log_var.cpu().numpy().reshape(-1, 1))

        mse_loss_test = loss_func(member_pred, y_valid)
        h_loss_test = heteroscedastic_loss(y_valid, member_pred, member_log_var)
        loss_test = mse_loss_test + heteroscedastic_loss_coefficient * h_loss_test
        MAE_test = mean_absolute_error(y_valid_np, member_pred_np)
        MSE = mean_squared_error(y_valid_np, member_pred_np)
        r2_test = r2_score(y_valid_np, member_pred_np)

        if args.show_individual_scores:
            # Individual test scores
            print('Model {} test Total loss: {:.4}; MAE2: {:.4}; MSE: {:.4}, r2_score_v: {:.4}'.format(model_idx, loss_test.item(), MAE_test, MSE, r2_test))

    ensemble_predictions = np.mean(predictions, axis=0)
    ensemble_MAE_test = mean_absolute_error(y_valid_np, ensemble_predictions)
    ensemble_MSE_test = mean_squared_error(y_valid_np, ensemble_predictions)
    ensemble_r2_test = r2_score(y_valid_np, ensemble_predictions)

    # NOTE(review): no ensemble-level loss is computed; the "Total loss" shown
    # in the summary line is the loss of the LAST ensemble member. The MAE,
    # MSE and R2 values are true ensemble metrics.
    if test == True:
        print('Ensemble model test Total loss: {:.4}; MAE2: {:.4}; MSE: {:.4},  r2_score_v: {:.4}'.format(loss_test.item(), ensemble_MAE_test,ensemble_MSE_test, ensemble_r2_test))
    else:
        print('Ensemble model train Total loss: {:.4}; MAE2: {:.4}; MSE: {:.4},  r2_score_v: {:.4}'.format(loss_test.item(), ensemble_MAE_test,ensemble_MSE_test, ensemble_r2_test))

    # Mean predicted variance across members, and variance of the member
    # predictions themselves.
    avg_ale_uncs = sum_ale_uncs / n_models
    avg_epi_uncs = np.var(all_preds, axis=2)

    # Take the element-wise square root to turn variances into standard deviations.
    epi_std = np.sqrt(avg_epi_uncs)
    ale_std = np.sqrt(avg_ale_uncs)
    total_std = epi_std + ale_std                        # sum of the two standard deviations
    total_std2 = np.sqrt(avg_epi_uncs + avg_ale_uncs)    # root of the summed variances

    return (ensemble_MAE_test, ensemble_MSE_test, ensemble_r2_test, ensemble_predictions,
            epi_std.tolist(), ale_std.tolist(), total_std.tolist(), total_std2.tolist())

In [None]:
import numpy as np
import time, datetime
from pathlib import Path
from copy import deepcopy
# Fix the random state that decides the initial labelled subset drawn below.
setup_seed(42)

results_root = Path(args.save_dir)
results_root.mkdir(parents=True, exist_ok=True)

# Summary table: one row is appended per (strategy, active-learning step).
df = pd.DataFrame(
    columns=["Trial", "Train Data Ratio", "Score_mae","Score_mse", "Uncertainty", "Entropy"])

### Size of the pool, of the final training set, and number of AL steps
n_total = len(x_train)
n_loops = args.num_al_loops
n_sample = n_total

### Optional early stop: grow the training set only up to a fraction of the pool
if args.al_end_ratio is not None:
    if args.al_end_ratio > 1:
        raise ValueError("Arg al_end_ratio must be less than train size")
    total_data = len(x_train) + len(x_test)
    early_stop_num = int(n_total * args.al_end_ratio)
    n_sample = early_stop_num

### Initial labelled subset: a random draw without replacement from the pool
n_start = int(n_total * args.al_init_ratio)
train_subset_inds_start = np.random.choice(n_total, n_start, replace=False)

# Prints the mean of the training targets (NaN entries ignored).
print(f"Ratio targets 0/1: {np.nanmean(np.array(y_train), axis=0)}")

### Training-set size at every step, spaced linearly or logarithmically
if args.al_step_scale not in ("linear", "log"):
    raise ValueError(f"unknown args.al_step_scale = {args.al_step_scale}")
if args.al_step_scale == "log":
    n_samples_per_run = np.logspace(np.log10(n_start), np.log10(n_sample), n_loops)
else:
    n_samples_per_run = np.linspace(n_start, n_sample, n_loops)
n_samples_per_run = np.round(n_samples_per_run).astype(int)

# Main loss term; the heteroscedastic term is added inside the training code.
loss_func = nn.MSELoss()
# Active-learning experiment: for every acquisition strategy, repeatedly train
# an ensemble on the currently labelled subset, score it, log the results and
# then acquire the next batch of pool samples according to the strategy.
for strategy in args.al_strategy:
    # Every strategy restarts from the same initial subset so their learning
    # curves are directly comparable.
    train_subset_inds = np.copy(train_subset_inds_start)

    tic_time = time.time() # start time of this strategy; tags its log file names
    i_trial = 0

    ### Main active learning loop
    for i in range(n_loops):
        print(f"===> [{strategy}] Running trial {i_trial} with {n_samples_per_run[i]} samples")
        current_x_train_data = x_train.iloc[train_subset_inds]
        current_y_train_data = y_train.iloc[train_subset_inds]

        ### Train with the data subset, return the best models
        models = run_training(
            current_x_train_data, current_y_train_data,x_test,y_test, args, loss_func)

        if "explor" in strategy or "score" in strategy or "exploit" in strategy:
            # Score the WHOLE pool (labelled and unlabelled part) to get one
            # acquisition weight per pool sample.
            ensemble_MAE_train, ensemble_MSE_train, ensemble_r2_train, all_train_preds, all_train_std,all_train_ale_std,all_train_total_std,all_train_total_std_2 = evaluate_models(models,loss_func,x_train,y_train,test=False)
            sq_error = np.square(np.array(y_train) - all_train_preds)
            # Root of the per-sample squared error, i.e. the absolute error.
            rmse = np.sqrt( sq_error.astype(np.float32))

            mean_uncertainty = np.mean(all_train_std, axis=1)
            mean_ale_uncertainty = np.mean(all_train_ale_std, axis=1)
            mean_total_uncertainty = np.mean(all_train_total_std, axis=1)
            mean_total_uncertainty_2 = np.mean(all_train_total_std_2, axis=1)
            if "explorative_greedy" == strategy:
                per_sample_weight = mean_uncertainty
            elif "explor_ale" == strategy:
                per_sample_weight = mean_ale_uncertainty
            elif "explor_total"  == strategy:
                per_sample_weight = mean_total_uncertainty
            elif "explor_total_2"  == strategy:
                per_sample_weight = mean_total_uncertainty_2
            elif "score_greedy" == strategy:
                per_sample_weight = rmse
            elif "exploit" in strategy:
                # Work on a copy: the in-place updates below must not alter
                # the ensemble predictions themselves.
                per_sample_weight = np.array(all_train_preds, dtype=float)
                # Reverse and make sure weights (preds) are positive
                if args.acquire_min:
                    per_sample_weight *= -1

                std_mult = args.al_std_mult
                if "_lcb" in strategy: # lower confidence bound
                    per_sample_weight += -std_mult * mean_uncertainty
                elif "_ucb" in strategy: # upper confidence bound
                    per_sample_weight += +std_mult * mean_uncertainty
                elif "_ts" in strategy: # thompson sampling
                    per_sample_weight = np.random.normal(
                        per_sample_weight, mean_uncertainty)

                per_sample_weight -= per_sample_weight.min()
            elif "explorative" in strategy:
                # e.g. "explorative_sample": same weights as the greedy
                # variant; the sampling itself happens at acquisition time.
                per_sample_weight = mean_uncertainty
            elif "score" in strategy:
                # e.g. "score_sample"
                per_sample_weight = rmse
            else:
                # Without this branch the weights of a previous strategy
                # would be silently reused.
                raise ValueError(f"Unknown active learning strategy {strategy}")

            ### Save every pool sample along with its uncertainties/errors
            train_subset_mask = np.zeros((n_total,))
            train_subset_mask[train_subset_inds] = 1
            x_train_index = np.array(x_train.index)
            df_scores = pd.DataFrame(data={
                        "X_train": x_train_index,
                        "Uncertainty": mean_uncertainty,
                        "Ale_Uncertainty": mean_ale_uncertainty,
                        "Total_Uncertainty": mean_total_uncertainty,
                        "Total_Uncertainty_2": mean_total_uncertainty_2,
                        "Error": rmse,
                        "TrainInds": train_subset_mask
                    })
            Path(os.path.join(results_root, "tracks")).mkdir(
                    parents=True, exist_ok=True)
            df_scores.to_csv(os.path.join(results_root, "tracks",
                f"{strategy}_step_{i}_{tic_time}.csv"))

        elif strategy == "random":
            per_sample_weight = np.ones((n_total,)) # uniform
        else:
            raise ValueError(f"Unknown active learning strategy {strategy}")

        ensemble_MAE_test,ensemble_MSE_test,ensemble_r2_test,test_preds,test_std,test_ale_std,test_total_std,test_total_std_2 = evaluate_models(models,loss_func,x_test,y_test,test=True)

        ### Compute the top-k percent acquired
        # Target values of the k best samples in the whole pool ...
        top_k_scores_in_pool = np.sort(y_train)
        top_k_scores_in_pool = top_k_scores_in_pool[:args.al_topk] \
                                if args.acquire_min else \
                                top_k_scores_in_pool[-args.al_topk:]

        # ... and of the k best samples among those already acquired.
        top_k_scores_in_selection = np.sort(current_y_train_data)
        top_k_scores_in_selection = top_k_scores_in_selection[:args.al_topk] \
                                if args.acquire_min else \
                                top_k_scores_in_selection[-args.al_topk:]

        # Overlap is determined by comparing target VALUES, not indices.
        selection_overlap = np.in1d(top_k_scores_in_selection,
                                    top_k_scores_in_pool)

        # Compute the percent overlap
        percent_top_k_overlap = np.mean(selection_overlap) * 100
        ###

        df = pd.concat([df, pd.DataFrame([{
            'Train Data Ratio': n_samples_per_run[i] / float(n_total),
            'Score_mae': np.mean(ensemble_MAE_test),
            'Score_mse': np.mean(ensemble_MSE_test),
            'Score_R2': np.mean(ensemble_r2_test),
            'Uncertainty': np.mean(test_std),
            'Ale_Uncertainty': np.mean(test_ale_std),
            'Total_Uncertainty': np.mean(test_total_std),
            'Total_Uncertainty_2': np.mean(test_total_std_2),
            'Standard Deviation': np.mean(test_std),
            'Trial': i_trial,
            'Strategy': strategy,
            'Top_k':percent_top_k_overlap,
            'Tasks': 'XRD',
        }])], ignore_index=True)

        ### Save the complete test performance (including uncs) to log
        test_error = test_preds - np.array(y_test)
        log_data_dict = {"Error_0": test_error}

        x_test_index = np.array(x_test.index)

        log_data_dict.update({
            "X_test": x_test_index,
            "Uncertainty": np.mean(test_std, 1),
            "Ale_Uncertainty": np.mean(test_ale_std, 1),
            "Total_Uncertainty": np.mean(test_total_std, 1),
            "Total_Uncertainty_2": np.mean(test_total_std_2, 1),
            "Std": np.mean(test_std, 1),
            "TopK": percent_top_k_overlap,
            "Train Data Ratio": n_samples_per_run[i]/float(n_total),
        })
        df_test_log = pd.DataFrame(data=log_data_dict)
        Path(os.path.join(results_root, "scores")).mkdir(
            parents=True, exist_ok=True)
        df_test_log.to_csv(os.path.join(results_root, "scores",
            f"{strategy}_step_{i}_{tic_time}.csv"))

        ### Log the test samples with the most extreme predictions
        n_top = args.al_topk  # Use a parameter to dynamically determine how many top values to select
        top_indices = np.argsort(test_preds)[:n_top] if args.acquire_min else np.argsort(test_preds)[-n_top:]
        top_kd_values = np.array(y_test)[top_indices]
        top_pred_values = test_preds[top_indices]

        df_top = pd.DataFrame({
            'Strategy': strategy,
            'Trial': i_trial,
            'Top_Kd_Values': top_kd_values,
            'Top_Pred_Values': top_pred_values
        })
        df_top.to_csv(os.path.join(results_root, f"{strategy}_top_{n_top}_kd_{i}_{tic_time}.csv"), index=False)

        print("Percent top-k = {}".format(round(percent_top_k_overlap, 2)))

        ### Add new samples to training set
        n_add = n_samples_per_run[min(i+1, n_loops-1)] - n_samples_per_run[i]
        if n_add > 0: # n_add = 0 on the last iteration, when we are done

            if "sample" in strategy or strategy == "random":
                # Stochastic acquisition. The "random" baseline belongs here:
                # a greedy pick on its uniform weights would deterministically
                # select whatever indices the sort happens to put last.
                per_sample_prob = deepcopy(per_sample_weight)

                # Exclude data we've already trained with, and normalize to probability
                per_sample_prob[train_subset_inds] = 0.0
                per_sample_prob = per_sample_prob / per_sample_prob.sum()

                # Note: run_training re-seeds NumPy's global generator for
                # every ensemble member, so these draws are reproducible.
                train_inds_to_add = np.random.choice(n_total, size=n_add, p=per_sample_prob, replace=False)
            else:
                # Greedy acquisition: take the n_add highest-weighted samples
                # that are not in the training set yet. Acquired samples get
                # -inf (rather than 0) so they can never tie with, and be
                # picked instead of, a pool sample whose weight is 0.
                greedy_scores = np.array(per_sample_weight, dtype=float)
                greedy_scores[train_subset_inds] = -np.inf
                inds_sorted = np.argsort(greedy_scores) # smallest to largest
                train_inds_to_add = inds_sorted[-n_add:] # grab the last k inds

            # Add the indices to the training set
            train_subset_inds = np.append(train_subset_inds, train_inds_to_add)
        i_trial = i_trial+1
        # Release the ensemble before the next step trains a new one.
        del models
        torch.cuda.empty_cache()
        
# Write the summary table (one row per strategy and step) to a timestamped CSV.
# `datetime` is the module here (see the `import time, datetime` in this cell).
timestamp = "{:%Y-%m-%d_%H-%M-%S.%f}".format(datetime.datetime.now())
csv_filename = "{}_{}.csv".format(timestamp, args.task_inds)
csv_save_path = os.path.join(results_root, csv_filename)
df.to_csv(csv_save_path)
print("Done with all folds and saved into {}".format(results_root))
print("CSV file saved at: {}".format(csv_save_path))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Reload the summary table and draw test MAE against the training-set fraction,
# one curve per acquisition strategy.
df = pd.read_csv(csv_save_path)

train_data_ratio, mae = df["Train Data Ratio"], df["Score_mae"]

strategies = df["Strategy"].unique()

plt.figure(figsize=(10, 6))
all_coords_mae = []

for strategy in strategies:
    strategy_df = df.loc[df["Strategy"] == strategy]
    # Keep the plotted points (tagged with their strategy) for the CSV export.
    coords_mae = strategy_df[["Train Data Ratio", "Score_mae"]].assign(Strategy=strategy)
    plt.plot(coords_mae["Train Data Ratio"], coords_mae["Score_mae"], label=strategy)
    all_coords_mae.append(coords_mae)

plt.title("MAE vs Train Data Ratio for Different Strategies")
plt.xlabel("Train Data Ratio")
plt.ylabel("MAE")
plt.legend()
plt.grid(True)
plt.show()

# Export the coordinates of every curve next to the other results of this run.
mae_coords_save_path = os.path.join(args.save_dir, "mae_vs_train_data_ratio_all_coordinates.csv")
all_coords_mae_df = pd.concat(all_coords_mae)
all_coords_mae_df.to_csv(mae_coords_save_path, index=False)

print("All MAE coordinates saved to: {}".format(mae_coords_save_path))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Reload the summary table and draw test MSE against the training-set fraction,
# one curve per acquisition strategy.
df = pd.read_csv(csv_save_path)
train_data_ratio = df["Train Data Ratio"]
mse = df["Score_mse"]
strategies = df["Strategy"].unique()

plt.figure(figsize=(10, 6))
all_coords = []
for strategy in strategies:
    strategy_df = df[df["Strategy"].eq(strategy)]
    x_column = strategy_df["Train Data Ratio"]
    y_column = strategy_df["Score_mse"]
    plt.plot(x_column, y_column, label=strategy)

    # Remember the plotted points, labelled with the strategy, for export.
    coords = strategy_df[["Train Data Ratio", "Score_mse"]].copy()
    coords["Strategy"] = strategy
    all_coords.append(coords)

plt.title("MSE vs Train Data Ratio for Different Strategies")
plt.ylabel("MSE")
plt.xlabel("Train Data Ratio")
plt.legend()
plt.grid(True)
plt.show()

# Export the coordinates of every curve into the run's results folder.
coords_save_path = os.path.join(args.save_dir, "mse_vs_train_data_ratio_all_coordinates.csv")
all_coords_df = pd.concat(all_coords)
all_coords_df.to_csv(coords_save_path, index=False)

print("All coordinates saved to: {}".format(coords_save_path))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Reload the summary table and draw test R2 against the training-set fraction
# for the exploration-style strategies and the baselines.
df = pd.read_csv(csv_save_path)
train_data_ratio = df["Train Data Ratio"]

# Strategies to show, in plotting order. Only those that actually occur in the
# results are kept: plotting a strategy that was never run would add an empty
# curve and a misleading legend entry.
preferred_strategies = ['explorative_greedy', 'explor_total' ,'explor_total_2' ,'explor_ale',
 'score_greedy','random' ]
available_strategies = set(df["Strategy"].unique())
strategies = [name for name in preferred_strategies if name in available_strategies]
print(strategies)
plt.figure(figsize=(10, 6))

all_coords = []

for strategy in strategies:
    strategy_df = df[df["Strategy"] == strategy]
    plt.plot(strategy_df["Train Data Ratio"], strategy_df["Score_R2"], label=strategy)
    # Keep the plotted points (tagged with their strategy) for the CSV export.
    coords = strategy_df[["Train Data Ratio", "Score_R2"]].copy()
    coords["Strategy"] = strategy
    all_coords.append(coords)

plt.xlabel("Train Data Ratio")
plt.ylabel("R2")
plt.title("R2 vs Train Data Ratio for Different Strategies")
plt.legend()
plt.grid(True)
plt.show()

# Export the coordinates of every curve into the run's results folder.
all_coords_df = pd.concat(all_coords)
coords_save_path = os.path.join(args.save_dir, "R2_vs_train_data_ratio_all_coordinates.csv")
all_coords_df.to_csv(coords_save_path, index=False)

print(f"All coordinates saved to: {coords_save_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

# Sample efficiency: relative MSE reduction of each strategy with respect to
# the "random" baseline at the same active-learning step, in percent.
random_data = df[df["Strategy"] == "random"]
strategies = df["Strategy"].unique()
if random_data.empty and len(strategies) > 0:
    raise ValueError(
        'Sample efficiency is measured against the "random" strategy, '
        'but the results contain no rows for it')
strategies_data = {strategy: df[df["Strategy"] == strategy] for strategy in strategies}

random_mse = random_data["Score_mse"].values
sample_efficiency = {}
all_coords = []

# The loop variables are deliberately NOT called `data` / `mse`: those names
# are used at notebook level (`data` is the loaded dataset) and must not be
# overwritten by running this cell.
for strategy, strategy_df in strategies_data.items():
    strategy_mse = strategy_df["Score_mse"].values
    # Rows are matched by position, so every strategy is expected to have
    # logged the same steps as the baseline.
    efficiency = (random_mse - strategy_mse) / random_mse * 100
    sample_efficiency[strategy] = efficiency
    coords = pd.DataFrame({
        "Train Data Ratio": strategy_df["Train Data Ratio"],
        "Sample Efficiency (%)": efficiency,
        "Strategy": strategy
    })
    all_coords.append(coords)

train_data_ratio = random_data["Train Data Ratio"]
plt.figure(figsize=(10, 6))

for strategy in sample_efficiency.keys():
    plt.plot(train_data_ratio, sample_efficiency[strategy], marker='o', label=strategy)

# Zero line = performance of the random baseline itself.
plt.plot(train_data_ratio, [0] * len(train_data_ratio), linestyle='--', color='gray', label='Random')

plt.xlabel("Train Data Ratio")
plt.ylabel("Sample Efficiency (%)")
plt.title("Change in Sample Efficiency for Different Acquisition Strategies")
plt.legend()
plt.grid(True)
plt.show()

# Export the coordinates of every curve into the run's results folder.
all_coords_df = pd.concat(all_coords)
coords_save_path = os.path.join(args.save_dir, "change_in_sample_efficiency_all_coordinates.csv")
all_coords_df.to_csv(coords_save_path, index=False)

print(f"All coordinates saved to: {coords_save_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Percentage of the pool's top-k target values already present in the
# training set, per strategy, against the training-set fraction.
strategies = df["Strategy"].unique()

strategy_data = {}
for strategy in strategies:
    strategy_data[strategy] = df[df['Strategy'] == strategy]
top_k_data = {strategy: strategy_data[strategy]['Top_k'].values for strategy in strategies}
x_values = df['Train Data Ratio'].unique()
all_coords = []

plt.figure(figsize=(10, 6))
for strategy in strategies:
    # Use the ratios logged for THIS strategy rather than the global
    # `x_values`: the result is the same when all strategies logged the same
    # steps, and it keeps x and y the same length when one of them did not.
    strategy_ratios = strategy_data[strategy]['Train Data Ratio'].values
    plt.plot(strategy_ratios, top_k_data[strategy], marker='o', label=strategy.replace('_', ' ').title())
    coords = pd.DataFrame({
        "Train Data Ratio": strategy_ratios,
        "% of top-k scores found": top_k_data[strategy],
        "Strategy": strategy
    })
    all_coords.append(coords)

plt.xlabel('Train Data Ratio')
plt.ylabel('% of top-k scores found')
plt.title('Comparison of Different Strategies')
plt.legend()
plt.grid(True)
plt.show()

# Export the coordinates of every curve into the run's results folder.
all_coords_df = pd.concat(all_coords)
coords_save_path = os.path.join(args.save_dir, "comparison_of_different_strategies_all_coordinates.csv")
all_coords_df.to_csv(coords_save_path, index=False)

print(f"All coordinates saved to: {coords_save_path}")

