In [2]:
import pickle


print("Generating new dataset.")


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at artifacts you produced yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables, one per label level (contents are whatever was pickled
# upstream; presumably label-name -> index mappings - TODO confirm).
class_ = _load_pickle('char2idx_class_V1.pkl')
superclass_ = _load_pickle('char2idx_super_V1.pkl')
pathway_ = _load_pickle('char2idx_path_V1.pkl')

# The dataset is a dict of per-sample dicts. Strip underscores from the
# inner keys so the fields can be addressed by their underscore-free names.
dataset = {
    sample_id: {field.replace("_", ""): value for field, value in record.items()}
    for sample_id, record in _load_pickle('datset_class_all_V1.pkl').items()
}

Generating new dataset.


In [5]:
import numpy as np
import pandas as pd

def dataset_split(matrix, n_samples=None, training_split=0.6, validation_split=0.2,
                  multilabel=True, seed=None):
    """
    Split the rows of a label matrix into training, validation and test sets, class by class.

    Rows are grouped by identical label vector (every distinct row is treated as
    one class). Each group is shuffled and cut into consecutive train /
    validation / test portions, so each class is spread over the three sets in
    the same proportions. Groups that contain a single row are skipped: such
    rows appear in none of the returned sets.

    Parameters:
        matrix (np.ndarray | list): A 2D label matrix, one row per sample. A list
            is converted with np.array.
        n_samples (int, optional): Maximum number of rows to keep per class
            (a random subset is kept when a class is larger). None keeps all rows.
        training_split (float): Fraction of each class assigned to training.
        validation_split (float): Fraction of each class assigned to validation.
            The test set receives the remainder.
        multilabel (bool): If True (default) every row is used. If False, only
            rows whose entries sum to 1 are used.
        seed (int, optional): Seed for a private random generator, making the
            split reproducible. None (default) uses the global numpy random
            state, i.e. the result then depends on np.random.seed.

    Returns:
        tuple: (training_indices, validation_indices, test_indices,
        training_samples, validation_samples, test_samples). The index lists
        hold row positions in `matrix`; the sample arrays hold the
        corresponding rows of `matrix`.

    Raises:
        ValueError: If the input is not a 2D array (or a list convertible to
            one), if n_samples is not positive, or if the split fractions are
            negative or sum to more than 1.
    """
    # Convert lists first; checking for ndarray first would reject them outright.
    if isinstance(matrix, list):
        print("Input is a list, converting to numpy array.")
        matrix = np.array(matrix)
    if not isinstance(matrix, np.ndarray):
        raise ValueError("Input must be a numpy array.")
    if matrix.ndim != 2:
        raise ValueError("Input must be a 2D matrix.")
    if n_samples is not None and n_samples < 1:
        raise ValueError("n_samples must be a positive integer or None.")
    if training_split < 0 or validation_split < 0 or training_split + validation_split > 1 + 1e-9:
        raise ValueError("Split fractions must be non-negative and sum to at most 1.")

    # kept_rows maps positions in `candidates` back to row positions in `matrix`,
    # so the returned indices are always valid for the caller's original data.
    if multilabel:
        kept_rows = np.arange(matrix.shape[0])
        candidates = matrix
    else:
        kept_rows = np.where(np.sum(matrix, axis=1) == 1)[0]
        candidates = matrix[kept_rows]

    # Group rows by identical label vector in one pass. `inverse` gives, for each
    # candidate row, the position of its label vector in `unique_rows`.
    classwise_indices = []
    if candidates.shape[0] > 0:
        unique_rows, inverse, counts = np.unique(
            candidates, axis=0, return_inverse=True, return_counts=True
        )
        inverse = np.asarray(inverse).ravel()  # shape differs across numpy versions
        # A stable sort keeps the members of each class in ascending row order.
        order = np.argsort(inverse, kind="stable")
        members_per_class = np.split(order, np.cumsum(counts)[:-1])
        for i, (row, members) in enumerate(zip(unique_rows, members_per_class)):
            # A class with a single row cannot be shared between sets; skip it.
            if len(members) > 1:
                classwise_indices.append(members)
                print(f"Row {i} corresponding to class {np.where(row == 1)[0].tolist()} occurs {len(members)} times in the original matrix.")

    training_indices = []
    validation_indices = []
    test_indices = []

    training_samples = []
    validation_samples = []
    test_samples = []

    # np.random (module) and RandomState both expose .shuffle; using the module
    # when no seed is given preserves the original global-state behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    for members in classwise_indices:
        rng.shuffle(members)
        if n_samples is not None and len(members) > n_samples:
            # The class shown in messages is the argmax of the label row, i.e. the first active label.
            print(f"Class {np.argmax(candidates[members[0]])} has more than {n_samples} samples. Randomly selecting {n_samples} samples.")
            members = members[:n_samples]
        n = len(members)

        train_end = int(training_split * n)
        val_end = int((training_split + validation_split) * n)

        rows = kept_rows[members]
        train_rows, val_rows, test_rows = rows[:train_end], rows[train_end:val_end], rows[val_end:]

        training_indices.extend(train_rows)
        validation_indices.extend(val_rows)
        test_indices.extend(test_rows)
        training_samples.extend(matrix[train_rows])
        validation_samples.extend(matrix[val_rows])
        test_samples.extend(matrix[test_rows])
        # The counts reported here are running totals over all classes processed so far.
        print(f"Class {np.argmax(matrix[rows[0]])} - Training: {len(training_indices)}, Validation: {len(validation_indices)}, Test: {len(test_indices)}")

    training_samples = np.array(training_samples)
    validation_samples = np.array(validation_samples)
    test_samples = np.array(test_samples)

    return training_indices, validation_indices, test_indices, training_samples, validation_samples, test_samples



# Build a SMILES/label table from the single-label samples, split it class by
# class into train/validation/test, and export each part as CSV.
label_key = "class".capitalize()  # the label field is stored under "Class"

# Keep only samples whose label vector sums to 1 (exactly one active class
# for a 0/1 vector).
dataset = {k: v for k, v in dataset.items() if np.sum(v[label_key]) == 1}
smiles_df = [record['SMILES'] for record in dataset.values()]
labels_list = [record[label_key] for record in dataset.values()]

df = pd.DataFrame({'SMILES': smiles_df, label_key: labels_list})
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The split must be computed from the labels in the *shuffled* frame: the
# returned indices are row positions, and they are applied to `df` with .iloc
# below. Using the unshuffled `labels_list` here would make the indices point
# at unrelated rows and silently break the class-wise split.
shuffled_labels = np.array(df[label_key].tolist())

# dataset_split shuffles with the global numpy random state; seed it so the
# exported CSV files are the same on every run.
np.random.seed(42)
train_indices, val_indices, test_indices, _, _, _ = dataset_split(shuffled_labels)

# The three sets must not share rows.
assert len(set(train_indices) | set(val_indices) | set(test_indices)) == (
    len(train_indices) + len(val_indices) + len(test_indices)
), "train/validation/test indices overlap"

train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]
test_df = df.iloc[test_indices]
train_df.to_csv('train_smiles.csv', index=False)
val_df.to_csv('val_smiles.csv', index=False)
test_df.to_csv('test_smiles.csv', index=False)

Row 0 corresponding to class [652] occurs 130 times in the original matrix.
Row 1 corresponding to class [651] occurs 7 times in the original matrix.
Row 2 corresponding to class [650] occurs 25 times in the original matrix.
Row 3 corresponding to class [649] occurs 69 times in the original matrix.
Row 4 corresponding to class [648] occurs 50 times in the original matrix.
Row 5 corresponding to class [647] occurs 35 times in the original matrix.
Row 6 corresponding to class [646] occurs 35 times in the original matrix.
Row 7 corresponding to class [645] occurs 811 times in the original matrix.
Row 8 corresponding to class [644] occurs 5 times in the original matrix.
Row 9 corresponding to class [643] occurs 440 times in the original matrix.
Row 10 corresponding to class [642] occurs 26 times in the original matrix.
Row 11 corresponding to class [641] occurs 10 times in the original matrix.
Row 12 corresponding to class [640] occurs 13 times in the original matrix.
Row 13 corresponding 