In [1]:
import pandas as pd

from ec_number_prediction.data_processing_pipeline.cdhit_clusters import ClustersIdentifier

# Load the clusters produced by the CD-HIT step for the `all_sequences` file
# (identity_threshold=80 is presumably an 80% sequence-identity cut-off — confirm).
# NOTE(review): this folder is given relative to "./" while merged_dataset.csv is
# read from "../data" — confirm both resolve from the same working directory.
clusters = ClustersIdentifier.from_files(
    folder="./data/clusters/",
    filename='all_sequences',
    identity_threshold=80,
)

In [2]:
import pandas as pd

# Full dataset; one row per accession (the "accession" column is used as the key below).
merged_dataset = pd.read_csv(
    filepath_or_buffer="../data/merged_dataset.csv",
)

  merged_dataset = pd.read_csv("../data/merged_dataset.csv")


In [3]:
import numpy as np

# Draw one random member per cluster as its representative.
# A seeded generator makes the selection (and every split derived from it)
# reproducible across kernel restarts; 123 matches the random_state used for
# sampling elsewhere in this notebook.
rng = np.random.default_rng(123)

representatives = []
for cluster in clusters.cluster_to_members:
    members = np.array(clusters.cluster_to_members[cluster].members)
    representatives.append(rng.choice(members))

In [4]:
# Keep only the rows whose accession was picked as a cluster representative.
representatives_dataset = merged_dataset.loc[
    merged_dataset["accession"].isin(representatives)
]

In [5]:
from skmultilearn.model_selection import IterativeStratification

def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or a NumPy array."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def apply_stratification_sklearn(X: np.ndarray, y: np.ndarray, test_size: float = 0.15, train_size: float = 0.85, n_splits=2):
    """
    Split a multi-label dataset into folds with skmultilearn's IterativeStratification.

    Parameters
    ----------
    X : np.ndarray, pd.Series or pd.DataFrame
        Samples
    y : np.ndarray or pd.DataFrame
        Labels, one row per sample
    test_size : float, optional
        Fraction of the samples requested for every fold, by default 0.15
    train_size : float, optional
        Size of the train set, by default 0.85. Not used by the current
        implementation; kept so that existing calls keep working.
    n_splits : int, optional
        Number of folds to generate, by default 2

    Returns
    -------
    List[Tuple]
        One ``(X_train, y_train, X_test, y_test)`` tuple per fold
    """
    # NOTE(review): every fold is asked for `test_size` of the samples, so the
    # requested distribution only sums to 1 when test_size == 1 / n_splits (as in
    # the 5-fold / 0.20 call in this notebook) — confirm how IterativeStratification
    # treats other combinations, including the defaults of this function.
    stratifier = IterativeStratification(
        n_splits=n_splits,
        order=1,
        sample_distribution_per_fold=[test_size] * n_splits,
    )

    folds = []
    for train_indexes, test_indexes in stratifier.split(X, y):
        # The stratifier yields positional indexes, so rows are always selected
        # by position, whatever container X and y are stored in.
        folds.append(
            (
                _take_rows(X, train_indexes),
                _take_rows(y, train_indexes),
                _take_rows(X, test_indexes),
                _take_rows(y, test_indexes),
            )
        )

    return folds

In [6]:
def generate_stats(y_train: np.ndarray, y_test: np.ndarray, y_val: np.ndarray=None):
    """
    Summarise how the positives of every label are distributed across the splits.

    Parameters
    ----------
    y_train : np.ndarray or pd.DataFrame
        Labels of the train set
    y_test : np.ndarray or pd.DataFrame
        Labels of the test set
    y_val : np.ndarray or pd.DataFrame, optional
        Labels of the validation set, by default None

    Returns
    -------
    Tuple[pd.DataFrame, Any]
        Long-format DataFrame (columns ``EC``, ``variable``, ``Percentage of data``)
        with the percentage of each label that falls in each split, and a styled
        table with the descriptive statistics of those percentages
    """
    # Sum along axis 0 explicitly to get one count per label: relying on the
    # default axis is a deprecated reduction for DataFrames and collapses a
    # 2-D array into a single scalar.
    counts_per_split = {"train": np.sum(y_train, axis=0), "test": np.sum(y_test, axis=0)}
    if y_val is not None:
        counts_per_split["validation"] = np.sum(y_val, axis=0)
    split_names = list(counts_per_split)

    sum_of_all = pd.DataFrame(list(counts_per_split.values()), index=split_names)

    # Per-label total over all splits, taken before the percentage rows are
    # appended. skipna=False keeps a label missing from one split as NaN.
    totals = sum_of_all.sum(axis=0, skipna=False)

    # Row order matters: it fixes the row order of the returned long-format frame.
    relative_rows = [("Test relative split", "test"), ("Train relative split", "train")]
    if y_val is not None:
        relative_rows.insert(0, ("Validation relative split", "validation"))
    for row_label, split in relative_rows:
        sum_of_all.loc[row_label, :] = sum_of_all.loc[split, :] / totals * 100

    df = pd.melt(sum_of_all.T.reset_index(), id_vars=['index']).rename(columns={'index': 'EC', 'value': 'Percentage of data'})
    # Keep the percentage rows only; the raw counts are dropped.
    df = df[~df["variable"].isin(split_names)]

    described_rows = ["Test relative split", "Train relative split"]
    if y_val is not None:
        described_rows.append("Validation relative split")
    stats_table = pd.concat([sum_of_all.loc[row_label, :].describe() for row_label in described_rows], axis=1)

    # "count" is just the number of labels, so it is left out of the table.
    stats_table = stats_table.drop(index="count")
    table_styled = stats_table.style.background_gradient(cmap="YlGn")

    return df, table_styled

In [7]:
# Samples are identified by accession; every column from position 8 onwards is
# treated as a label column and cast to int via float.
X = representatives_dataset["accession"]
y = representatives_dataset.iloc[:, 8:].astype(float).astype(int)

In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def split_algorithm(X, y, test_size=0.20):
    """
    Greedily build a test set holding roughly ``test_size`` of each label's positives.

    Labels are visited from the least to the most frequent. For each label the
    share of its positives already in the test set is measured, and more rows
    are sampled only when that share is more than 0.05 below ``test_size``
    (the first label with positives is always sampled).

    Parameters
    ----------
    X : pd.Series or pd.DataFrame
        Samples; must share its index with ``y``
    y : pd.DataFrame
        Label matrix, one column per label, with 1 marking a positive
    test_size : float, optional
        Target fraction of each label's positives in the test set, by default 0.20

    Returns
    -------
    Tuple
        X_train, y_train, X_test, y_test. The test objects get a fresh 0..n-1
        index; the train objects keep their original index.
    """
    # Order the labels by their number of positives, rarest first.
    label_totals = y.sum()
    order = np.argsort(label_totals.to_numpy())
    remaining_ecs = label_totals.index[order].tolist()
    remaining_ecs_counts = label_totals.to_numpy()[order].tolist()

    # Index labels moved to the test set: the list keeps selection order, the
    # set gives fast membership checks.
    indexes_to_drop = []
    selected = set()

    for ec, counts in zip(tqdm(remaining_ecs, desc="Splitting"), remaining_ecs_counts):
        # A label without positives has nothing to sample (and would divide by zero).
        if counts == 0:
            continue

        ec_index = y.index[(y[ec] == 1).to_numpy()]

        # Share of this label that earlier, rarer labels already put in the test set.
        already_in_test = int(ec_index.isin(selected).sum())
        total_to_add = test_size - (already_in_test / counts)

        # The first sampled label always contributes; later ones only when they
        # are still clearly under-represented.
        if selected and total_to_add <= 0.05:
            continue

        available_cases = X.loc[X.index.isin(ec_index) & ~X.index.isin(selected)]
        if available_cases.shape[0] == 0:
            continue

        n_samples = min(
            available_cases.shape[0],
            max(1, round(total_to_add * available_cases.shape[0])),
        )
        indexes = available_cases.sample(n=n_samples, random_state=123).index

        indexes_to_drop.extend(indexes)
        selected.update(indexes)

    # Materialise the test set once, in selection order, with a fresh index.
    X_test = X.loc[indexes_to_drop].reset_index(drop=True)
    y_test = y.loc[indexes_to_drop].reset_index(drop=True)

    # Everything that was not selected stays in the training set.
    X_train = X.drop(indexes_to_drop)
    y_train = y.drop(indexes_to_drop)

    return X_train, y_train, X_test, y_test



In [8]:
# Build 5 stratified folds of the representatives (0.20 requested per fold) and
# keep the styled per-label statistics table of every fold in `stats`.
folds = apply_stratification_sklearn(X, y, n_splits=5, test_size=0.20)
stats = []
for i, fold in enumerate(folds):
    X_train, y_train, X_test, y_test = fold
    # Only the styled table is kept; the long-format frame is discarded.
    df_with_stats, table_styled = generate_stats(y_train, y_test)
    
    stats.append(table_styled)

In [9]:
# For every fold, collect the rows of the full dataset whose accession is in
# that fold's test split.
test_folds = []

for fold in folds:
    X_train, y_train, X_test, y_test = fold
    test = merged_dataset[merged_dataset["accession"].isin(X_test)]
    test_folds.append(test)

In [10]:
rest_of_train_datasets = []
rest_of_test_datasets = []

# Expand every accession of a fold back into all members of its cluster,
# separately for the train side and the test side.
for fold in folds:
    X_train, y_train, X_test, y_test = fold
    rest_of_train_dataset = []
    rest_of_test_dataset = []

    for fold_accessions, expanded in (
        (X_train, rest_of_train_dataset),
        (X_test, rest_of_test_dataset),
    ):
        for accession in fold_accessions:
            cluster = clusters.get_cluster_by_member(accession).members
            expanded.extend(cluster)

    rest_of_train_datasets.append(rest_of_train_dataset)
    rest_of_test_datasets.append(rest_of_test_dataset)

In [12]:
# Turn the per-fold accession lists into the matching rows of the full dataset.
train_datasets, test_datasets = [], []
accession_column = merged_dataset["accession"]
for i, fold in enumerate(folds):
    train = merged_dataset.loc[accession_column.isin(rest_of_train_datasets[i])]
    test = merged_dataset.loc[accession_column.isin(rest_of_test_datasets[i])]

    train_datasets.append(train)
    test_datasets.append(test)

In [16]:
# Continue with the training set of the first fold.
train_dataset = train_datasets[0]

In [19]:
# Continue with the test set of the first fold.
test_dataset = test_datasets[0]

In [20]:
# Same sample/label layout as for the representatives: accession as the sample,
# columns from position 8 onwards as labels cast to int via float.
X_train = train_dataset["accession"]
y_train = train_dataset.iloc[:, 8:].astype(float).astype(int)

X_test = test_dataset["accession"]
y_test = test_dataset.iloc[:, 8:].astype(float).astype(int)

In [22]:
# Per-label split statistics of the first fold after expanding it to all
# cluster members; the styled table is displayed as the cell output.
df_with_stats, table_styled = generate_stats(y_train, y_test)
table_styled

Unnamed: 0,Test relative split,Train relative split
mean,20.031162,79.968838
std,5.48351,5.48351
min,0.0,18.0
25%,17.0,77.0
50%,19.868996,80.131004
75%,23.0,83.0
max,82.0,100.0
