Based on the full data subset A with no missing data, the full samples for y=0 and y=1 are learned and new samples are generated using the data and labels from tp2.

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from torch.utils.data import TensorDataset, DataLoader
from skbio.stats.composition import clr, alr, ilr
from scipy.stats import gaussian_kde
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

In [2]:
def evaluate_model(y_true, y_pred, y_pred_proba):
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sensitivity = recall_score(y_true, y_pred)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)
    
    return {
#         'Accuracy': accuracy,
        'Balanced_accuracy': balanced_accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
#         'F1 Score': f1,
        'AUC': auc
    }

In [3]:
pd.set_option('future.no_silent_downcasting', True)

In [4]:
os.environ["PYTHONHASHSEED"] = "0"

In [5]:
warnings.filterwarnings("ignore", category=UserWarning)

In [6]:
# Define K-fold cross-validation
# 1. There are positive samples in both the training and validation sets.
# 2. the class distributions in the training and validation sets are similar to the original dataset.

n_splits = 5
skf = StratifiedKFold(n_splits= n_splits, shuffle=True, random_state = 42)

### metadata

In [7]:
Metadata = pd.read_csv("data/BASIC_metadata_full.csv", sep=',', low_memory=False)
Metadata = Metadata.drop(Metadata.columns[0], axis=1)

# convert timepoits to 0,1,2
Metadata.loc[Metadata.TimePoint == "Trimester2","TimePoint"] = 0 
Metadata.loc[Metadata.TimePoint == "Trimester3","TimePoint"] = 1
Metadata.loc[Metadata.TimePoint == "PostpartumWeek6","TimePoint"] = 2

# turn insufficient reads to NaN
i = Metadata[Metadata.ReadsNumber < 500000].index
Metadata.loc[i, 'ReadsNumber'] = np.nan

### species data

In [8]:
profile =pd.read_csv("data/Species_Profile_full.csv",sep=',',low_memory=False)

# extract all bacteria names
full_list_bacteria = list(profile.columns)[1:]

species = profile.to_numpy()[:,1:]

species_num = np.shape(species)[1] # 713 species

In [9]:
# Inner join profile and metadeata
merged_data_base = pd.merge(profile, Metadata, left_on='Sample_id', right_on='Sample_ID')

merged_data = merged_data_base.dropna(subset=['ReadsNumber'])[['Individual_ID', 'TimePoint', 'EPDS', 'Dichotomous_EPDS'] + full_list_bacteria]

In [10]:
# 1. Sample individuals whose EPDS != NaN at tp2
individuals_with_na_epds_at_tp2 = merged_data[
    (merged_data['TimePoint'] == 2) & (merged_data['EPDS'].isna())
]['Individual_ID'].unique()

data = merged_data[~merged_data['Individual_ID'].isin(individuals_with_na_epds_at_tp2)]

# 2. Sample individuals with data at tp0, tp1 and tp2
individuals_with_all_timepoints = data.groupby('Individual_ID').filter(lambda x: set(x['TimePoint']) >= {0, 1, 2})['Individual_ID'].unique()
data = data[data['Individual_ID'].isin(individuals_with_all_timepoints)]

In [11]:
# Remove features that have a value of 0 at all time points for all samples
columns_to_drop = []
for col in full_list_bacteria:
    if (data[col] == 0).all():
        columns_to_drop.append(col)
data = data.drop(columns=columns_to_drop)

# Update full_list_bacteria
full_list_bacteria = [col for col in full_list_bacteria if col not in columns_to_drop]

### （1）Non-CLR

### Comparison of Zero-value Replacement Methods

#### Replacement with half of the non-zero minimum value of each row
- **Effect description**: <span style="color:red;"> The effect after feature extraction was poor compared to replacing with column, but significant after applying oversampling.<span>
- **Applicable scenario**: Suitable for scenarios where the focus is on within-sample variation, such as comparing the relative abundances of different species within the same sample.
- **Advantages**: Maintains the internal structure of the sample and reduces the impact of inter-sample variation.
- **Disadvantages**: May lead to inconsistent replacement values across different samples, affecting cross-sample comparisons.

---

#### Replacement with half of the non-zero minimum value of each column
- **Effect description**: <span style="color:red;"> Oversampling fails to learn and generate new and better samples.<span>
- **Applicable scenario**: Applicable when the focus is on the distribution of features (e.g., species) across different samples.
- **Advantages**: Maintains the consistency of features across different samples, facilitating cross-sample comparisons.
- **Disadvantages**: May ignore the internal structure of the sample, affecting the analysis of within-sample variation.

---

#### Replacement with default value 1e-10
- **Effect description**: <span style="color:red;"> The effect after feature extraction was poor compared to replacing with column, and also oversampling fails to learn and generate new and better samples.<span>
- **Applicable scenario**: Suitable when zero values result from measurement limitations and the replacement value minimally impacts overall analysis, e.g., in preliminary data exploration.
- **Advantages**: imple to implement, no complex calculations or parameter estimations needed. Quickly handles zero values for further processing.
- **Disadvantages**: Ignores data characteristics like non - zero value distributions. Fixed replacement may not reflect true zero - valued data, affecting analysis accuracy.

In [12]:
# 1.Replace the zero value by 1/2 of the non - zero minimum value
data[full_list_bacteria] = data[full_list_bacteria].astype(float)

# Replace with 1/2 of the non-zero minimum value of the row
matrix = data[full_list_bacteria].values
for i in range(matrix.shape[0]):
    row = matrix[i, :]
    non_zero_values = row[~np.isnan(row) & (row > 0)]
    if len(non_zero_values) > 0:
        min_non_zero = np.min(non_zero_values)
        half_min = min_non_zero / 2
        row[row == 0] = half_min
    matrix[i, :] = row

data[full_list_bacteria] = matrix

In [13]:
sum_first_row_pandas = data[full_list_bacteria].iloc[0].sum()
print("使用 pandas 求和结果:", sum_first_row_pandas)

使用 pandas 求和结果: 100.01409000000001


In [14]:
# Normalization
for i in range(matrix.shape[0]):
    row = matrix[i, :]
    
    row_sum = np.sum(row)
    row = row / row_sum
    matrix[i, :] = row

data[full_list_bacteria] = matrix

In [15]:
# Turn data from [individual * tp, feature] into [individual, tp, feature]
grouped = data.groupby('Individual_ID')

transformed_data = []
labels = []

for individual_id, group in grouped:
    time_point_matrix = np.full((3, len(full_list_bacteria)), np.nan)

    for _, row in group.iterrows():
        time_point = int(row['TimePoint'])
        time_point_matrix[time_point] = row[full_list_bacteria].values

    tp2_row = group[group['TimePoint'] == 2]
    label = tp2_row['Dichotomous_EPDS'].values[0]

    transformed_data.append(time_point_matrix)
    labels.append(label)

transformed_data = np.array(transformed_data)
labels = np.array(labels)

In [16]:
print("minority class：", len(labels[labels == 1]))
print("majority class：", len(labels[labels == 0]))

minority class： 15
majority class： 70


In [17]:
# Build an RF model on the data to make predictions with only CLR.
# There are a lot of NAs in features
X = transformed_data[:, :2, :].reshape(transformed_data.shape[0], -1)
y = labels

metrics_list = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.5000 ± 0.0000
  Sensitivity: 0.0000 ± 0.0000
  Specificity: 1.0000 ± 0.0000
  AUC: 0.5643 ± 0.1945


### （2）Feature Extraction

In [18]:
# Null values in Dichotomous_EPDS at tp0, tp1 of data
extracted_data_base = data.dropna(subset=['Dichotomous_EPDS'])

In [19]:
for time_point in [0, 1]:
    time_point_data = extracted_data_base[extracted_data_base['TimePoint'] == time_point]
    
    X = time_point_data.drop(['Individual_ID', 'TimePoint', 'EPDS', 'Dichotomous_EPDS'], axis=1)
    y = time_point_data['Dichotomous_EPDS']

    selector = SelectKBest(score_func=f_classif, k = 3)  # Select the k most important features
    X_new = selector.fit_transform(X, y)

    feature_indices = selector.get_support(indices=True)
    feature_columns = X.columns[feature_indices]
    
    print(f"Selected features for TimePoint {time_point}: {feature_columns}")

    if time_point == 0:
        selected_features_tp0 = feature_columns
    elif time_point == 1:
        selected_features_tp1 = feature_columns
        
feature_columns = list(set(selected_features_tp0) | set(selected_features_tp1))[:10]

Selected features for TimePoint 0: Index(['Allisonella_histaminiformans', 'Lactococcus_lactis',
       'Enterococcus_faecium'],
      dtype='object')
Selected features for TimePoint 1: Index(['Enterococcus_faecalis', 'Allisonella_histaminiformans',
       'Roseburia_sp_CAG_303'],
      dtype='object')


In [21]:
# Get the data after feature selection
unused_bacteria_columns = [col for col in full_list_bacteria if col not in feature_columns]
others_column = data[unused_bacteria_columns].sum(axis=1).to_frame(name='Others')

data = data[['Individual_ID', 'TimePoint', 'EPDS', 'Dichotomous_EPDS'] + list(feature_columns)]
data = pd.concat([data, others_column], axis=1)

In [22]:
# CLR
columns_to_transform = [col for col in data.columns if col not in ['Individual_ID', 'TimePoint', 'EPDS', 'Dichotomous_EPDS']]
transformed_data = data[columns_to_transform]

clr_transformed = clr(transformed_data.values)
clr_transformed_df = pd.DataFrame(clr_transformed, columns=columns_to_transform, index=transformed_data.index)

data.loc[:, columns_to_transform] = clr_transformed_df

In [23]:
# Turn data from [individual * tp, feature] into [individual, tp, feature]
grouped = data.groupby('Individual_ID')

extracted_data = []
labels = []

for individual_id, group in grouped:
    time_point_matrix = np.full((3, len(feature_columns)), np.nan)

    for _, row in group.iterrows():
        time_point = int(row['TimePoint'])
        time_point_matrix[time_point] = row[feature_columns].values

    tp2_row = group[group['TimePoint'] == 2]
    label = tp2_row['Dichotomous_EPDS'].values[0]

    extracted_data.append(time_point_matrix)
    labels.append(label)

extracted_data = np.array(extracted_data)
labels = np.array(labels)

In [24]:
print("minority class：", len(labels[labels == 1]))
print("majority class：", len(labels[labels == 0]))

minority class： 15
majority class： 70


In [25]:
# Only feature selection, with a small increase in sensitivity
X = extracted_data[:, :2, :].reshape(extracted_data.shape[0], -1)
y = labels

metrics_list = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.5643 ± 0.2038
  Sensitivity: 0.2000 ± 0.4000
  Specificity: 0.9286 ± 0.0782
  AUC: 0.6048 ± 0.2007


### （3）Feature Extraction + OverSampling

In [26]:
# Data segmentation
# Use data containing all time points when oversampling
X_all = extracted_data[:, :, :].reshape(extracted_data.shape[0], -1)
# Only data from tp0 and tp1 are used for modeling.
X = extracted_data[:, :2, :].reshape(extracted_data.shape[0], -1)
y = labels

# Assuming the number of new samples generated
num_new_samples = 55 # 70-15 = 55

# Assume here that the minority class samples are originally in the shape (number of samples, 2, number of features), 
# and take this into account when recovering the shape
original_num_time_steps = 3
original_num_features = extracted_data.shape[2]

### cGAN

In [27]:
# Generate corresponding time point information for each sample
num_samples = X_all.shape[0]

time_points = np.tile(np.arange(original_num_time_steps), num_samples)
# Encoded for each time point
time_point_encoded = pd.get_dummies(time_points).values.reshape(num_samples, original_num_time_steps,
                                                                original_num_time_steps)
time_point_encoded_tensor = torch.tensor(time_point_encoded, dtype=torch.float32)


class Generator(nn.Module):
    def __init__(self, latent_dim, output_shape, time_dim):
        super(Generator, self).__init__()
        self.output_shape = output_shape
        self.time_dim = time_dim
        self.model = nn.Sequential(
            nn.Linear(latent_dim + time_dim * output_shape[0], 128),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(128, momentum=0.8),
            nn.Linear(128, 256),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(256, momentum=0.8),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(512, momentum=0.8),
            nn.Linear(512, np.prod(output_shape)),
            nn.Tanh(),
        )

    def forward(self, z, t):
        t = t.reshape(-1, self.time_dim * self.output_shape[0])
        zt = torch.cat([z, t], dim=1)
        output = self.model(zt)
        output = output.view(output.size(0), *self.output_shape)
        return output


class Discriminator(nn.Module):
    def __init__(self, input_shape, time_dim):
        super(Discriminator, self).__init__()
        self.input_shape = input_shape 
        self.time_dim = time_dim
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(np.prod(input_shape) + time_dim * input_shape[0], 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, x, t):
        x = x.reshape(-1, np.prod(self.input_shape))
        t = t.reshape(-1, self.time_dim * self.input_shape[0])
        xt = torch.cat([x, t], dim=1)
        return self.model(xt)


class tsGAN:
    def __init__(self, latent_dim, output_shape, lr=0.0002, b1=0.5, b2=0.999):
        self.latent_dim = latent_dim
        self.output_shape = output_shape
        self.time_dim = output_shape[0]

        self.generator = Generator(latent_dim, output_shape, self.time_dim)
        self.discriminator = Discriminator(output_shape, self.time_dim)

        self.optimizer_G = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))
        self.optimizer_D = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))

        self.loss_fn = nn.BCELoss()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    def train(self, X_train, epochs, batch_size):
        X_train = torch.tensor(X_train, dtype=torch.float32).to(self.device)
        time_points_train = time_point_encoded_tensor[torch.tensor(np.arange(len(X_train)))]
        time_points_train = time_points_train.to(self.device)

        dataset = TensorDataset(X_train, time_points_train)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            self.discriminator.train()
            self.generator.eval()

            for real_data, real_time in dataloader:
                real_labels = torch.ones(real_data.size(0), 1).to(self.device)
                noise = torch.randn(real_data.size(0), self.latent_dim).to(self.device)
                fake_data = self.generator(noise, real_time)
                fake_labels = torch.zeros(real_data.size(0), 1).to(self.device)

                # Calculation of discriminator loss
                self.optimizer_D.zero_grad()
                real_loss = self.loss_fn(self.discriminator(real_data, real_time), real_labels)
                fake_loss = self.loss_fn(self.discriminator(fake_data.detach(), real_time), fake_labels)
                d_loss = (real_loss + fake_loss) / 2
                d_loss.backward()
                self.optimizer_D.step()

            self.discriminator.eval()
            self.generator.train()

            for real_data, real_time in dataloader:
                noise = torch.randn(real_data.size(0), self.latent_dim).to(self.device)
                fake_data = self.generator(noise, real_time)
                valid_labels = torch.ones(real_data.size(0), 1).to(self.device)

                # Calculate the generator loss
                self.optimizer_G.zero_grad()
                g_loss = self.loss_fn(self.discriminator(fake_data, real_time), valid_labels)
                g_loss.backward()
                self.optimizer_G.step()

            if epoch % 1000 == 0:
                print(f"[Epoch {epoch}/{epochs}] [D loss: {d_loss.item()}] [G loss: {g_loss.item()}]")

    def generate_samples(self, num_samples):
        self.generator.eval()
        new_time_points = np.tile(np.arange(self.time_dim), num_samples)
        new_time_points_encoded = pd.get_dummies(new_time_points).values.reshape(num_samples, self.time_dim,
                                                                                self.time_dim)
        new_time_points_encoded_tensor = torch.tensor(new_time_points_encoded, dtype=torch.float32).to(self.device)
        noise = torch.randn(num_samples, self.latent_dim).to(self.device)
        with torch.no_grad():
            fake_data = self.generator(noise, new_time_points_encoded_tensor)
        return fake_data.cpu().numpy()


metrics_list = []

# Initialize tsGAN
latent_dim = 100
output_shape = (original_num_time_steps, original_num_features)
tsgan = tsGAN(latent_dim, output_shape)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y)):
    X_train_all, X_val_all = X_all[train_idx], X_all[val_idx]
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train tsGAN with all data (both y=0 and y=1)
    tsgan.train(X_train_all, epochs=500, batch_size=16)

    # Generate new samples using only data with y=1
    minority_class = 1
    minority_indices = np.where(y_train == minority_class)[0]
    minority_X_all = X_train_all[minority_indices]

    # Generate new samples
    new_samples_np = tsgan.generate_samples(num_new_samples)
    new_samples_np = new_samples_np.reshape(num_new_samples, -1)

    # Merge
    combined_X_train_all = np.concatenate([X_train_all, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, np.full(num_new_samples, minority_class)], axis=0)

    combined_X_train = combined_X_train_all[:, :2 * original_num_features]

    # Training a Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(combined_X_train, combined_y_train)

    # Evaluating models on validation sets
    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

[Epoch 0/500] [D loss: 0.5122999548912048] [G loss: 0.6745175123214722]
[Epoch 0/500] [D loss: 1.0727481821959373e-05] [G loss: 10.903702735900879]
[Epoch 0/500] [D loss: 2.2065196390030906e-06] [G loss: 12.780921936035156]
[Epoch 0/500] [D loss: 5.421073865363724e-07] [G loss: 14.312573432922363]
[Epoch 0/500] [D loss: 7.58870370987097e-08] [G loss: 15.701498031616211]
Average Results:
  Balanced_accuracy: 0.6310 ± 0.1774
  Sensitivity: 0.3333 ± 0.3651
  Specificity: 0.9286 ± 0.0452
  AUC: 0.6548 ± 0.1737


### GMMs

The code does not consider time dependency.

Modeling by time points: GMM modeling is performed on the data at different time points separately, and then when generating new samples, the corresponding models are selected to be sampled according to the time points, and then combined to form a complete sample.

In [28]:
metrics_list = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y)):
    X_train_all, X_val_all = X_all[train_idx], X_all[val_idx]
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_train_all)  

    minority_class = 1
    minority_indices = np.where(y_train == minority_class)[0]

    new_samples_flattened, _ = gmm.sample(num_new_samples)

    new_samples_np = new_samples_flattened.reshape(num_new_samples, original_num_time_steps, original_num_features)
    new_samples_np = new_samples_np.reshape(num_new_samples, -1)  # To test the validity of a new sample.

    combined_X_train_all = np.concatenate([X_train_all, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, np.full(num_new_samples, minority_class)], axis=0)

    combined_X_train = combined_X_train_all[:, :2 * original_num_features]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(combined_X_train, combined_y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.5905 ± 0.1789
  Sensitivity: 0.4667 ± 0.4000
  Specificity: 0.7143 ± 0.0782
  AUC: 0.6310 ± 0.1758


### CVAEs

For each sample, the time point information is uniquely encoded (pd.get_dummies) to obtain time_point_encoded and converted to the PyTorch tensor time_point_encoded_tensor.

The discrete-time parameters are added to the CVAE model by splicing the time-point encoded information with the input data and category information (torch.cat) as input to the model during the encode and decode process.

In [29]:
metrics_list = []


class CVAE(nn.Module):
    def __init__(self, input_dim, latent_dim, cond_dim, time_dim):
        super(CVAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim * original_num_time_steps + cond_dim + time_dim * original_num_time_steps, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU()
        )
        self.fc_mu = nn.Linear(128, latent_dim)
        self.fc_logvar = nn.Linear(128, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim + cond_dim + time_dim * original_num_time_steps, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim * original_num_time_steps),
            nn.Sigmoid()
        )

    def encode(self, x, c, t):
        x = x.reshape(x.size(0), -1)
        t = t.reshape(t.size(0), -1)
        xct = torch.cat([x, c, t], dim=1)
        h = self.encoder(xct)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, c, t):
        t = t.reshape(t.size(0), -1)
        zct = torch.cat([z, c, t], dim=1)
        recon_x = self.decoder(zct)
        recon_x = recon_x.reshape(recon_x.size(0), original_num_time_steps, -1)
        return recon_x

    def forward(self, x, c, t):
        mu, logvar = self.encode(x, c, t)
        z = self.reparameterize(mu, logvar)
        recon_x = self.decode(z, c, t)
        return recon_x, mu, logvar


# hyperparameterization based on grid search
input_dim = original_num_features
latent_dim = 5
cond_dim = len(np.unique(y))
time_dim = original_num_time_steps
batch_size = 16
epochs = 100
learning_rate = 0.01

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y)):
    X_train_all, X_val_all = X_all[train_idx], X_all[val_idx]
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    all_X = X_train_all.reshape(-1, original_num_time_steps, original_num_features)
    all_y = y_train

    time_points = np.tile(np.arange(original_num_time_steps), len(all_X))
    time_point_encoded = pd.get_dummies(time_points).values.reshape(-1, original_num_time_steps, original_num_time_steps)

    # Convert to PyTorch tensor
    all_X_tensor = torch.tensor(all_X, dtype=torch.float32)
    all_y_tensor = torch.tensor(all_y, dtype=torch.long)
    time_point_encoded_tensor = torch.tensor(time_point_encoded, dtype=torch.float32)

    # Creating a DataLoader
    dataset = TensorDataset(all_X_tensor, all_y_tensor, time_point_encoded_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initializing the model, optimizer, and loss function
    model = CVAE(input_dim, latent_dim, cond_dim, time_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_X, batch_y, batch_t in dataloader:
            batch_c = torch.nn.functional.one_hot(batch_y, num_classes=cond_dim).float()
            optimizer.zero_grad()
            recon_x, mu, logvar = model(batch_X, batch_c, batch_t)

            recon_loss = criterion(recon_x, batch_X)
            kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

            loss = recon_loss + kl_divergence

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(dataloader)}')

    # Generate new samples using only data with y=1
    minority_class = 1
    minority_indices = np.where(y_train == minority_class)[0]
    minority_X = X_train_all[minority_indices].reshape(-1, original_num_time_steps, original_num_features)
    minority_y = y_train[minority_indices]

    # Generate new samples
    new_samples_z = torch.randn(num_new_samples, latent_dim)
    new_samples_c = torch.ones(num_new_samples, dtype=torch.long) * minority_class
    new_samples_c = torch.nn.functional.one_hot(new_samples_c, num_classes=cond_dim).float()

    new_time_points = np.tile(np.arange(original_num_time_steps), num_new_samples)
    new_time_points_encoded = pd.get_dummies(new_time_points).values.reshape(num_new_samples, original_num_time_steps,
                                                                            original_num_time_steps)
    new_time_points_encoded_tensor = torch.tensor(new_time_points_encoded, dtype=torch.float32)

    new_samples = []
    with torch.no_grad():
        for i in range(num_new_samples):
            sample = model.decode(new_samples_z[i].unsqueeze(0), new_samples_c[i].unsqueeze(0),
                                  new_time_points_encoded_tensor[i].unsqueeze(0))
            new_samples.append(sample.numpy())

    new_samples_np = np.concatenate(new_samples, axis=0).reshape(num_new_samples, -1)

    combined_X_train_all = np.concatenate([X_train_all, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, np.full(num_new_samples, minority_class)], axis=0)

    combined_X_train = combined_X_train_all[:, :2 * original_num_features]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(combined_X_train, combined_y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.6167 ± 0.1797
  Sensitivity: 0.3333 ± 0.3651
  Specificity: 0.9000 ± 0.0728
  AUC: 0.6595 ± 0.1720


### Kernel density estimation

Time-point weighted kernel function: When performing kernel density estimation, different weights are assigned to data at different time points. For example, a higher weight is given to data at a recent point in time and a lower weight to data at a distant point in time, by adjusting the weights to reflect the difference in importance of the points in time.

In [30]:
metrics_list = []
np.random.seed(0)

# Time Points weight
weights = np.linspace(0.5, 1, original_num_time_steps)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y)):
    X_train_all, X_val_all = X_all[train_idx], X_all[val_idx]
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train KDE with all data (both y=0 and y=1)
    all_X = X_train_all.reshape(-1, original_num_time_steps, original_num_features)
    all_y = y_train

    # Generate new samples using only data with y=1
    minority_class = 1
    minority_indices = np.where(y_train == minority_class)[0]
    minority_X = X_train_all[minority_indices].reshape(-1, original_num_time_steps, original_num_features)

    new_samples = []

    # Kernel density estimation and sampling of data for each time point
    for time_point in range(original_num_time_steps):
        time_point_data = all_X[:, time_point, :]

        # Multiply the data for certain tp by the corresponding weights
        weighted_time_point_data = time_point_data * weights[time_point]

        # Perform kernel density estimation
        kde = gaussian_kde(weighted_time_point_data.T)

        minority_time_point_data = minority_X[:, time_point, :]
        weighted_minority_time_point_data = minority_time_point_data * weights[time_point]

        # Generate new samples by sampling from the distribution of kernel density estimates
        new_samples_time_point_flattened = kde.resample(num_new_samples).T / weights[time_point]

        new_samples.append(new_samples_time_point_flattened)

    new_samples = np.stack(new_samples, axis=1).reshape(num_new_samples, -1)

    combined_X_train_all = np.concatenate([X_train_all, new_samples], axis=0)
    combined_y_train = np.concatenate([y_train, np.full(num_new_samples, minority_class)], axis=0)

    combined_X_train = combined_X_train_all[:, :2 * original_num_features]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(combined_X_train, combined_y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.6381 ± 0.1725
  Sensitivity: 0.5333 ± 0.3399
  Specificity: 0.7429 ± 0.0728
  AUC: 0.6976 ± 0.1969


### Dirichlet Distribution + Bayesian Inference

Time-point conditional sampling: when generating new samples from a sampling from the Dirichlet distribution, the probability distribution of the samples is adjusted depending on the time point. Sampling is performed at different time steps using different parameters of the Delikeray distribution.

In [31]:
metrics_list = []
np.random.seed(0)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y)):
    X_train_all, X_val_all = X_all[train_idx], X_all[val_idx]
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    all_X = X_train_all.reshape(-1, original_num_time_steps, original_num_features)
    all_y = y_train

    minority_class = 1
    minority_indices = np.where(y_train == minority_class)[0]
    minority_X = X_train_all[minority_indices].reshape(-1, original_num_time_steps, original_num_features)

    new_samples = []

    for time_point in range(original_num_time_steps):
        time_point_data = all_X[:, time_point, :]

        # Calculate the mean and standard deviation of each feature for Bayesian inference to adjust the Dirichlet parameters
        feature_means = np.mean(time_point_data, axis=0)
        feature_stds = np.std(time_point_data, axis=0)

        # Adjust the parameters of the Dirichlet distribution based on Bayesian inference
        # Ensure that all elements in the alpha array are greater than 0
        alpha = np.maximum(feature_means + 1, 1e-8)  # Minimal：1e-8

        minority_time_point_data = minority_X[:, time_point, :]

        #  Sampling from the Delicacy distribution to generate new samples
        samples_at_time_point = np.random.dirichlet(alpha, num_new_samples)

        new_samples.append(samples_at_time_point)

    new_samples = np.stack(new_samples, axis=1).reshape(num_new_samples, -1)

    combined_X_train_all = np.concatenate([X_train_all, new_samples], axis=0)
    combined_y_train = np.concatenate([y_train, np.full(num_new_samples, minority_class)], axis=0)

    combined_X_train = combined_X_train_all[:, :2 * original_num_features]

    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    rf.fit(combined_X_train, combined_y_train)

    y_pred = rf.predict(X_val)
    y_pred_proba = rf.predict_proba(X_val)[:, 1]

    metrics = evaluate_model(y_val, y_pred, y_pred_proba)
    metrics_list.append(metrics)

avg_metrics = {
    'Balanced_accuracy': np.mean([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.mean([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.mean([m['Specificity'] for m in metrics_list]),
    'AUC': np.mean([m['AUC'] for m in metrics_list]),
}

std_metrics = {
    'Balanced_accuracy': np.std([m['Balanced_accuracy'] for m in metrics_list]),
    'Sensitivity': np.std([m['Sensitivity'] for m in metrics_list]),
    'Specificity': np.std([m['Specificity'] for m in metrics_list]),
    'AUC': np.std([m['AUC'] for m in metrics_list]),
}

print("Average Results:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.6167 ± 0.1643
  Sensitivity: 0.3333 ± 0.3651
  Specificity: 0.9000 ± 0.0571
  AUC: 0.6548 ± 0.1745
