Based on the full data subset A with no missing data, the full samples for y=0 and y=1 are learned and new samples are generated using the data and labels from tp2.

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from sklearn.decomposition import PCA
from torch.utils.data import TensorDataset, DataLoader
from skbio.stats.composition import clr, alr, ilr
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
from scipy.stats.mstats import gmean
from scipy.stats import gaussian_kde
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
def evaluate_model(y_true, y_pred, y_pred_proba):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) / 1 (positive).
    y_pred : array-like
        Hard class predictions, same encoding as ``y_true``.
    y_pred_proba : array-like
        Scores for the positive class, used only for the AUC.

    Returns
    -------
    dict
        Keys ``'Balanced_accuracy'``, ``'Sensitivity'``, ``'Specificity'``
        and ``'AUC'``.

    Notes
    -----
    The AUC is undefined when ``y_true`` holds a single class; how
    ``roc_auc_score`` reports that is left to scikit-learn.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a fold
    # in which y_true and y_pred contain a single class yields a 1x1 matrix and
    # the four-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    sensitivity = recall_score(y_true, y_pred)
    # Guard the 0/0 case (no negative samples) instead of relying on a
    # RuntimeWarning-producing numpy division.
    negatives = tn + fp
    specificity = tn / negatives if negatives > 0 else float("nan")
    balanced_accuracy = (sensitivity + specificity) / 2
    auc = roc_auc_score(y_true, y_pred_proba)

    return {
        'Balanced_accuracy': balanced_accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'AUC': auc
    }

In [3]:
# Opt in to pandas' future "no silent downcasting" behaviour for the whole session.
pd.set_option('future.no_silent_downcasting', True)

In [4]:
# Export a fixed hash seed through the environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this
# presumably only affects processes launched after this line, not the running
# kernel — confirm this matches the intent.
os.environ["PYTHONHASHSEED"] = "0"

In [5]:
# Silence every UserWarning, plus FutureWarnings raised from modules matching "sklearn.base".
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.base")

In [6]:
# Cross-validation setup.
# Stratified K-fold is used so that:
#   1. both the training and the validation split contain positive samples;
#   2. the class ratio in each split stays close to that of the full dataset.

n_splits = 5   # number of folds
num_seeds = 5  # number of classifier seeds evaluated per fold
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

### metadata

In [7]:
# Load the per-sample metadata and discard the first CSV column.
# NOTE(review): the dropped column is presumably a saved row index — confirm.
Metadata = pd.read_csv("data/BASIC_metadata_full.csv", sep=',', low_memory=False)
Metadata = Metadata.drop(Metadata.columns[0], axis=1)

# Convert time points to 0, 1, 2; any label not listed here is left unchanged.
timepoint_codes = {"Trimester2": 0, "Trimester3": 1, "PostpartumWeek6": 2}
for timepoint_label, timepoint_code in timepoint_codes.items():
    Metadata.loc[Metadata["TimePoint"] == timepoint_label, "TimePoint"] = timepoint_code

# Turn insufficient read counts into NaN so those samples can be dropped later.
# Series.mask returns a properly upcast column, so this does not rely on
# assigning NaN into a possibly-integer column through .loc (silent upcasting
# on assignment is deprecated in recent pandas).
min_reads = 500000
Metadata["ReadsNumber"] = Metadata["ReadsNumber"].mask(Metadata["ReadsNumber"] < min_reads)

### species data

In [8]:
# Species abundance table; the first column is skipped below
# (NOTE(review): presumably the sample identifier — confirm against the file).
profile = pd.read_csv("data/Species_Profile_full.csv", sep=',', low_memory=False)

# Names of all bacteria: every column header after the first
full_list_bacteria = profile.columns[1:].tolist()

# Abundance values without the first column
species = profile.to_numpy()[:, 1:]

# Number of species columns (713 in the original run)
species_num = species.shape[1]

In [9]:
# Inner join of the species profile and the metadata on the sample identifier
merged_data_base = profile.merge(Metadata, left_on='Sample_id', right_on='Sample_ID')

# Keep only samples whose read count is present, restricted to the
# identifier, time point, outcome and species columns.
kept_columns = ['Individual_ID', 'TimePoint', 'EPDS', 'Dichotomous_EPDS'] + full_list_bacteria
has_reads = merged_data_base['ReadsNumber'].notna()
merged_data = merged_data_base[has_reads][kept_columns]

In [10]:
# Step 1: exclude every individual whose EPDS score is missing at time point 2
missing_tp2_epds = (merged_data['TimePoint'] == 2) & merged_data['EPDS'].isna()
individuals_with_na_epds_at_tp2 = merged_data.loc[missing_tp2_epds, 'Individual_ID'].unique()

has_tp2_epds = ~merged_data['Individual_ID'].isin(individuals_with_na_epds_at_tp2)
data = merged_data[has_tp2_epds]


def _has_all_timepoints(group):
    """Return True when the individual's rows cover time points 0, 1 and 2."""
    return {0, 1, 2} <= set(group['TimePoint'])


# Step 2: keep only individuals observed at tp0, tp1 and tp2
complete_rows = data.groupby('Individual_ID').filter(_has_all_timepoints)
individuals_with_all_timepoints = complete_rows['Individual_ID'].unique()
data = data[data['Individual_ID'].isin(individuals_with_all_timepoints)]

In [11]:
# Species that are zero in every remaining row (all samples, all time points) are removed.
columns_to_drop = [col for col in full_list_bacteria if (data[col] == 0).all()]
data = data.drop(columns=columns_to_drop)

# Keep full_list_bacteria in sync with the remaining columns (original order preserved)
dropped_species = set(columns_to_drop)
full_list_bacteria = [col for col in full_list_bacteria if col not in dropped_species]

### Zero-Replacing

### Comparison of Zero-value Replacement Methods

#### Replacement with half of the non-zero minimum value of each row
- **Effect description**: <span style="color:red;">After feature extraction, performance was worse than with column-based replacement, but the effect became significant once oversampling was applied.</span>
- **Applicable scenario**: Suitable for scenarios where the focus is on within-sample variation, such as comparing the relative abundances of different species within the same sample.
- **Advantages**: Maintains the internal structure of the sample and reduces the impact of inter-sample variation.
- **Disadvantages**: May lead to inconsistent replacement values across different samples, affecting cross-sample comparisons.

---

#### Replacement with half of the non-zero minimum value of each column
- **Effect description**: <span style="color:red;">Oversampling fails to learn the data and does not generate new, better samples.</span>
- **Applicable scenario**: Applicable when the focus is on the distribution of features (e.g., species) across different samples.
- **Advantages**: Maintains the consistency of features across different samples, facilitating cross-sample comparisons.
- **Disadvantages**: May ignore the internal structure of the sample, affecting the analysis of within-sample variation.

---

#### Replacement with default value 1e-10
- **Effect description**: <span style="color:red;">After feature extraction, performance was worse than with column-based replacement, and oversampling also fails to learn the data and generate new, better samples.</span>
- **Applicable scenario**: Suitable when zero values result from measurement limitations and the replacement value minimally impacts overall analysis, e.g., in preliminary data exploration.
- **Advantages**: Simple to implement; no complex calculations or parameter estimation needed. Quickly handles zero values for further processing.
- **Disadvantages**: Ignores data characteristics such as the distribution of non-zero values. A fixed replacement may not reflect the true zero-valued data, affecting analysis accuracy.

To isolate the effect of zero-value replacement, oversampling is performed with a GMM as the base generator, without any feature selection or data transformation.

#### (0) No Replacing

In [12]:
# Cast the abundance columns to float before normalising
data[full_list_bacteria] = data[full_list_bacteria].astype(float)

# Baseline: zeros are NOT replaced here; the abundances are only normalised.
new_data = data.copy()
matrix = new_data[full_list_bacteria].values

# Normalization: rescale every sample (row) so that its abundances sum to 1
for idx, sample_row in enumerate(matrix):
    matrix[idx] = sample_row / sample_row.sum()

new_data[full_list_bacteria] = matrix

In [13]:
# Reshape from one row per (individual, time point) to an array [individual, tp, feature]
grouped = new_data.groupby('Individual_ID')
n_features = len(full_list_bacteria)

transformed_data = []
labels = []

for individual_id, group in grouped:
    # One row per time point (0, 1, 2); rows stay NaN if no sample fills them
    time_point_matrix = np.full((3, n_features), np.nan)
    abundances = group[full_list_bacteria].to_numpy()
    for time_point, values in zip(group['TimePoint'], abundances):
        time_point_matrix[int(time_point)] = values

    # The label is the dichotomised EPDS of the first row recorded at time point 2
    at_tp2 = group['TimePoint'] == 2
    labels.append(group.loc[at_tp2, 'Dichotomous_EPDS'].values[0])
    transformed_data.append(time_point_matrix)

transformed_data = np.array(transformed_data)
labels = np.array(labels)

In [14]:
# Predictors use the first two time points only; time point 2 supplies the label.
original_num_time_steps = 2
original_num_features = transformed_data.shape[2]

# Flatten [individual, tp, feature] -> [individual, tp * feature]
X = transformed_data[:, :original_num_time_steps, :].reshape(len(transformed_data), -1)
y = labels

individual_ids = np.unique(new_data['Individual_ID'])

# Number of synthetic samples to generate (70 - 15 = 55).
# NOTE(review): presumably majority count minus minority count — confirm.
num_new_samples = 55

In [15]:
# Stratified K-fold evaluation with GMM oversampling of the positive class.
# The GMM is fitted on the positive rows of each training fold only; the
# validation fold is scored without any synthetic data.
# NOTE(review): num_new_samples is a fixed count, not derived from the class
# sizes of the individual fold — confirm this is intended.
metric_names = ('Balanced_accuracy', 'Sensitivity', 'Specificity', 'AUC')
metrics_list = []
minority_class = 1

for train_idx, val_idx in skf.split(individual_ids, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the generative model on the minority rows of this training fold
    X_train_minority = X_train[y_train == minority_class]
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_train_minority)

    # Draw synthetic rows. The round trip through (sample, time step, feature)
    # leaves the values unchanged and only checks the expected width.
    new_samples_flattened, _ = gmm.sample(num_new_samples)
    n_generated = new_samples_flattened.shape[0]
    new_samples_np = (
        new_samples_flattened
        .reshape(n_generated, original_num_time_steps, original_num_features)
        .reshape(n_generated, -1)
    )

    # Append the synthetic positives to the training fold
    synthetic_labels = np.full(n_generated, minority_class)
    combined_X_train = np.concatenate([X_train, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, synthetic_labels], axis=0)

    # One forest per seed, each scored on the same validation fold
    for seed in range(num_seeds):
        rf = RandomForestClassifier(n_estimators=100, random_state=seed, class_weight="balanced")
        rf.fit(combined_X_train, combined_y_train)

        y_pred = rf.predict(X_val)
        y_pred_proba = rf.predict_proba(X_val)[:, 1]
        metrics_list.append(evaluate_model(y_val, y_pred, y_pred_proba))

# Mean and (population) standard deviation over all fold x seed runs
avg_metrics = {name: np.mean([m[name] for m in metrics_list]) for name in metric_names}
std_metrics = {name: np.std([m[name] for m in metrics_list]) for name in metric_names}

print("Average Results:")
for metric in metric_names:
    print(f"  {metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.5000 ± 0.0000
  Sensitivity: 0.0000 ± 0.0000
  Specificity: 1.0000 ± 0.0000
  AUC: 0.3876 ± 0.2099


#### (1) Based on Row

In [31]:
# Choice 1: based on row
# Every zero in a sample (row) becomes half of that sample's smallest positive value.
new_data = data.copy()
matrix = new_data[full_list_bacteria].values

# NaN compares False against 0, so missing values are ignored when looking for the minimum
positive = matrix > 0
row_has_positive = positive.any(axis=1)
half_min_per_row = np.where(positive, matrix, np.inf).min(axis=1) / 2

# Rows without any positive entry are left untouched; the update is done in place
zero_rows, zero_cols = np.nonzero((matrix == 0) & row_has_positive[:, np.newaxis])
matrix[zero_rows, zero_cols] = half_min_per_row[zero_rows]

In [32]:
# Normalization: rescale every sample (row) so that its abundances sum to 1
for idx, sample_row in enumerate(matrix):
    matrix[idx] = sample_row / sample_row.sum()

# Write the processed abundances back into the frame
new_data[full_list_bacteria] = matrix

In [33]:
# Reshape from one row per (individual, time point) to an array [individual, tp, feature]
grouped = new_data.groupby('Individual_ID')
n_features = len(full_list_bacteria)

transformed_data = []
labels = []

for individual_id, group in grouped:
    # One row per time point (0, 1, 2); rows stay NaN if no sample fills them
    time_point_matrix = np.full((3, n_features), np.nan)
    abundances = group[full_list_bacteria].to_numpy()
    for time_point, values in zip(group['TimePoint'], abundances):
        time_point_matrix[int(time_point)] = values

    # The label is the dichotomised EPDS of the first row recorded at time point 2
    at_tp2 = group['TimePoint'] == 2
    labels.append(group.loc[at_tp2, 'Dichotomous_EPDS'].values[0])
    transformed_data.append(time_point_matrix)

transformed_data = np.array(transformed_data)
labels = np.array(labels)

In [34]:
# Predictors use the first two time points only; time point 2 supplies the label.
original_num_time_steps = 2
original_num_features = transformed_data.shape[2]

# Flatten [individual, tp, feature] -> [individual, tp * feature]
X = transformed_data[:, :original_num_time_steps, :].reshape(len(transformed_data), -1)
y = labels

individual_ids = np.unique(new_data['Individual_ID'])

# Number of synthetic samples to generate (70 - 15 = 55).
# NOTE(review): presumably majority count minus minority count — confirm.
num_new_samples = 55

In [35]:
# Stratified K-fold baseline: random forest on the original training fold,
# without any synthetic samples.
metric_names = ('Balanced_accuracy', 'Sensitivity', 'Specificity', 'AUC')
metrics_list = []

for train_idx, val_idx in skf.split(individual_ids, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # One forest per seed, each scored on the same validation fold
    for seed in range(num_seeds):
        rf = RandomForestClassifier(n_estimators=100, random_state=seed, class_weight="balanced")
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        y_pred_proba = rf.predict_proba(X_val)[:, 1]
        metrics_list.append(evaluate_model(y_val, y_pred, y_pred_proba))

# Mean and (population) standard deviation over all fold x seed runs
avg_metrics = {name: np.mean([m[name] for m in metrics_list]) for name in metric_names}
std_metrics = {name: np.std([m[name] for m in metrics_list]) for name in metric_names}

print("Average Results:")
for metric in metric_names:
    print(f"  {metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.5000 ± 0.0000
  Sensitivity: 0.0000 ± 0.0000
  Specificity: 1.0000 ± 0.0000
  AUC: 0.5871 ± 0.1759


In [20]:
# Stratified K-fold evaluation with GMM oversampling of the positive class.
# The GMM is fitted on the positive rows of each training fold only; the
# validation fold is scored without any synthetic data.
# NOTE(review): num_new_samples is a fixed count, not derived from the class
# sizes of the individual fold — confirm this is intended.
metric_names = ('Balanced_accuracy', 'Sensitivity', 'Specificity', 'AUC')
metrics_list = []
minority_class = 1

for train_idx, val_idx in skf.split(individual_ids, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the generative model on the minority rows of this training fold
    X_train_minority = X_train[y_train == minority_class]
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_train_minority)

    # Draw synthetic rows. The round trip through (sample, time step, feature)
    # leaves the values unchanged and only checks the expected width.
    new_samples_flattened, _ = gmm.sample(num_new_samples)
    n_generated = new_samples_flattened.shape[0]
    new_samples_np = (
        new_samples_flattened
        .reshape(n_generated, original_num_time_steps, original_num_features)
        .reshape(n_generated, -1)
    )

    # Append the synthetic positives to the training fold
    synthetic_labels = np.full(n_generated, minority_class)
    combined_X_train = np.concatenate([X_train, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, synthetic_labels], axis=0)

    # One forest per seed, each scored on the same validation fold
    for seed in range(num_seeds):
        rf = RandomForestClassifier(n_estimators=100, random_state=seed, class_weight="balanced")
        rf.fit(combined_X_train, combined_y_train)

        y_pred = rf.predict(X_val)
        y_pred_proba = rf.predict_proba(X_val)[:, 1]
        metrics_list.append(evaluate_model(y_val, y_pred, y_pred_proba))

# Mean and (population) standard deviation over all fold x seed runs
avg_metrics = {name: np.mean([m[name] for m in metrics_list]) for name in metric_names}
std_metrics = {name: np.std([m[name] for m in metrics_list]) for name in metric_names}

print("Average Results:")
for metric in metric_names:
    print(f"  {metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.4990 ± 0.0397
  Sensitivity: 0.0667 ± 0.1333
  Specificity: 0.9314 ± 0.0892
  AUC: 0.5719 ± 0.1289


#### (2) Based on Column

In [21]:
# Choice 2: based on column
# Every zero in a species (column) becomes half of that column's smallest
# positive value; a column with no positive value falls back to 1e-10.
new_data = data.copy()
matrix = new_data[full_list_bacteria].values

# NaN compares False against 0, so missing values are ignored when looking for the minimum
positive = matrix > 0
col_has_positive = positive.any(axis=0)
half_min_per_col = np.where(positive, matrix, np.inf).min(axis=0) / 2
fill_per_col = np.where(col_has_positive, half_min_per_col, 1e-10)

# Replace the zeros in place with the fill value of their column
zero_rows, zero_cols = np.nonzero(matrix == 0)
matrix[zero_rows, zero_cols] = fill_per_col[zero_cols]

In [22]:
# Normalization: rescale every sample (row) so that its abundances sum to 1
for idx, sample_row in enumerate(matrix):
    matrix[idx] = sample_row / sample_row.sum()

# Write the processed abundances back into the frame
new_data[full_list_bacteria] = matrix

In [23]:
# Reshape from one row per (individual, time point) to an array [individual, tp, feature]
grouped = new_data.groupby('Individual_ID')
n_features = len(full_list_bacteria)

transformed_data = []
labels = []

for individual_id, group in grouped:
    # One row per time point (0, 1, 2); rows stay NaN if no sample fills them
    time_point_matrix = np.full((3, n_features), np.nan)
    abundances = group[full_list_bacteria].to_numpy()
    for time_point, values in zip(group['TimePoint'], abundances):
        time_point_matrix[int(time_point)] = values

    # The label is the dichotomised EPDS of the first row recorded at time point 2
    at_tp2 = group['TimePoint'] == 2
    labels.append(group.loc[at_tp2, 'Dichotomous_EPDS'].values[0])
    transformed_data.append(time_point_matrix)

transformed_data = np.array(transformed_data)
labels = np.array(labels)

In [24]:
# Predictors use the first two time points only; time point 2 supplies the label.
original_num_time_steps = 2
original_num_features = transformed_data.shape[2]

# Flatten [individual, tp, feature] -> [individual, tp * feature]
X = transformed_data[:, :original_num_time_steps, :].reshape(len(transformed_data), -1)
y = labels

individual_ids = np.unique(new_data['Individual_ID'])

# Number of synthetic samples to generate (70 - 15 = 55).
# NOTE(review): presumably majority count minus minority count — confirm.
num_new_samples = 55

In [25]:
# Stratified K-fold evaluation with GMM oversampling of the positive class.
# The GMM is fitted on the positive rows of each training fold only; the
# validation fold is scored without any synthetic data.
# NOTE(review): num_new_samples is a fixed count, not derived from the class
# sizes of the individual fold — confirm this is intended.
metric_names = ('Balanced_accuracy', 'Sensitivity', 'Specificity', 'AUC')
metrics_list = []
minority_class = 1

for train_idx, val_idx in skf.split(individual_ids, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the generative model on the minority rows of this training fold
    X_train_minority = X_train[y_train == minority_class]
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_train_minority)

    # Draw synthetic rows. The round trip through (sample, time step, feature)
    # leaves the values unchanged and only checks the expected width.
    new_samples_flattened, _ = gmm.sample(num_new_samples)
    n_generated = new_samples_flattened.shape[0]
    new_samples_np = (
        new_samples_flattened
        .reshape(n_generated, original_num_time_steps, original_num_features)
        .reshape(n_generated, -1)
    )

    # Append the synthetic positives to the training fold
    synthetic_labels = np.full(n_generated, minority_class)
    combined_X_train = np.concatenate([X_train, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, synthetic_labels], axis=0)

    # One forest per seed, each scored on the same validation fold
    for seed in range(num_seeds):
        rf = RandomForestClassifier(n_estimators=100, random_state=seed, class_weight="balanced")
        rf.fit(combined_X_train, combined_y_train)

        y_pred = rf.predict(X_val)
        y_pred_proba = rf.predict_proba(X_val)[:, 1]
        metrics_list.append(evaluate_model(y_val, y_pred, y_pred_proba))

# Mean and (population) standard deviation over all fold x seed runs
avg_metrics = {name: np.mean([m[name] for m in metrics_list]) for name in metric_names}
std_metrics = {name: np.std([m[name] for m in metrics_list]) for name in metric_names}

print("Average Results:")
for metric in metric_names:
    print(f"  {metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.4829 ± 0.0178
  Sensitivity: 0.0000 ± 0.0000
  Specificity: 0.9657 ± 0.0357
  AUC: 0.4005 ± 0.1263


#### (3) Default Value

In [26]:
# Choice 3: set a default
# Every zero abundance is replaced by the same small constant.
new_data = data.copy()
matrix = new_data[full_list_bacteria].values
default_value = 1e-10

# Single in-place masked assignment over the whole matrix
matrix[matrix == 0] = default_value

In [27]:
# Normalization: rescale every sample (row) so that its abundances sum to 1
for idx, sample_row in enumerate(matrix):
    matrix[idx] = sample_row / sample_row.sum()

# Write the processed abundances back into the frame
new_data[full_list_bacteria] = matrix

In [28]:
# Reshape from one row per (individual, time point) to an array [individual, tp, feature]
grouped = new_data.groupby('Individual_ID')
n_features = len(full_list_bacteria)

transformed_data = []
labels = []

for individual_id, group in grouped:
    # One row per time point (0, 1, 2); rows stay NaN if no sample fills them
    time_point_matrix = np.full((3, n_features), np.nan)
    abundances = group[full_list_bacteria].to_numpy()
    for time_point, values in zip(group['TimePoint'], abundances):
        time_point_matrix[int(time_point)] = values

    # The label is the dichotomised EPDS of the first row recorded at time point 2
    at_tp2 = group['TimePoint'] == 2
    labels.append(group.loc[at_tp2, 'Dichotomous_EPDS'].values[0])
    transformed_data.append(time_point_matrix)

transformed_data = np.array(transformed_data)
labels = np.array(labels)

In [29]:
# Predictors use the first two time points only; time point 2 supplies the label.
original_num_time_steps = 2
original_num_features = transformed_data.shape[2]

# Flatten [individual, tp, feature] -> [individual, tp * feature]
X = transformed_data[:, :original_num_time_steps, :].reshape(len(transformed_data), -1)
y = labels

individual_ids = np.unique(new_data['Individual_ID'])

# Number of synthetic samples to generate (70 - 15 = 55).
# NOTE(review): presumably majority count minus minority count — confirm.
num_new_samples = 55

In [30]:
# Stratified K-fold evaluation with GMM oversampling of the positive class.
# The GMM is fitted on the positive rows of each training fold only; the
# validation fold is scored without any synthetic data.
# NOTE(review): num_new_samples is a fixed count, not derived from the class
# sizes of the individual fold — confirm this is intended.
metric_names = ('Balanced_accuracy', 'Sensitivity', 'Specificity', 'AUC')
metrics_list = []
minority_class = 1

for train_idx, val_idx in skf.split(individual_ids, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the generative model on the minority rows of this training fold
    X_train_minority = X_train[y_train == minority_class]
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_train_minority)

    # Draw synthetic rows. The round trip through (sample, time step, feature)
    # leaves the values unchanged and only checks the expected width.
    new_samples_flattened, _ = gmm.sample(num_new_samples)
    n_generated = new_samples_flattened.shape[0]
    new_samples_np = (
        new_samples_flattened
        .reshape(n_generated, original_num_time_steps, original_num_features)
        .reshape(n_generated, -1)
    )

    # Append the synthetic positives to the training fold
    synthetic_labels = np.full(n_generated, minority_class)
    combined_X_train = np.concatenate([X_train, new_samples_np], axis=0)
    combined_y_train = np.concatenate([y_train, synthetic_labels], axis=0)

    # One forest per seed, each scored on the same validation fold
    for seed in range(num_seeds):
        rf = RandomForestClassifier(n_estimators=100, random_state=seed, class_weight="balanced")
        rf.fit(combined_X_train, combined_y_train)

        y_pred = rf.predict(X_val)
        y_pred_proba = rf.predict_proba(X_val)[:, 1]
        metrics_list.append(evaluate_model(y_val, y_pred, y_pred_proba))

# Mean and (population) standard deviation over all fold x seed runs
avg_metrics = {name: np.mean([m[name] for m in metrics_list]) for name in metric_names}
std_metrics = {name: np.std([m[name] for m in metrics_list]) for name in metric_names}

print("Average Results:")
for metric in metric_names:
    print(f"  {metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

Average Results:
  Balanced_accuracy: 0.4914 ± 0.0183
  Sensitivity: 0.0000 ± 0.0000
  Specificity: 0.9829 ± 0.0366
  AUC: 0.2705 ± 0.1578
