# Object:To implement NMF for depression classification by Zhong et al (2023).

- create_time: 20 Oct 2023 
- ref: Zhong, Jitao, et al. "Robust discriminant feature extraction for automatic depression recognition." Biomedical Signal Processing and Control 82 (2023): 104505.

In [1]:
# Import require libraries
import numpy as np 
import matplotlib.pyplot as plt 
from scipy.signal import welch
import pywt


# Extract 5 features from each channel

In [28]:

# input = np.random.rand(458, 52, 125)
# please make sure the input data is normalized using something like z-norm
def normalize(data):
    # Iterate over each subject
    normalized_data = np.empty_like(data)
    for i in range(data.shape[0]):
        # Calculate the mean and standard deviation for the current subject
        mean = np.mean(data[i, :, :])
        std = np.std(data[i, :, :])

        # Perform z-normalization for the current subject
        normalized_data[i, :, :] = (data[i, :, :] - mean) / std
    return normalized_data


input = np.load('/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/Output_npy/twoDoctor/HbO-All-HC-MDD/correct_channel_data.npy')
input = normalize(np.transpose(input, (0, 2, 1)))

index_task_start = 10
index_task_end = 70
fs = 1  # sampling rate

# 1. Total: Sum of hemoglobin concentration changes in the whole process.
feature_1 = np.sum(input, axis=2)[..., np.newaxis]
print(f'feature 1 shape -> {feature_1.shape}')

# 2. Peak: Peak value of hemoglobin concentration changes in four periods.
feature_2 = np.concatenate(
    (np.max(input[..., :index_task_start], axis=2)[..., np.newaxis],
     np.max(input[..., index_task_start:index_task_end],
            axis=2)[..., np.newaxis],
     np.max(input[..., :index_task_end:], axis=2)[..., np.newaxis],
     np.max(input[..., :], axis=2)[..., np.newaxis]),
    axis=2
)
print(f'feature 2 shape -> {feature_2.shape}')

# 3. Valley: Valley value of hemoglobin concentration changes in four periods.
feature_3 = np.concatenate(
    (np.min(input[..., :index_task_start], axis=2)[..., np.newaxis],
     np.min(input[..., index_task_start:index_task_end],
            axis=2)[..., np.newaxis],
     np.min(input[..., :index_task_end:], axis=2)[..., np.newaxis],
     np.min(input[..., :], axis=2)[..., np.newaxis]),
    axis=2
)
print(f'feature 3 shape -> {feature_3.shape}')

# 4. Average: Mean value of hemoglobin concentration changes in four periods.
feature_4 = np.concatenate(
    (np.mean(input[..., :index_task_start], axis=2)[..., np.newaxis],
     np.mean(input[..., index_task_start:index_task_end],
             axis=2)[..., np.newaxis],
     np.mean(input[..., :index_task_end:], axis=2)[..., np.newaxis],
     np.mean(input[..., :], axis=2)[..., np.newaxis]),
    axis=2
)
print(f'feature 4 shape -> {feature_4.shape}')



def entropy_wsh(y):
    return -np.sum(y*y * np.log10(y*y), axis=1)


def entropy_we(y):
    return np.sum(y, axis=1)


def entropy_wp(y):
    t1 = np.abs(y) ** 2
    t2 = np.linalg.norm(y, ord=2, axis=1) ** 2 / y.shape[1]
    # original should be
    # t2 = np.linalg.norm(y, ord=norm_q) ** norm_q
    # but this will cause the value to be 1000x time compared to other entropy values.
    # So I added / y.shape[1]
    return np.sum(t1-t2[..., np.newaxis], axis=1)


def entropy_wt(y):
    b = np.where(y > q, 1, 0)
    return np.sum(b, axis=1)


def entropy_wsu(y):
    t1 = y.shape[0]
    t2 = np.sum(np.where(y > q, 0, 1), axis=1)
    t3 = min(np.min(y**2), q**2)
    return t1 - t2 + t3

# Bacuse using wavelet decomposition will
q = np.mean(input)  # threhold
print(f'threshold is set to {q} ')

def calculte_entropy(x):
    coeffs = pywt.wavedec(x, 'db6', level=4)
    max_length = max(len(coeff) for coeff in coeffs)
    y = np.array([np.pad(coeff, (0, max_length - len(coeff)),
                 constant_values=(q)) for coeff in coeffs])
    return np.concatenate((entropy_wsh(y), entropy_we(y), entropy_wp(y), entropy_wt(y), entropy_wsu(y)), axis=0)

feature_5 = np.zeros((input.shape[0], input.shape[1], 25))
for i in range(input.shape[0]):
    for j in range(input.shape[1]):
        feature_5[i, j] = calculte_entropy(input[i, j])
print(f'feature 10 shape -> {feature_5.shape}')

nor_all_feature = np.concatenate(
    (normalize(feature_1),
     normalize(feature_2),
     normalize(feature_3),
     normalize(feature_4),
     normalize(feature_5)),
    axis=2
)

all_feature = np.concatenate(
    (feature_1,
     np.mean(feature_2, axis=-1)[...,np.newaxis],
     np.mean(feature_3, axis=-1)[...,np.newaxis],
     np.mean(feature_4, axis=-1)[...,np.newaxis],
     np.mean(feature_5, axis=-1)[...,np.newaxis]),
    axis=2
)

all_feature = np.reshape(all_feature, (458, 52*5))
# Based on individual subject, do normalization
print(f'feature all shape -> {all_feature.shape}')
nor_individual_subject_processed_all_feature = np.nan_to_num(nor_all_feature)


save_fold = '/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf'
# np.save(save_fold + '/data', nor_individual_subject_processed_all_feature)



# Based on all subject, do normalization
def normalize_all_subject(data):
    # Iterate over each subject
    normalized_data = np.empty_like(data)
    # Calculate the mean and standard deviation for the current subject
    mean = np.mean(data)
    std = np.std(data)

    # Perform z-normalization for the current subject
    normalized_data = (data - mean) / std
    return normalized_data

processed_all_feature = np.nan_to_num(all_feature)
nor_all_processed_all_feature = normalize_all_subject(processed_all_feature)
np.save(save_fold + '/nor_allsubject_data', nor_all_processed_all_feature)

feature 1 shape -> (458, 52, 1)
feature 2 shape -> (458, 52, 4)
feature 3 shape -> (458, 52, 4)
feature 4 shape -> (458, 52, 4)
threshold is set to 3.0932596088179035e-18 


  return -np.sum(y*y * np.log10(y*y), axis=1)
  return -np.sum(y*y * np.log10(y*y), axis=1)


feature 10 shape -> (458, 52, 25)
feature all shape -> (458, 260)


In [4]:
nor_all_processed_all_feature = np.load('/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/nor_allsubject_data.npy')
label = np.load('/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/label.npy')
label_1_2 = label + 1
label_1_2 = np.concatenate((np.ones(1), label_1_2), axis=0)
 
np.save('/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/label_1_2.npy', label_1_2)

nor_all_processed_all_feature = np.concatenate((np.ones((1,260)), nor_all_processed_all_feature), axis=0)


In [5]:
folder = '/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/'
np.savetxt(folder+"nor_all_processed_all_feature.csv", nor_all_processed_all_feature, delimiter=",")
np.savetxt(folder+"label_1_2.csv", label_1_2, delimiter=",")

In [97]:
import pandas as pd

# Convert data and labels to DataFrame
df_data = pd.DataFrame(nor_all_processed_all_feature)
df_labels = pd.DataFrame(label_1_2)

# Add column and row indices
# df_data.index.name = 'Index'
df_data.columns = ['Feature_' + str(i) for i in range(df_data.shape[1])]


# Save to CSV
df_data.to_csv(folder + "nor_all_processed_all_feature.csv")
df_labels.to_csv(folder + "label_1_2.csv")


In [92]:
# Create row indices for the data
rows, cols = nor_all_processed_all_feature.shape
row_indices = np.arange(rows).reshape(-1, 1)
data_with_row_indices = np.hstack((row_indices, nor_all_processed_all_feature))

# Create column indices and add them at the top
col_indices = np.arange(cols + 1)  # +1 for the additional row index column
data_with_col_and_row_indices = np.vstack((col_indices, data_with_row_indices))

# Save the modified array to CSV
np.savetxt(folder + "nor_all_processed_all_feature.csv", data_with_col_and_row_indices, delimiter=",", fmt='%d')

# For label_1_2 (assuming it's a 1D array)
label_indices = np.arange(label_1_2.size).reshape(-1, 1)
labels_with_indices = np.hstack((label_indices, label_1_2.reshape(-1, 1)))
np.savetxt(folder + "label_1_2.csv", labels_with_indices, delimiter=",", fmt='%d')


In [None]:
xdata <- read.csv("/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/nor_all_processed_all_feature.csv")

xdata <- as.matrix(xdata)

xdata <- xdata+20

xdata <- t(xdata)

# Assign simple numeric column names
colnames(xdata) <- 1:ncol(xdata)

# Assign row numbers as row names
rownames(xdata) <- 1:nrow(xdata)



xlabel <- read.csv("/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/label_1_2.csv")

xlabel <- as.matrix(xlabel)

xlabel <- t(xlabel)

# Assign simple numeric column names
colnames(xlabel) <- 1:ncol(xlabel)

# Assign row numbers as row names
rownames(xlabel) <- 1:nrow(xlabel)

xlabel <- as.vector(t(xlabel))


---

# Total number of subjects (columns)
n <- ncol(xdata)

# 确定中间索引
mid_index <- n %/% 2  # 整数除法

# 使用前一半数据作为训练集
x_train <- xdata[, 1:mid_index]
y_train <- xlabel[1:mid_index]

# 使用后一半数据作为测试集
x_test <- xdata[, (mid_index+1):n]
y_test <- xlabel[(mid_index+1):n]



res = DNMF(x_train, y_train, r = 512)

W=res['W']
W_matrix <- W$W

library(MASS)
H <- ginv(t(W_matrix) %*% W_matrix) %*% t(W_matrix) %*% x_test


---

write.csv(H, file="/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/H_data_test.csv", row.names=FALSE)

write.csv(res['H'][[1]], file = "/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/H_data_train.csv", row.names = FALSE)

write.csv(train_indices, file="/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/train_indices.csv", row.names=FALSE)



In [42]:

# Read the CSV file
xdf = pd.read_csv('/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf/nor_all_processed_all_feature.csv')
print(xdf)

     2.210258088519002073e-01  9.297147231294590475e-02  \
0                   -0.063401                  0.106284   
1                   -0.665109                  0.094262   
2                   -1.901612                  0.106929   
3                   -0.302009                  0.107729   
4                   -0.156811                  0.118637   
..                        ...                       ...   
452                  2.465671                  0.159177   
453                 -1.707635                  0.117842   
454                 -1.646723                  0.085574   
455                 -3.324772                  0.069146   
456                  4.206093                  0.156948   

     9.297147231294590475e-02.1  9.297147231294590475e-02.2  \
0                      0.071422                    0.089479   
1                      0.072878                    0.083236   
2                      0.039099                    0.078260   
3                      0.071439        

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV file
folder = '/Users/shanxiafeng/Documents/Code/python/fnirs_DL/JinyuanWang_pythonCode/allData/data_for_reproducing_model/zhong_nmf'
df_train = pd.read_csv(folder + '/H_data_train.csv')
df_test = pd.read_csv(folder + '/H_data_test.csv')
df_train_indices = pd.read_csv(folder + '/train_indices.csv')
print(df_train)
print(df_test)
print(df_train_indices)

                1             2             3             4             5  \
0    1.157917e+03  4.786688e-07  1.157913e+03  5.070737e-07  1.157915e+03   
1    2.420549e-09  1.159172e+03  1.293239e-08  1.159173e+03  3.330801e-09   
2    1.352115e-04  5.336377e-05  3.604982e-04  2.177616e-05  3.360749e-03   
3    1.241169e-03  9.667627e-06  2.186340e-03  6.488010e-06  1.625178e-04   
4    3.126912e-03  4.437764e-05  1.365095e-03  5.521021e-05  1.640083e-03   
..            ...           ...           ...           ...           ...   
507  1.415965e-03  4.200493e-05  3.281992e-03  4.383022e-05  2.533778e-03   
508  4.136677e-03  2.776165e-05  1.212795e-03  4.239838e-05  1.388241e-03   
509  4.104419e-04  4.597180e-06  2.533258e-03  2.361238e-05  9.068421e-04   
510  3.969107e-03  4.517433e-05  5.504212e-03  1.240260e-05  2.683820e-03   
511  2.729687e-03  1.014146e-05  5.577471e-03  1.395522e-05  3.410981e-03   

               6             7             8             9            10  .

In [27]:
x_train = np.array(df_train)
x_train = np.transpose(x_train, (1,0))
print(x_train.shape)

x_test = np.array(df_test)
x_test = np.transpose(x_test, (1,0))
print(x_test.shape)


(229, 512)
(229, 512)


In [31]:
y_train = label[:len(label)//2]
y_test = label[len(label)//2:]


# dmf_data = np.concatenate((dmf_data, np.ones((1000,512))), axis=0)
# dmf_label = np.concatenate((dmf_label, np.ones((500))), axis=0)
# dmf_label = np.concatenate((dmf_label, np.zeros((500))), axis=0)
print(y_train.shape)
print(y_test.shape)

(229,)
(229,)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# Assuming dmf_data and dmf_label are already loaded in your environment

# Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(nor_all_processed_all_feature, label, test_size=0.5, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(dmf_data, dmf_label, test_size=0.1, random_state=42)
X_train, X_test = x_train, x_test

# print(X_train.shape)
# Initialize a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf = SVC(kernel='linear', C=1, random_state=42)
# Train the classifier

# # Perform 5-fold cross-validation
scores = cross_val_score(clf, X_train, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores, 1):
    print(f"Fold-{i} Accuracy: {score * 100:.2f}%")

clf.fit(X_train, y_train)


# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print detailed classification report
print(classification_report(y_test, y_pred))


Fold-1 Accuracy: 100.00%
Fold-2 Accuracy: 100.00%
Fold-3 Accuracy: 100.00%
Fold-4 Accuracy: 100.00%
Fold-5 Accuracy: 100.00%
Accuracy: 49.78%
              precision    recall  f1-score   support

           0       0.53      0.47      0.50       121
           1       0.47      0.53      0.50       108

    accuracy                           0.50       229
   macro avg       0.50      0.50      0.50       229
weighted avg       0.50      0.50      0.50       229



In [48]:
print(label_1_2)

[1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1
 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2
 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1
 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2
 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1
 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2
 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1
 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2
 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1
 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2
 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
