## Word2Vec_MFCC_Minmax_Multiply

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Dropout, LayerNormalization, Multiply
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from gensim.models import KeyedVectors  # Word2Vec 모델을 사용하기 위해 추가

# Participant labels: keep only the ID column and the PHQ_Three label column.
labels_path = "D:/EDAIC-WOZ/labels/normal_label2.xlsx"
labels_df = pd.read_excel(labels_path)[['Participant_ID', 'PHQ_Three']]

# Root folders of the processed transcripts (text) and the audio clips.
source_root_text = "D:/EDAIC-WOZ/Processed_transcript/three_level2"
source_root_audio = "D:/EDAIC-WOZ/audio_token_level/three_level2"

# Parallel accumulators: entry k of each list belongs to the same sample.
all_sentences, all_mfcc_features, label_list = [], [], []

# Function to load and preprocess individual CSV files
def load_and_preprocess_csv(file_path):
    """Return every entry of the CSV's ``Text`` column joined into one string.

    Args:
        file_path: Path of a CSV file that contains a ``Text`` column.

    Returns:
        The column's values, in file order, separated by single spaces.
        Empty cells are skipped and non-string cells (e.g. a transcript line
        that is just a number) are converted with ``str`` so the join cannot
        fail on them.

    Raises:
        KeyError: If the file has no ``Text`` column.
    """
    df = pd.read_csv(file_path)
    # pandas reads empty cells as float NaN and purely numeric cells as
    # numbers; str.join only accepts strings, so drop the former and convert
    # the latter.
    text_values = df['Text'].dropna().astype(str)
    return ' '.join(text_values.tolist())
def extract_aggregate_mfcc_minmax(audio_files, max_pad_len=100, n_mfcc=128):
    """Average min-max-scaled MFCC matrices over a collection of audio files.

    Every path that exists is loaded with ``sr=None``, converted to ``n_mfcc``
    MFCCs, zero-padded or truncated to exactly ``max_pad_len`` frames and then
    scaled by a ``MinMaxScaler`` fitted on that file's own frames.  Paths that
    do not exist are silently skipped.

    Args:
        audio_files: Iterable of audio file paths.
        max_pad_len: Number of frames kept per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Array of shape ``(max_pad_len, n_mfcc)``: the element-wise mean over
        all processed files, or all zeros when none of the paths exists.
    """
    normalizer = MinMaxScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, rate = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

        # Force exactly max_pad_len columns (frames) per file.
        missing_frames = max_pad_len - coeffs.shape[1]
        if missing_frames > 0:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
        else:
            coeffs = coeffs[:, :max_pad_len]

        # Scale each coefficient across the frames of this file; the result is
        # already laid out as (frames, coefficients).
        # NOTE(review): the scaler is fitted after padding, so the zero frames
        # take part in the min/max statistics - confirm this is intended.
        per_file_features.append(normalizer.fit_transform(coeffs.T))

    if not per_file_features:
        # Fallback when no audio file was found.
        return np.zeros((max_pad_len, n_mfcc))

    return np.mean(per_file_features, axis=0)

# Build one (transcript, MFCC, label) sample per audio subfolder, scaling the
# MFCCs with the MinMaxScaler-based extractor.
# sorted() makes the sample order - and therefore any seeded split made later -
# independent of the unspecified order in which os.listdir returns entries.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<subfolder_num>";
    # anything else (stray files, differently named folders) is skipped
    # instead of aborting the whole run.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Transcript CSV that belongs to this audio subfolder.
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # All transcript lines joined into one string.
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate wav path per whitespace-separated token of the transcript;
    # paths that do not exist are ignored by the extractor.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    audio_files = [
        os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav')
        for i in range(len(aggregated_sentence.split()))
    ]

    # Mean of the min-max-scaled MFCC matrices of those files.
    aggregated_mfcc = extract_aggregate_mfcc_minmax(audio_files)

    # Keep the three lists aligned: one entry each per accepted subfolder.
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)


In [None]:
#%% Tokenize and prepare Word2Vec embedding
# Tokenize the sentences
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import KFold, train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Dropout, LayerNormalization, Multiply
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Build the vocabulary from every collected transcript.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)
word_index = tokenizer.word_index

# Pretrained Word2Vec model that supplies the initial word vectors.
word2vec_model = Word2Vec.load("D:/EDAIC-WOZ/word2vec_sgns_custom.model")
embedding_dim = word2vec_model.vector_size
vocab_size = len(word_index) + 1  # one extra row on top of the tokenizer's indices

# Row i holds the pretrained vector of the word with index i; words missing
# from the Word2Vec vocabulary keep an all-zero row (gensim 4.x key_to_index).
embedding_matrix = np.zeros((vocab_size, embedding_dim))
pretrained_vectors = word2vec_model.wv
for word, i in word_index.items():
    if word in pretrained_vectors.key_to_index:
        embedding_matrix[i] = pretrained_vectors[word]

# Fixed token length of every text sample (adjust based on your data).
max_length = 100

# Embedding layer initialised from the Word2Vec matrix.  trainable=True means
# the vectors are fine-tuned during training; pass False to freeze them.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=True,
)

# Integer-encode the transcripts and pad/truncate them to max_length.
sequences = tokenizer.texts_to_sequences(all_sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Stack the per-sample MFCC matrices into one array.
mfcc_features = np.array(all_mfcc_features)

# One-hot encode the three-class labels.
categorical_labels = to_categorical(label_list, num_classes=3)

# Define Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Both sub-layers are wrapped with dropout, a residual connection and layer
    normalization.  The constructor arguments are stored on the instance so
    that ``get_config`` can return them for serialization.

    Args:
        embed_dim: Output size of the feed-forward network; also passed to
            ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept only so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Note: key_dim is set to embed_dim itself, not embed_dim // num_heads.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply attention and feed-forward sub-layers to ``inputs``.

        ``training`` is forwarded to the two dropout layers.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection requires inputs and attention output to match.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Recreate the layer by passing ``config`` as keyword arguments."""
        return cls(**config)

# Define model using Word2Vec embedding
def create_transformer_model(text_input_shape, mfcc_input_shape):
    """Build the text + MFCC classifier fused by element-wise multiplication.

    The token sequence is embedded with Word2Vec-initialised vectors, the MFCC
    matrix is projected to the same width by a dense layer, and the two are
    multiplied element-wise before a transformer block, average pooling and a
    3-way softmax head.

    A new Embedding layer is created on every call.  Reusing one trainable
    module-level layer would carry the embedding weights fine-tuned in one
    cross-validation fold into the model of the next fold, i.e. the next model
    would start from weights that were already fitted on its test samples.

    Args:
        text_input_shape: Shape of one text sample, ``(sequence_length,)``.
        mfcc_input_shape: Shape of one MFCC sample.  Its first dimension must
            equal ``sequence_length`` so the element-wise product is defined.

    Returns:
        An uncompiled ``Model`` taking ``[text, mfcc]`` and returning class
        probabilities over 3 classes.
    """
    text_inputs = Input(shape=text_input_shape)
    # Fresh, independently trainable copy of the pretrained embedding matrix.
    text_embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               input_length=text_input_shape[0],
                               trainable=True)
    embedded_sequences = text_embedding(text_inputs)

    # Project the MFCCs to the embedding width so both branches can be
    # multiplied (identical to the previous fixed 128 when embedding_dim is 128).
    mfcc_inputs = Input(shape=mfcc_input_shape)
    mfcc_dense = Dense(embedding_dim, activation="relu")(mfcc_inputs)

    combined = Multiply()([embedded_sequences, mfcc_dense])

    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
    x = transformer_block(combined)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(3, activation="softmax")(x)
    return Model(inputs=[text_inputs, mfcc_inputs], outputs=outputs)

# 5-fold cross-validation.  Every fold trains a new model, reloads the
# checkpoint with the lowest validation loss and scores it on the held-out fold.
# NOTE(review): samples come from "<participant>_<n>" folders, so a plain KFold
# can put samples of the same participant into both train and test; consider a
# group-aware split.
# NOTE(review): the "best" model is picked by test-fold accuracy, so its score
# is not an unbiased estimate.
class_names = ['Non-depressed', 'Mildly depressed', 'Severely depressed']
class_ids = list(range(len(class_names)))

kf = KFold(n_splits=5, shuffle=True, random_state=119)
fold_results = []
best_model = None
best_accuracy = 0

text_input_shape = (max_length,)
# Derived from the data instead of a second hard-coded (100, 128).
mfcc_input_shape = tuple(mfcc_features.shape[1:])

for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences)):
    print(f"Training fold {fold + 1}/5...")

    X_text_train, X_text_test = padded_sequences[train_index], padded_sequences[test_index]
    X_mfcc_train, X_mfcc_test = mfcc_features[train_index], mfcc_features[test_index]
    y_train, y_test = categorical_labels[train_index], categorical_labels[test_index]

    # Carve a validation set (20 %) out of the training part of this fold.
    X_text_train, X_text_val, X_mfcc_train, X_mfcc_val, y_train, y_val = train_test_split(
        X_text_train, X_mfcc_train, y_train, test_size=0.2, random_state=119, shuffle=True
    )

    model = create_transformer_model(text_input_shape, mfcc_input_shape)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()  # summary() prints by itself; wrapping it in print() adds a stray "None"

    checkpoint_path = f'depression_diagnosis_model_fold_{fold+1}.keras'
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    history = model.fit([X_text_train, X_mfcc_train], y_train, epochs=50, batch_size=16,
                        validation_data=([X_text_val, X_mfcc_val], y_val),
                        callbacks=[early_stopping, model_checkpoint])

    # Continue with the best checkpoint of this fold.
    model = tf.keras.models.load_model(checkpoint_path, custom_objects={'TransformerBlock': TransformerBlock})

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate([X_text_test, X_mfcc_test], y_test)
    print(f'Fold {fold + 1} Test Accuracy: {accuracy * 100:.2f}%')

    # Collect the fold results
    fold_results.append((loss, accuracy))

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

    # Predict on the test set
    y_pred = model.predict([X_text_test, X_mfcc_test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Weighted metrics; zero_division=0 keeps the value scikit-learn would
    # return anyway for a class that is never predicted, without the warning.
    f1 = f1_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    precision = precision_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    fold_accuracy = np.mean(y_true_classes == y_pred_classes)

    print(f'Fold {fold + 1} F1 Score: {f1:.2f}')
    print(f'Fold {fold + 1} Precision: {precision:.2f}')
    print(f'Fold {fold + 1} Recall: {recall:.2f}')
    print(f'Fold {fold + 1} Accuracy: {fold_accuracy * 100:.2f}%')

    # Always build the full 3x3 matrix: without labels=, a class missing from
    # this fold would shrink the matrix and no longer match the tick labels.
    conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

    # Normalize by row (true classes); rows without samples stay at 0 instead
    # of becoming NaN through a division by zero.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_normalized = np.divide(
        conf_matrix, row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                annot_kws={'size': 16})  # Font size for annotations
    plt.xlabel('Predicted', fontsize=14)
    plt.ylabel('True', fontsize=14)
    plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

    # Training curves: accuracy on the left panel, loss on the right panel.
    plt.figure(figsize=(12, 4))
    panels = [('accuracy', 'Accuracy', 'upper left'), ('loss', 'Loss', 'upper right')]
    for panel_no, (metric, title, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_no)
        plt.plot(history.history[metric], label=f'Train {title}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
        plt.title(f'Fold {fold + 1} - Model {title}', fontsize=16)
        plt.ylabel(title, fontsize=14)
        plt.xlabel('Epoch', fontsize=14)
        plt.legend(loc=legend_loc)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

    plt.show()

# Calculate and print average loss and accuracy across all folds
average_loss = np.mean([result[0] for result in fold_results])
average_accuracy = np.mean([result[1] for result in fold_results])
print(f"Average Test Accuracy across 5 folds: {average_accuracy * 100:.2f}%")
print(f"Average Test Loss across 5 folds: {average_loss:.4f}")

# Save the model of the fold with the highest test accuracy.
if best_model is not None:
    best_model_path = 'D:/EDAIC-WOZ/best_model/three_level2/mfcc_minmax_multiply_word2vec.h5'
    # Create the target folder first so a missing directory cannot discard
    # the result of the whole training run.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    best_model.save(best_model_path)
    print(f"Best model saved as {os.path.basename(best_model_path)}")

## Word2Vec_MFCC_Standard_Multiply

In [None]:
#%% MFCC Standard scaling
from sklearn.preprocessing import StandardScaler

# Function to extract and aggregate MFCC features from audio files with StandardScaling
def extract_aggregate_mfcc_standard(audio_files, max_pad_len=100, n_mfcc=128):
    """Average standard-scaled MFCC matrices over a collection of audio files.

    Every path that exists is loaded with ``sr=None``, converted to ``n_mfcc``
    MFCCs, zero-padded or truncated to exactly ``max_pad_len`` frames and then
    scaled by a ``StandardScaler`` fitted on that file's own frames.  Paths
    that do not exist are silently skipped.

    Args:
        audio_files: Iterable of audio file paths.
        max_pad_len: Number of frames kept per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Array of shape ``(max_pad_len, n_mfcc)``: the element-wise mean over
        all processed files, or all zeros when none of the paths exists.
    """
    standardizer = StandardScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, rate = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

        # Force exactly max_pad_len columns (frames) per file.
        missing_frames = max_pad_len - coeffs.shape[1]
        if missing_frames > 0:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
        else:
            coeffs = coeffs[:, :max_pad_len]

        # Standardize each coefficient across the frames of this file; the
        # result is already laid out as (frames, coefficients).
        # NOTE(review): the scaler is fitted after padding, so the zero frames
        # take part in the mean/variance statistics - confirm this is intended.
        per_file_features.append(standardizer.fit_transform(coeffs.T))

    if not per_file_features:
        # Fallback when no audio file was found.
        return np.zeros((max_pad_len, n_mfcc))

    return np.mean(per_file_features, axis=0)

# Build one (transcript, MFCC, label) sample per audio subfolder, scaling the
# MFCCs with the StandardScaler-based extractor.
# Start from empty accumulators: without this reset the samples collected by
# the earlier min-max cell would still be in the lists and this
# "standard scaling" experiment would silently train on both feature variants.
all_sentences = []
all_mfcc_features = []
label_list = []

# sorted() makes the sample order - and therefore any seeded split made later -
# independent of the unspecified order in which os.listdir returns entries.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<subfolder_num>";
    # anything else (stray files, differently named folders) is skipped
    # instead of aborting the whole run.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Transcript CSV that belongs to this audio subfolder.
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # All transcript lines joined into one string.
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    # Three-level label (read from the PHQ_Three column).
    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate wav path per whitespace-separated token of the transcript;
    # paths that do not exist are ignored by the extractor.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    audio_files = [
        os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav')
        for i in range(len(aggregated_sentence.split()))
    ]

    # Mean of the standard-scaled MFCC matrices of those files.
    aggregated_mfcc = extract_aggregate_mfcc_standard(audio_files)

    # Keep the three lists aligned: one entry each per accepted subfolder.
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)


In [None]:
#%% Tokenize and prepare Word2Vec embedding
# Tokenize the sentences
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import KFold, train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Dropout, LayerNormalization, Multiply
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Build the vocabulary from every collected transcript.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)
word_index = tokenizer.word_index

# Pretrained Word2Vec model that supplies the initial word vectors.
word2vec_model = Word2Vec.load("D:/EDAIC-WOZ/word2vec_sgns_custom.model")
embedding_dim = word2vec_model.vector_size
vocab_size = len(word_index) + 1  # one extra row on top of the tokenizer's indices

# Row i holds the pretrained vector of the word with index i; words missing
# from the Word2Vec vocabulary keep an all-zero row (gensim 4.x key_to_index).
embedding_matrix = np.zeros((vocab_size, embedding_dim))
pretrained_vectors = word2vec_model.wv
for word, i in word_index.items():
    if word in pretrained_vectors.key_to_index:
        embedding_matrix[i] = pretrained_vectors[word]

# Fixed token length of every text sample (adjust based on your data).
max_length = 100

# Embedding layer initialised from the Word2Vec matrix.  trainable=True means
# the vectors are fine-tuned during training; pass False to freeze them.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=True,
)

# Integer-encode the transcripts and pad/truncate them to max_length.
sequences = tokenizer.texts_to_sequences(all_sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Stack the per-sample MFCC matrices into one array.
mfcc_features = np.array(all_mfcc_features)

# One-hot encode the three-class labels.
categorical_labels = to_categorical(label_list, num_classes=3)

# Define Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Both sub-layers are wrapped with dropout, a residual connection and layer
    normalization.  The constructor arguments are stored on the instance so
    that ``get_config`` can return them for serialization.

    Args:
        embed_dim: Output size of the feed-forward network; also passed to
            ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept only so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Note: key_dim is set to embed_dim itself, not embed_dim // num_heads.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply attention and feed-forward sub-layers to ``inputs``.

        ``training`` is forwarded to the two dropout layers.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection requires inputs and attention output to match.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Recreate the layer by passing ``config`` as keyword arguments."""
        return cls(**config)

# Define model using Word2Vec embedding
def create_transformer_model(text_input_shape, mfcc_input_shape):
    """Build the text + MFCC classifier fused by element-wise multiplication.

    The token sequence is embedded with Word2Vec-initialised vectors, the MFCC
    matrix is projected to the same width by a dense layer, and the two are
    multiplied element-wise before a transformer block, average pooling and a
    3-way softmax head.

    A new Embedding layer is created on every call.  Reusing one trainable
    module-level layer would carry the embedding weights fine-tuned in one
    cross-validation fold into the model of the next fold, i.e. the next model
    would start from weights that were already fitted on its test samples.

    Args:
        text_input_shape: Shape of one text sample, ``(sequence_length,)``.
        mfcc_input_shape: Shape of one MFCC sample.  Its first dimension must
            equal ``sequence_length`` so the element-wise product is defined.

    Returns:
        An uncompiled ``Model`` taking ``[text, mfcc]`` and returning class
        probabilities over 3 classes.
    """
    text_inputs = Input(shape=text_input_shape)
    # Fresh, independently trainable copy of the pretrained embedding matrix.
    text_embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               input_length=text_input_shape[0],
                               trainable=True)
    embedded_sequences = text_embedding(text_inputs)

    # Project the MFCCs to the embedding width so both branches can be
    # multiplied (identical to the previous fixed 128 when embedding_dim is 128).
    mfcc_inputs = Input(shape=mfcc_input_shape)
    mfcc_dense = Dense(embedding_dim, activation="relu")(mfcc_inputs)

    combined = Multiply()([embedded_sequences, mfcc_dense])

    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
    x = transformer_block(combined)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(3, activation="softmax")(x)
    return Model(inputs=[text_inputs, mfcc_inputs], outputs=outputs)

# 5-fold cross-validation.  Every fold trains a new model, reloads the
# checkpoint with the lowest validation loss and scores it on the held-out fold.
# NOTE(review): samples come from "<participant>_<n>" folders, so a plain KFold
# can put samples of the same participant into both train and test; consider a
# group-aware split.
# NOTE(review): the "best" model is picked by test-fold accuracy, so its score
# is not an unbiased estimate.
class_names = ['Non-depressed', 'Mildly depressed', 'Severely depressed']
class_ids = list(range(len(class_names)))

kf = KFold(n_splits=5, shuffle=True, random_state=119)
fold_results = []
best_model = None
best_accuracy = 0

text_input_shape = (max_length,)
# Derived from the data instead of a second hard-coded (100, 128).
mfcc_input_shape = tuple(mfcc_features.shape[1:])

for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences)):
    print(f"Training fold {fold + 1}/5...")

    X_text_train, X_text_test = padded_sequences[train_index], padded_sequences[test_index]
    X_mfcc_train, X_mfcc_test = mfcc_features[train_index], mfcc_features[test_index]
    y_train, y_test = categorical_labels[train_index], categorical_labels[test_index]

    # Carve a validation set (20 %) out of the training part of this fold.
    X_text_train, X_text_val, X_mfcc_train, X_mfcc_val, y_train, y_val = train_test_split(
        X_text_train, X_mfcc_train, y_train, test_size=0.2, random_state=119, shuffle=True
    )

    model = create_transformer_model(text_input_shape, mfcc_input_shape)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()  # summary() prints by itself; wrapping it in print() adds a stray "None"

    checkpoint_path = f'depression_diagnosis_model_fold_{fold+1}.keras'
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    history = model.fit([X_text_train, X_mfcc_train], y_train, epochs=50, batch_size=16,
                        validation_data=([X_text_val, X_mfcc_val], y_val),
                        callbacks=[early_stopping, model_checkpoint])

    # Continue with the best checkpoint of this fold.
    model = tf.keras.models.load_model(checkpoint_path, custom_objects={'TransformerBlock': TransformerBlock})

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate([X_text_test, X_mfcc_test], y_test)
    print(f'Fold {fold + 1} Test Accuracy: {accuracy * 100:.2f}%')

    # Collect the fold results
    fold_results.append((loss, accuracy))

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

    # Predict on the test set
    y_pred = model.predict([X_text_test, X_mfcc_test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Weighted metrics; zero_division=0 keeps the value scikit-learn would
    # return anyway for a class that is never predicted, without the warning.
    f1 = f1_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    precision = precision_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(y_true_classes, y_pred_classes, average='weighted', zero_division=0)
    fold_accuracy = np.mean(y_true_classes == y_pred_classes)

    print(f'Fold {fold + 1} F1 Score: {f1:.2f}')
    print(f'Fold {fold + 1} Precision: {precision:.2f}')
    print(f'Fold {fold + 1} Recall: {recall:.2f}')
    print(f'Fold {fold + 1} Accuracy: {fold_accuracy * 100:.2f}%')

    # Always build the full 3x3 matrix: without labels=, a class missing from
    # this fold would shrink the matrix and no longer match the tick labels.
    conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

    # Normalize by row (true classes); rows without samples stay at 0 instead
    # of becoming NaN through a division by zero.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_normalized = np.divide(
        conf_matrix, row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                annot_kws={'size': 16})  # Font size for annotations
    plt.xlabel('Predicted', fontsize=14)
    plt.ylabel('True', fontsize=14)
    plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

    # Training curves: accuracy on the left panel, loss on the right panel.
    plt.figure(figsize=(12, 4))
    panels = [('accuracy', 'Accuracy', 'upper left'), ('loss', 'Loss', 'upper right')]
    for panel_no, (metric, title, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_no)
        plt.plot(history.history[metric], label=f'Train {title}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
        plt.title(f'Fold {fold + 1} - Model {title}', fontsize=16)
        plt.ylabel(title, fontsize=14)
        plt.xlabel('Epoch', fontsize=14)
        plt.legend(loc=legend_loc)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

    plt.show()

# Calculate and print average loss and accuracy across all folds
average_loss = np.mean([result[0] for result in fold_results])
average_accuracy = np.mean([result[1] for result in fold_results])
print(f"Average Test Accuracy across 5 folds: {average_accuracy * 100:.2f}%")
print(f"Average Test Loss across 5 folds: {average_loss:.4f}")

# Save the model of the fold with the highest test accuracy.
if best_model is not None:
    best_model_path = 'D:/EDAIC-WOZ/best_model/three_level2/mfcc_standard_multiply_word2vec.h5'
    # Create the target folder first so a missing directory cannot discard
    # the result of the whole training run.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    best_model.save(best_model_path)
    # Report the file that was actually written (the message used to name the
    # min-max model).
    print(f"Best model saved as {os.path.basename(best_model_path)}")

## Word2Vec_MFCC_Minmax_Standard_Multiply

In [None]:
#%% MFCC Minmaxscaling
# Function to extract and aggregate MFCC features from audio files with MinMaxScaling
def extract_aggregate_mfcc_minmax(audio_files, max_pad_len=100, n_mfcc=128):
    """Average min-max-scaled MFCC matrices over a collection of audio files.

    Every path that exists is loaded with ``sr=None``, converted to ``n_mfcc``
    MFCCs, zero-padded or truncated to exactly ``max_pad_len`` frames and then
    scaled by a ``MinMaxScaler`` fitted on that file's own frames.  Paths that
    do not exist are silently skipped.

    Args:
        audio_files: Iterable of audio file paths.
        max_pad_len: Number of frames kept per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Array of shape ``(max_pad_len, n_mfcc)``: the element-wise mean over
        all processed files, or all zeros when none of the paths exists.
    """
    normalizer = MinMaxScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, rate = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

        # Force exactly max_pad_len columns (frames) per file.
        missing_frames = max_pad_len - coeffs.shape[1]
        if missing_frames > 0:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
        else:
            coeffs = coeffs[:, :max_pad_len]

        # Scale each coefficient across the frames of this file; the result is
        # already laid out as (frames, coefficients).
        # NOTE(review): the scaler is fitted after padding, so the zero frames
        # take part in the min/max statistics - confirm this is intended.
        per_file_features.append(normalizer.fit_transform(coeffs.T))

    if not per_file_features:
        # Fallback when no audio file was found.
        return np.zeros((max_pad_len, n_mfcc))

    return np.mean(per_file_features, axis=0)

# Build one (transcript, MFCC, label) sample per audio subfolder, scaling the
# MFCCs with the MinMaxScaler-based extractor.
# Start from empty accumulators: without this reset the samples collected by
# earlier cells would still be in the lists and would be trained on again.
all_sentences = []
all_mfcc_features = []
label_list = []

# sorted() makes the sample order - and therefore any seeded split made later -
# independent of the unspecified order in which os.listdir returns entries.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<subfolder_num>";
    # anything else (stray files, differently named folders) is skipped
    # instead of aborting the whole run.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Transcript CSV that belongs to this audio subfolder.
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # All transcript lines joined into one string.
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate wav path per whitespace-separated token of the transcript;
    # paths that do not exist are ignored by the extractor.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    audio_files = [
        os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav')
        for i in range(len(aggregated_sentence.split()))
    ]

    # Mean of the min-max-scaled MFCC matrices of those files.
    aggregated_mfcc = extract_aggregate_mfcc_minmax(audio_files)

    # Keep the three lists aligned: one entry each per accepted subfolder.
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)

#%% MFCC StandardScaling
from sklearn.preprocessing import StandardScaler

# Function to extract and aggregate MFCC features from audio files with StandardScaling
def extract_aggregate_mfcc_standard(audio_files, max_pad_len=100, n_mfcc=128):
    """Average standard-scaled MFCC matrices over a collection of audio files.

    Every path that exists is loaded with ``sr=None``, converted to ``n_mfcc``
    MFCCs, zero-padded or truncated to exactly ``max_pad_len`` frames and then
    scaled by a ``StandardScaler`` fitted on that file's own frames.  Paths
    that do not exist are silently skipped.

    Args:
        audio_files: Iterable of audio file paths.
        max_pad_len: Number of frames kept per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Array of shape ``(max_pad_len, n_mfcc)``: the element-wise mean over
        all processed files, or all zeros when none of the paths exists.
    """
    standardizer = StandardScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, rate = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

        # Force exactly max_pad_len columns (frames) per file.
        missing_frames = max_pad_len - coeffs.shape[1]
        if missing_frames > 0:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
        else:
            coeffs = coeffs[:, :max_pad_len]

        # Standardize each coefficient across the frames of this file; the
        # result is already laid out as (frames, coefficients).
        # NOTE(review): the scaler is fitted after padding, so the zero frames
        # take part in the mean/variance statistics - confirm this is intended.
        per_file_features.append(standardizer.fit_transform(coeffs.T))

    if not per_file_features:
        # Fallback when no audio file was found.
        return np.zeros((max_pad_len, n_mfcc))

    return np.mean(per_file_features, axis=0)

# Add a second, standard-scaled sample per audio subfolder.  The lists are
# deliberately NOT reset here: this section combines both scalings, so these
# samples are appended to whatever the lists already contain.
# NOTE(review): each transcript then appears once per scaling, so a random
# split can put the same transcript into both train and test.
# sorted() makes the sample order - and therefore any seeded split made later -
# independent of the unspecified order in which os.listdir returns entries.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<subfolder_num>";
    # anything else (stray files, differently named folders) is skipped
    # instead of aborting the whole run.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Transcript CSV that belongs to this audio subfolder.
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # All transcript lines joined into one string.
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate wav path per whitespace-separated token of the transcript;
    # paths that do not exist are ignored by the extractor.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    audio_files = [
        os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav')
        for i in range(len(aggregated_sentence.split()))
    ]

    # Mean of the standard-scaled MFCC matrices of those files.
    aggregated_mfcc = extract_aggregate_mfcc_standard(audio_files)

    # Keep the three lists aligned: one entry each per accepted subfolder.
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)


In [None]:
#%% Tokenize and prepare Word2Vec embedding
# Tokenize the sentences
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import KFold, train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Dropout, LayerNormalization, Multiply
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)
word_index = tokenizer.word_index

# Single source of truth for the text sequence length; used both by the
# embedding layer and by pad_sequences below.
max_length = 100  # Adjust based on your data

# Load pretrained Word2Vec model
word2vec_model = Word2Vec.load("D:/EDAIC-WOZ/word2vec_sgns_custom.model")

# Create embedding matrix using the pretrained Word2Vec model.
# One extra row is allocated (len(word_index) + 1); rows that are never filled
# below stay all-zero.
embedding_dim = word2vec_model.vector_size
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill the embedding matrix with pretrained word vectors (gensim 4.x compatibility).
# Words missing from the Word2Vec vocabulary keep their zero row.
for word, i in word_index.items():
    if word in word2vec_model.wv.key_to_index:  # Use key_to_index in gensim 4.x
        embedding_matrix[i] = word2vec_model.wv[word]

# Define Word2Vec Embedding Layer
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=True)  # Pretrained vectors are fine-tuned during training

# Convert the sentences to sequences
sequences = tokenizer.texts_to_sequences(all_sentences)

# Pad the sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Convert MFCC features to numpy array
mfcc_features = np.array(all_mfcc_features)

# Convert labels to categorical format (one-hot over the 3 PHQ classes)
categorical_labels = to_categorical(label_list, num_classes=3)

# Define Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward net: expand to ff_dim, then project back to embed_dim
        # so the result can be added to the residual stream.
        expand = Dense(ff_dim, activation="relu")
        project = Dense(embed_dim)
        self.ffn = tf.keras.Sequential([expand, project])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply self-attention and the feed-forward net to `inputs`."""
        # Self-attention: `inputs` is passed as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dict produced by get_config()."""
        return cls(**config)

# Define model using Word2Vec embedding
def create_transformer_model(text_input_shape, mfcc_input_shape):
    """Build the (uncompiled) text + MFCC fusion classifier.

    The token embeddings and a dense projection of the MFCC features are
    multiplied element-wise, passed through one TransformerBlock, averaged
    over the sequence axis and classified into 3 classes with softmax.

    Reads the module-level `vocab_size`, `embedding_dim` and
    `embedding_matrix` to initialise the embedding.

    Args:
        text_input_shape: shape tuple for the token-id input (without batch).
        mfcc_input_shape: shape tuple for the MFCC input (without batch).
            Its leading axis has to be compatible with the text length for
            the element-wise multiplication.

    Returns:
        A Keras Model taking [text_inputs, mfcc_inputs].
    """
    text_inputs = Input(shape=text_input_shape)
    # Build a fresh Embedding for every model. Sharing a single trainable
    # layer instance between the models of different cross-validation folds
    # would carry weights fine-tuned on one fold's training data (which
    # contains the next fold's test samples) into the next fold.
    text_embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               trainable=True)
    embedded_sequences = text_embedding(text_inputs)

    mfcc_inputs = Input(shape=mfcc_input_shape)
    # Project the MFCC features to the embedding width so the element-wise
    # product is well defined for any Word2Vec vector size.
    mfcc_dense = Dense(embedding_dim, activation="relu")(mfcc_inputs)

    combined = Multiply()([embedded_sequences, mfcc_dense])

    # The block's residual connections require embed_dim to equal the width
    # of `combined`.
    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
    x = transformer_block(combined)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(3, activation="softmax")(x)
    model = Model(inputs=[text_inputs, mfcc_inputs], outputs=outputs)
    return model

# KFold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=119)
fold_results = []
best_model = None
best_accuracy = 0

# Loop-invariant settings, hoisted out of the fold loop.
class_labels = [0, 1, 2]
class_names = ['Non-depressed', 'Mildly depressed', 'Severely depressed']
text_input_shape = (max_length,)
# Taken from the data rather than hard-coded, so it always matches the
# per-sample shape of the MFCC array.
mfcc_input_shape = mfcc_features.shape[1:]

for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences)):
    print(f"Training fold {fold + 1}/5...")

    X_text_train, X_text_test = padded_sequences[train_index], padded_sequences[test_index]
    X_mfcc_train, X_mfcc_test = mfcc_features[train_index], mfcc_features[test_index]
    y_train, y_test = categorical_labels[train_index], categorical_labels[test_index]

    # Carve a validation split out of the training portion of this fold.
    X_text_train, X_text_val, X_mfcc_train, X_mfcc_val, y_train, y_val = train_test_split(
        X_text_train, X_mfcc_train, y_train, test_size=0.2, random_state=119, shuffle=True
    )

    model = create_transformer_model(text_input_shape, mfcc_input_shape)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    # NOTE(review): the same checkpoint filename pattern is used by the other
    # training loop in this file, so running both overwrites these files.
    checkpoint_path = f'depression_diagnosis_model_fold_{fold+1}.keras'
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    history = model.fit([X_text_train, X_mfcc_train], y_train, epochs=50, batch_size=16,
                        validation_data=([X_text_val, X_mfcc_val], y_val),
                        callbacks=[early_stopping, model_checkpoint])

    # Reload the checkpoint with the lowest validation loss.
    model = tf.keras.models.load_model(checkpoint_path, custom_objects={'TransformerBlock': TransformerBlock})

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate([X_text_test, X_mfcc_test], y_test)
    print(f'Fold {fold + 1} Test Accuracy: {accuracy * 100:.2f}%')

    # Collect the fold results
    fold_results.append((loss, accuracy))

    # NOTE(review): the "best" model is selected by test-fold accuracy.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

    # Predict on the test set
    y_pred = model.predict([X_text_test, X_mfcc_test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Compute F1 score, precision, recall
    f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')
    precision = precision_score(y_true_classes, y_pred_classes, average='weighted')
    recall = recall_score(y_true_classes, y_pred_classes, average='weighted')
    # Kept in its own name so the value stored in fold_results is not shadowed.
    manual_accuracy = np.mean(y_true_classes == y_pred_classes)

    print(f'Fold {fold + 1} F1 Score: {f1:.2f}')
    print(f'Fold {fold + 1} Precision: {precision:.2f}')
    print(f'Fold {fold + 1} Recall: {recall:.2f}')
    print(f'Fold {fold + 1} Accuracy: {manual_accuracy * 100:.2f}%')

    # Confusion matrix for the current fold. Passing `labels` keeps it 3x3
    # (matching the tick labels) even when a class is absent from this fold.
    conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_labels)

    # Normalize by row (true classes). Rows without any true sample are
    # divided by 1 instead of 0, so they show 0% rather than NaN.
    row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
    conf_matrix_normalized = conf_matrix.astype('float') / np.maximum(row_totals, 1)

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                annot_kws={'size': 16})  # Font size for annotations
    plt.xlabel('Predicted', fontsize=14)
    plt.ylabel('True', fontsize=14)
    plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

    # Plot training & validation accuracy and loss values
    plt.figure(figsize=(12, 4))

    # Plot Accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Fold {fold + 1} - Model Accuracy', fontsize=16)
    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.legend(loc='upper left')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Plot Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Fold {fold + 1} - Model Loss', fontsize=16)
    plt.ylabel('Loss', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.legend(loc='upper right')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    plt.show()

# Calculate and print average loss and accuracy across all folds.
# fold_results holds one (loss, accuracy) tuple per fold.
average_loss = np.mean([result[0] for result in fold_results])
average_accuracy = np.mean([result[1] for result in fold_results])
print(f"Average Test Accuracy across 5 folds: {average_accuracy * 100:.2f}%")
print(f"Average Test Loss across 5 folds: {average_loss:.4f}")

# Save the model
if best_model is not None:
    best_model_path = 'D:/EDAIC-WOZ/best_model/three_level2/mfcc_minmax_multiply_word2vec.h5'
    # Create the target directory first so a missing folder does not abort
    # the save after all folds have been trained.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    best_model.save(best_model_path)
    print("Best model saved as mfcc_minmax_multiply_word2vec.h5")

## Word2vec_MFCC_Standard_Minmax_Multiply

In [None]:
#%% MFCC StandardScaling
from sklearn.preprocessing import StandardScaler

# Function to extract and aggregate MFCC features from audio files with StandardScaling
def extract_aggregate_mfcc_standard(audio_files, max_pad_len=100, n_mfcc=128):
    """Average standard-scaled MFCC matrices over a list of audio files.

    Each existing file is loaded at its native sample rate, converted to
    MFCCs, zero-padded or truncated to `max_pad_len` frames and then scaled
    with a StandardScaler fitted on that file alone. Paths that do not exist
    are skipped.

    Args:
        audio_files: iterable of audio file paths.
        max_pad_len: number of frames kept per file.
        n_mfcc: number of MFCC coefficients per frame.

    Returns:
        Array of shape (max_pad_len, n_mfcc): the element-wise mean over all
        files found, or all zeros when none of the paths exist.
    """
    scaler = StandardScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, sr = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

        # Bring every file to exactly max_pad_len frames.
        n_frames = coeffs.shape[1]
        if n_frames >= max_pad_len:
            coeffs = coeffs[:, :max_pad_len]
        else:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, max_pad_len - n_frames)), mode='constant')

        # Apply Standard Scaling on the (frames, coefficients) layout,
        # which is also the layout that gets stored.
        per_file_features.append(scaler.fit_transform(coeffs.T))

    if not per_file_features:
        return np.zeros((max_pad_len, n_mfcc))  # Fallback if no audio found

    # Aggregate MFCC features by taking the mean across files
    return np.mean(per_file_features, axis=0)

# Build the dataset using the StandardScaling version of the MFCC extractor.
# NOTE(review): this loop appends to all_sentences / all_mfcc_features /
# label_list without clearing them first, so samples collected by any earlier
# loading loop stay in the lists - confirm that accumulation is intended.
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the fold assignment non-reproducible.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<number>"; anything
    # else (stray files, other folders) is skipped instead of raising.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Determine the corresponding CSV file for this subfolder
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # Load and aggregate sentences from the CSV
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate audio file per whitespace-separated token of the text.
    # NOTE(review): the audio filename has no separator between the
    # participant id and the subfolder number, unlike the CSV name - verify
    # against the files on disk.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    token_count = len(aggregated_sentence.split())
    audio_files = [os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav') for i in range(token_count)]

    # The extractor silently returns an all-zero matrix when none of the
    # paths exist; report that case so it does not go unnoticed.
    if not any(os.path.exists(audio_path) for audio_path in audio_files):
        print(f'No audio files found in {subfolder_path}. MFCC features will be all zeros.')

    # Aggregate MFCC features from the audio files using StandardScaling
    aggregated_mfcc = extract_aggregate_mfcc_standard(audio_files)

    # Append the aggregated sentence and MFCC to the lists
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)

#%% MFCC Minmaxscaling
# Function to extract and aggregate MFCC features from audio files with MinMaxScaling
def extract_aggregate_mfcc_minmax(audio_files, max_pad_len=100, n_mfcc=128):
    """Average min-max-scaled MFCC matrices over a list of audio files.

    Each existing file is loaded at its native sample rate, converted to
    MFCCs, zero-padded or truncated to `max_pad_len` frames and then scaled
    with a MinMaxScaler fitted on that file alone. Paths that do not exist
    are skipped.

    Args:
        audio_files: iterable of audio file paths.
        max_pad_len: number of frames kept per file.
        n_mfcc: number of MFCC coefficients per frame.

    Returns:
        Array of shape (max_pad_len, n_mfcc): the element-wise mean over all
        files found, or all zeros when none of the paths exist.
    """
    scaler = MinMaxScaler()
    per_file_features = []

    for wav_path in audio_files:
        if not os.path.exists(wav_path):
            continue

        signal, sr = librosa.load(wav_path, sr=None)
        coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

        # Bring every file to exactly max_pad_len frames.
        n_frames = coeffs.shape[1]
        if n_frames >= max_pad_len:
            coeffs = coeffs[:, :max_pad_len]
        else:
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, max_pad_len - n_frames)), mode='constant')

        # Apply MinMax Scaling on the (frames, coefficients) layout,
        # which is also the layout that gets stored.
        per_file_features.append(scaler.fit_transform(coeffs.T))

    if not per_file_features:
        return np.zeros((max_pad_len, n_mfcc))  # Fallback if no audio found

    # Aggregate MFCC features by taking the mean across files
    return np.mean(per_file_features, axis=0)

# Build the dataset using the MinMaxScaling version of the MFCC extractor.
# NOTE(review): this loop appends to all_sentences / all_mfcc_features /
# label_list without clearing them first, so samples collected by any earlier
# loading loop stay in the lists - confirm that accumulation is intended.
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the fold assignment non-reproducible.
for subfolder in sorted(os.listdir(source_root_audio)):
    # Entries are expected to be named "<participant_id>_<number>"; anything
    # else (stray files, other folders) is skipped instead of raising.
    try:
        participant_id, subfolder_num = subfolder.split('_')
        participant_key = int(participant_id)
    except ValueError:
        print(f'Unexpected entry {subfolder} in {source_root_audio}. Skipping.')
        continue

    # Determine the corresponding CSV file for this subfolder
    csv_file = f'{participant_id}_{subfolder_num}_processed.csv'
    text_file_path = os.path.join(source_root_text, csv_file)

    if not os.path.exists(text_file_path):
        print(f'Text file {text_file_path} not found for audio folder {subfolder}. Skipping.')
        continue

    # Load and aggregate sentences from the CSV
    aggregated_sentence = load_and_preprocess_csv(text_file_path)

    label_row = labels_df[labels_df['Participant_ID'] == participant_key]

    if label_row.empty:
        print(f"Label not found for Participant ID: {participant_id}. Skipping.")
        continue

    phq_three = label_row['PHQ_Three'].values[0]

    # One candidate audio file per whitespace-separated token of the text.
    # NOTE(review): the audio filename has no separator between the
    # participant id and the subfolder number, unlike the CSV name - verify
    # against the files on disk.
    subfolder_path = os.path.join(source_root_audio, subfolder)
    token_count = len(aggregated_sentence.split())
    audio_files = [os.path.join(subfolder_path, f'{participant_id}{subfolder_num}_{i}.wav') for i in range(token_count)]

    # The extractor silently returns an all-zero matrix when none of the
    # paths exist; report that case so it does not go unnoticed.
    if not any(os.path.exists(audio_path) for audio_path in audio_files):
        print(f'No audio files found in {subfolder_path}. MFCC features will be all zeros.')

    # Aggregate MFCC features from the audio files using MinMaxScaling
    aggregated_mfcc = extract_aggregate_mfcc_minmax(audio_files)

    # Append the aggregated sentence and MFCC to the lists
    all_sentences.append(aggregated_sentence)
    all_mfcc_features.append(aggregated_mfcc)
    label_list.append(phq_three)


In [None]:
#%% Tokenize and prepare Word2Vec embedding
# Tokenize the sentences
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import KFold, train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Dropout, LayerNormalization, Multiply
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)
word_index = tokenizer.word_index

# Single source of truth for the text sequence length; used both by the
# embedding layer and by pad_sequences below.
max_length = 100  # Adjust based on your data

# Load pretrained Word2Vec model
word2vec_model = Word2Vec.load("D:/EDAIC-WOZ/word2vec_sgns_custom.model")

# Create embedding matrix using the pretrained Word2Vec model.
# One extra row is allocated (len(word_index) + 1); rows that are never filled
# below stay all-zero.
embedding_dim = word2vec_model.vector_size
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill the embedding matrix with pretrained word vectors (gensim 4.x compatibility).
# Words missing from the Word2Vec vocabulary keep their zero row.
for word, i in word_index.items():
    if word in word2vec_model.wv.key_to_index:  # Use key_to_index in gensim 4.x
        embedding_matrix[i] = word2vec_model.wv[word]

# Define Word2Vec Embedding Layer
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=True)  # Pretrained vectors are fine-tuned during training

# Convert the sentences to sequences
sequences = tokenizer.texts_to_sequences(all_sentences)

# Pad the sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Convert MFCC features to numpy array
mfcc_features = np.array(all_mfcc_features)

# Convert labels to categorical format (one-hot over the 3 PHQ classes)
categorical_labels = to_categorical(label_list, num_classes=3)

# Define Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward net: expand to ff_dim, then project back to embed_dim
        # so the result can be added to the residual stream.
        expand = Dense(ff_dim, activation="relu")
        project = Dense(embed_dim)
        self.ffn = tf.keras.Sequential([expand, project])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply self-attention and the feed-forward net to `inputs`."""
        # Self-attention: `inputs` is passed as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dict produced by get_config()."""
        return cls(**config)

# Define model using Word2Vec embedding
def create_transformer_model(text_input_shape, mfcc_input_shape):
    """Build the (uncompiled) text + MFCC fusion classifier.

    The token embeddings and a dense projection of the MFCC features are
    multiplied element-wise, passed through one TransformerBlock, averaged
    over the sequence axis and classified into 3 classes with softmax.

    Reads the module-level `vocab_size`, `embedding_dim` and
    `embedding_matrix` to initialise the embedding.

    Args:
        text_input_shape: shape tuple for the token-id input (without batch).
        mfcc_input_shape: shape tuple for the MFCC input (without batch).
            Its leading axis has to be compatible with the text length for
            the element-wise multiplication.

    Returns:
        A Keras Model taking [text_inputs, mfcc_inputs].
    """
    text_inputs = Input(shape=text_input_shape)
    # Build a fresh Embedding for every model. Sharing a single trainable
    # layer instance between the models of different cross-validation folds
    # would carry weights fine-tuned on one fold's training data (which
    # contains the next fold's test samples) into the next fold.
    text_embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               trainable=True)
    embedded_sequences = text_embedding(text_inputs)

    mfcc_inputs = Input(shape=mfcc_input_shape)
    # Project the MFCC features to the embedding width so the element-wise
    # product is well defined for any Word2Vec vector size.
    mfcc_dense = Dense(embedding_dim, activation="relu")(mfcc_inputs)

    combined = Multiply()([embedded_sequences, mfcc_dense])

    # The block's residual connections require embed_dim to equal the width
    # of `combined`.
    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=4, ff_dim=128)
    x = transformer_block(combined)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(3, activation="softmax")(x)
    model = Model(inputs=[text_inputs, mfcc_inputs], outputs=outputs)
    return model

# KFold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=119)
fold_results = []
best_model = None
best_accuracy = 0

# Loop-invariant settings, hoisted out of the fold loop.
class_labels = [0, 1, 2]
class_names = ['Non-depressed', 'Mildly depressed', 'Severely depressed']
text_input_shape = (max_length,)
# Taken from the data rather than hard-coded, so it always matches the
# per-sample shape of the MFCC array.
mfcc_input_shape = mfcc_features.shape[1:]

for fold, (train_index, test_index) in enumerate(kf.split(padded_sequences)):
    print(f"Training fold {fold + 1}/5...")

    X_text_train, X_text_test = padded_sequences[train_index], padded_sequences[test_index]
    X_mfcc_train, X_mfcc_test = mfcc_features[train_index], mfcc_features[test_index]
    y_train, y_test = categorical_labels[train_index], categorical_labels[test_index]

    # Carve a validation split out of the training portion of this fold.
    X_text_train, X_text_val, X_mfcc_train, X_mfcc_val, y_train, y_val = train_test_split(
        X_text_train, X_mfcc_train, y_train, test_size=0.2, random_state=119, shuffle=True
    )

    model = create_transformer_model(text_input_shape, mfcc_input_shape)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    # NOTE(review): the same checkpoint filename pattern is used by the other
    # training loop in this file, so running both overwrites these files.
    checkpoint_path = f'depression_diagnosis_model_fold_{fold+1}.keras'
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    history = model.fit([X_text_train, X_mfcc_train], y_train, epochs=50, batch_size=16,
                        validation_data=([X_text_val, X_mfcc_val], y_val),
                        callbacks=[early_stopping, model_checkpoint])

    # Reload the checkpoint with the lowest validation loss.
    model = tf.keras.models.load_model(checkpoint_path, custom_objects={'TransformerBlock': TransformerBlock})

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate([X_text_test, X_mfcc_test], y_test)
    print(f'Fold {fold + 1} Test Accuracy: {accuracy * 100:.2f}%')

    # Collect the fold results
    fold_results.append((loss, accuracy))

    # NOTE(review): the "best" model is selected by test-fold accuracy.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

    # Predict on the test set
    y_pred = model.predict([X_text_test, X_mfcc_test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Compute F1 score, precision, recall
    f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')
    precision = precision_score(y_true_classes, y_pred_classes, average='weighted')
    recall = recall_score(y_true_classes, y_pred_classes, average='weighted')
    # Kept in its own name so the value stored in fold_results is not shadowed.
    manual_accuracy = np.mean(y_true_classes == y_pred_classes)

    print(f'Fold {fold + 1} F1 Score: {f1:.2f}')
    print(f'Fold {fold + 1} Precision: {precision:.2f}')
    print(f'Fold {fold + 1} Recall: {recall:.2f}')
    print(f'Fold {fold + 1} Accuracy: {manual_accuracy * 100:.2f}%')

    # Confusion matrix for the current fold. Passing `labels` keeps it 3x3
    # (matching the tick labels) even when a class is absent from this fold.
    conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_labels)

    # Normalize by row (true classes). Rows without any true sample are
    # divided by 1 instead of 0, so they show 0% rather than NaN.
    row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
    conf_matrix_normalized = conf_matrix.astype('float') / np.maximum(row_totals, 1)

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                annot_kws={'size': 16})  # Font size for annotations
    plt.xlabel('Predicted', fontsize=14)
    plt.ylabel('True', fontsize=14)
    plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

    # Plot training & validation accuracy and loss values
    plt.figure(figsize=(12, 4))

    # Plot Accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Fold {fold + 1} - Model Accuracy', fontsize=16)
    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.legend(loc='upper left')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Plot Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Fold {fold + 1} - Model Loss', fontsize=16)
    plt.ylabel('Loss', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.legend(loc='upper right')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    plt.show()

# Calculate and print average loss and accuracy across all folds.
# fold_results holds one (loss, accuracy) tuple per fold.
average_loss = np.mean([result[0] for result in fold_results])
average_accuracy = np.mean([result[1] for result in fold_results])
print(f"Average Test Accuracy across 5 folds: {average_accuracy * 100:.2f}%")
print(f"Average Test Loss across 5 folds: {average_loss:.4f}")

# Save the model
# NOTE(review): this is the same output path the MFCC-MinMax section of this
# file saves to, so this save overwrites that model - confirm whether this
# section should use its own filename.
if best_model is not None:
    best_model_path = 'D:/EDAIC-WOZ/best_model/three_level2/mfcc_minmax_multiply_word2vec.h5'
    # Create the target directory first so a missing folder does not abort
    # the save after all folds have been trained.
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    best_model.save(best_model_path)
    print("Best model saved as mfcc_minmax_multiply_word2vec.h5")

## Word2Vec_deltamfcc_Minmax_Multiply