<a href="https://colab.research.google.com/github/kaustavd7/CGS616_Assn3/blob/main/Modelling_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import f1_score
from tqdm import tqdm

In [40]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [89]:
class SpeechAnxietyDataset(Dataset):
    def __init__(self, label_csv, feature_dir, max_seq_len=50):
        self.data = pd.read_csv(label_csv)
        self.feature_dir = feature_dir
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.data)

    def pad_or_truncate(self, arr, max_len):
        if len(arr) > max_len:
            return arr[:max_len]
        elif len(arr) < max_len:
            pad = np.zeros((max_len - len(arr), arr.shape[1]))
            return np.vstack([arr, pad])
        return arr

    def load_clean_csv(self, path):
          df = pd.read_csv(path, delimiter=';')
          df = df.drop(columns=[df.columns[0]], axis = 1, errors='ignore')

          df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop all Unnamed columns
          return df.values.astype(np.float32)


    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        pid = str(row["Participant"])
        folder = os.path.join(self.feature_dir, f"{pid}_features")

        mfcc = self.load_clean_csv(os.path.join(folder, f"{pid}_OpenSMILE2.3.0_mfcc.csv"))
        # mfcc = mfcc.drop(columns = [mfcc.columns[0]]).values.astype(np.float32)

        egemaps = self.load_clean_csv(os.path.join(folder, f"{pid}_OpenSMILE2.3.0_egemaps.csv"))
        # egemaps = egemaps.drop(columns = [egemaps.columns[0]]).values.astype(np.float32)

        mfcc = self.pad_or_truncate(mfcc, self.max_seq_len)
        egemaps = self.pad_or_truncate(egemaps, self.max_seq_len)
        return {
            "mfcc": torch.tensor(mfcc),
            "egemaps": torch.tensor(egemaps),
            "anxiety": torch.tensor(row["Anxiety_severity"], dtype=torch.float32),
            "ptsd": torch.tensor(row["PTSD_label"], dtype=torch.float32)
        }

In [72]:
class TemporalModalityEncoder(nn.Module):
    def __init__(self, input_dim, lstm_hidden=64, latent_dim=64, num_layers=1, dropout=0.3):
        super(TemporalModalityEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)
        self.project = nn.Sequential(
            nn.Linear(lstm_hidden * 2, latent_dim),
            nn.ReLU()
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        hn = hn.transpose(0, 1).reshape(x.size(0), -1)
        return self.project(hn)

class FusionModule(nn.Module):
    def __init__(self, input_dim, fusion_dim=128, dropout=0.3):
        super(FusionModule, self).__init__()
        self.fusion = nn.Sequential(
            nn.Linear(input_dim, fusion_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.fusion(x)

class AnxietyPTSDModel(nn.Module):
    def __init__(self):
        super(AnxietyPTSDModel, self).__init__()
        self.mfcc_encoder = TemporalModalityEncoder(input_dim=39)
        self.egemap_encoder = TemporalModalityEncoder(input_dim=88)

        self.fusion = FusionModule(input_dim=64 * 2)
        self.anxiety_head = nn.Linear(128, 1)
        self.ptsd_head = nn.Linear(128, 1)

    def forward(self, mfcc, egemap):
        z1 = self.mfcc_encoder(mfcc)
        z2 = self.egemap_encoder(egemap)

        fused = self.fusion(torch.cat([z1, z2], dim=-1))
        anxiety_pred = self.anxiety_head(fused)
        ptsd_pred = torch.sigmoid(self.ptsd_head(fused))

        return anxiety_pred.squeeze(), ptsd_pred.squeeze()

def multitask_loss(anxiety_pred, anxiety_true, ptsd_pred, ptsd_true, alpha=0.5):
    loss_reg = F.mse_loss(anxiety_pred, anxiety_true)
    loss_cls = F.binary_cross_entropy(ptsd_pred, ptsd_true)
    return loss_reg + alpha * loss_cls

In [86]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        mfcc = batch["mfcc"].to(device)
        egemaps = batch["egemaps"].to(device)
        anxiety = batch["anxiety"].to(device)
        ptsd = batch["ptsd"].to(device)

        optimizer.zero_grad()
        anx_pred, ptsd_pred = model(mfcc, egemaps)
        loss = multitask_loss(anx_pred, anxiety, ptsd_pred, ptsd)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            mfcc = batch["mfcc"].to(device)
            egemaps = batch["egemaps"].to(device)
            anxiety = batch["anxiety"].to(device)
            ptsd = batch["ptsd"].to(device)

            anx_pred, ptsd_pred = model(mfcc, egemaps)
            loss = multitask_loss(anx_pred, anxiety, ptsd_pred, ptsd)
            total_loss += loss.item()
            all_preds.extend(ptsd_pred.round().cpu().tolist())
            all_labels.extend(ptsd.cpu().tolist())
    f1 = f1_score(all_labels, all_preds)
    return total_loss / len(dataloader), f1


In [93]:
path = '/content/drive/MyDrive/HCC_Project'
data_path = os.path.join(path, 'speech_features')
label_path = os.path.join(path, 'labels/Final_detailed_labels_train.csv')


dataset = SpeechAnxietyDataset(label_path,data_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False)

model = AnxietyPTSDModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(1, 21):
    train_loss = train(model, train_loader, optimizer, device)
    val_loss, val_f1 = evaluate(model, val_loader, device)
    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | PTSD F1: {val_f1:.4f}")


  0%|          | 0/7 [00:30<?, ?it/s]


RuntimeError: input.size(-1) must be equal to input_size. Expected 39, got 40

In [91]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [106]:
MAX_SEQ_LEN = 50
MFCC_DIM = 40
EGEMAPS_DIM = 24
LABEL_CSV = "Final_detailed_labels_train.csv"
FEATURE_DIR = "speech_features"
path = '/content/drive/MyDrive/HCC_Project'
data_path = os.path.join(path, 'speech_features')
label_path = os.path.join(path, 'labels/Final_detailed_labels_train.csv')

In [99]:
def load_and_pad_features(pid):
    folder = os.path.join(data_path, f"{pid}_features")

    def load_csv_clean(path, expected_dim):
        df = pd.read_csv(path, delimiter=';')
        df.drop([df.columns[0]], axis = 1, inplace = True)
        df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
        arr = df.values.astype(np.float32)
        if arr.shape[0] > MAX_SEQ_LEN:
            arr = arr[:MAX_SEQ_LEN]
        elif arr.shape[0] < MAX_SEQ_LEN:
            pad = np.zeros((MAX_SEQ_LEN - arr.shape[0], expected_dim), dtype=np.float32)
            arr = np.vstack([arr, pad])
        return arr

    mfcc = load_csv_clean(os.path.join(folder, f"{pid}_OpenSMILE2.3.0_mfcc.csv"), MFCC_DIM)
    egemaps = load_csv_clean(os.path.join(folder, f"{pid}_OpenSMILE2.3.0_egemaps.csv"), EGEMAPS_DIM)
    return mfcc, egemaps


In [96]:
def create_tf_dataset(label_csv, batch_size=16):
    df = pd.read_csv(label_csv)
    pids = df["Participant"].tolist()
    anxiety = df["Anxiety_severity"].tolist()
    ptsd = df["PTSD_label"].tolist()

    mfcc_data, egemaps_data = [], []
    for pid in pids:
        mfcc, egemaps = load_and_pad_features(pid)
        mfcc_data.append(mfcc)
        egemaps_data.append(egemaps)

    X_mfcc = np.stack(mfcc_data)
    X_egemaps = np.stack(egemaps_data)
    y_anx = np.array(anxiety, dtype=np.float32)
    y_ptsd = np.array(ptsd, dtype=np.float32)

    dataset = tf.data.Dataset.from_tensor_slices(((X_mfcc, X_egemaps), (y_anx, y_ptsd)))
    return dataset.shuffle(256).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [107]:
def build_model():
    mfcc_input = tf.keras.Input(shape=(MAX_SEQ_LEN, MFCC_DIM))
    egemaps_input = tf.keras.Input(shape=(MAX_SEQ_LEN, EGEMAPS_DIM))

    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False))(mfcc_input)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False))(egemaps_input)

    fused = tf.keras.layers.Concatenate()([x1, x2])
    fused = tf.keras.layers.Dense(128, activation='relu')(fused)
    fused = tf.keras.layers.Dropout(0.3)(fused)

    out_anxiety = tf.keras.layers.Dense(1, name="anxiety_output")(fused)
    out_ptsd = tf.keras.layers.Dense(1, activation='sigmoid', name="ptsd_output")(fused)

    model = tf.keras.Model(inputs=[mfcc_input, egemaps_input], outputs=[out_anxiety, out_ptsd])
    return model


In [103]:
full_ds = create_tf_dataset(label_path)
train_ds = full_ds.take(int(0.8 * len(full_ds)))
val_ds = full_ds.skip(int(0.8 * len(full_ds)))

In [109]:
model = build_model()
model.compile(
    optimizer='adam',
    loss={"anxiety_output": "mse", "ptsd_output": "binary_crossentropy"},
    loss_weights={"anxiety_output": 1.0, "ptsd_output": 0.5},
    metrics={"ptsd_output": ["accuracy"]}
)

model.fit(train_ds, validation_data=val_ds, epochs=100)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 153ms/step - anxiety_output_loss: 200.9741 - loss: 201.3239 - ptsd_output_accuracy: 0.4698 - ptsd_output_loss: 0.6997 - val_anxiety_output_loss: 98.8189 - val_loss: 101.2052 - val_ptsd_output_accuracy: 0.7407 - val_ptsd_output_loss: 0.5650
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - anxiety_output_loss: 124.5998 - loss: 124.9013 - ptsd_output_accuracy: 0.7330 - ptsd_output_loss: 0.6030 - val_anxiety_output_loss: 37.3144 - val_loss: 41.3663 - val_ptsd_output_accuracy: 0.7778 - val_ptsd_output_loss: 0.5648
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - anxiety_output_loss: 56.1964 - loss: 56.5541 - ptsd_output_accuracy: 0.6292 - ptsd_output_loss: 0.7155 - val_anxiety_output_loss: 39.7716 - val_loss: 41.2122 - val_ptsd_output_accuracy: 0.6667 - val_ptsd_output_loss: 0.6620
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7eb5be6a5190>

In [112]:
model.save('/content/drive/MyDrive/HCC_Project/speechmodel.keras')