## Project Directory and File Structure Inspection
Displays the current working directory contents to verify available files, datasets, and paths before loading data or running experiments, helping prevent file-not-found and path-related errors.

In [101]:
# List the notebook's working directory so the expected data files can be checked by eye.
# NOTE(review): `!dir` shells out to the operating system; the listing format (and possibly
# the command's availability) depends on the platform the kernel runs on.
!dir

 Volume in drive D is Data
 Volume Serial Number is 6A5C-4620

 Directory of D:\portfolio\IU\Thesis_Final\Coding

01/06/2026  02:26 PM    <DIR>          .
01/05/2026  03:28 PM    <DIR>          ..
01/05/2026  10:00 PM    <DIR>          .ipynb_checkpoints
01/05/2026  03:28 PM        29,200,044 492_AUDIO.wav
01/05/2026  03:28 PM         3,171,528 492_CLNF_AUs.txt
01/05/2026  03:28 PM        33,849,177 492_CLNF_features.txt
01/05/2026  03:28 PM        50,404,127 492_CLNF_features3D.txt
01/05/2026  03:28 PM         4,252,001 492_CLNF_gaze.txt
01/05/2026  03:28 PM       489,246,000 492_CLNF_hog.bin
01/05/2026  03:28 PM         2,375,549 492_CLNF_pose.txt
01/05/2026  03:28 PM        51,383,596 492_COVAREP.csv
01/05/2026  03:28 PM         3,017,567 492_FORMANT.csv
01/05/2026  03:28 PM            17,295 492_TRANSCRIPT.csv
01/06/2026  12:12 AM           228,413 DAIC_WOZ Dataset.ipynb
01/05/2026  03:42 PM    <DIR>          DAIC_WOZ_subset
01/06/2026  01:07 PM           183,954 master_dataset_.cs

### Library Imports, Global Configuration, and Reproducibility Setup
Imports all required libraries for file handling, data processing, and feature engineering. A fixed random seed is set so that stochastic steps give reproducible results, and warnings are suppressed to keep the cell output clean.

In [102]:
import os
import re
import json
import zipfile
from pathlib import Path
import warnings


import numpy as np
import pandas as pd

from sklearn.decomposition import PCA

# Seed value used to initialise NumPy's global random generator below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# NOTE(review): this silences every warning category for the rest of the session,
# including any pandas diagnostics — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Dataset Paths and Directory Configuration
Defines the base project directory and centralizes file paths for extracted participant data and official train/test split files, ensuring consistent, organized, and portable access to all dataset resources throughout the notebook.

In [103]:
from pathlib import Path

# Root folder that contains the DAIC_WOZ_subset directory (machine-specific absolute path).
BASE_PATH = Path("E:/IU")

# Folder holding one sub-directory per participant.
EXTRACT_DIR = BASE_PATH / "DAIC_WOZ_subset" / "extracted"

# Split file for the training partition.
TRAIN_SPLIT_PATH = BASE_PATH / "DAIC_WOZ_subset" / "train_split_Depression_AVEC2017.csv"

# Split file for the held-out partition. It must differ from TRAIN_SPLIT_PATH,
# otherwise any "test" evaluation silently runs on the training participants.
# NOTE(review): confirm this file name exists in your copy of the subset.
TEST_PATH = BASE_PATH / "DAIC_WOZ_subset" / "test_split_Depression_AVEC2017.csv"

### Multimodal Feature Extraction from Raw Behavioral Signals
Defines a complete set of preprocessing functions that convert raw DAIC-WOZ modality files (transcripts, facial action units, eye gaze, audio, head pose, facial geometry, and HOG descriptors) into clean, aggregated numerical features. Each function summarizes frame-level signals into stable statistics (means/stds or text) to create participant-level representations suitable for machine learning and multimodal fusion.

In [104]:
## 1. The Words: Linguistic Clues
def process_words(transcript_path):
    """Return everything the participant said as one space-joined string.

    Parameters
    ----------
    transcript_path : str or path-like
        Tab-separated transcript with at least ``speaker`` and ``value`` columns.

    Returns
    -------
    str
        The ``value`` entries of every row whose ``speaker`` equals
        ``'Participant'``, in file order, joined with single spaces.
        Missing cells are treated as empty strings.
    """
    transcript = pd.read_csv(transcript_path, sep='\t').fillna("")
    spoken_by_participant = transcript['speaker'] == 'Participant'
    utterances = transcript.loc[spoken_by_participant, 'value'].tolist()
    return " ".join(utterances)

## 2. The Face: (Expression Clues)
def process_face_aus(au_path):
    """Average every action-unit intensity column over all frames.

    Parameters
    ----------
    au_path : str or path-like
        Comma-separated file; header names may carry surrounding whitespace.

    Returns
    -------
    dict
        Maps each column whose stripped name ends in ``_r`` to its mean value.
    """
    au_frames = pd.read_csv(au_path, sep=',', engine='python')
    # Header fields can be padded with spaces; normalise before matching names.
    au_frames = au_frames.rename(columns=str.strip)
    intensity_columns = au_frames.filter(regex='_r$')
    return intensity_columns.mean().to_dict()

## 3. The Eyes (Behavioral Clues)
def process_eyes_carefully(gaze_path):
    """Summarise a gaze-tracking file into two participant-level numbers.

    Parameters
    ----------
    gaze_path : str or path-like
        Delimited text file whose header (after whitespace stripping) contains
        at least ``success``, ``y_0`` and ``y_1``.

    Returns
    -------
    dict
        ``avg_gaze_downward`` -- average of the mean ``y_0`` and mean ``y_1``
        over frames where ``success == 1``.
        ``eye_contact_rate`` -- fraction of all frames with ``success == 1``.
        This is a tracking-success rate; the key name is kept so downstream
        columns do not change.
        Both are ``0.0`` when no frame was tracked successfully.
    """
    # sep=None asks pandas to sniff the delimiter, which only the python engine
    # supports. Naming the engine explicitly avoids the ParserWarning emitted
    # when pandas has to fall back from the default C engine.
    df_gaze = pd.read_csv(gaze_path, sep=None, engine="python")
    df_gaze.columns = df_gaze.columns.str.strip()

    # Only frames in which the tracker reported success contribute to the averages.
    success_df = df_gaze[df_gaze['success'] == 1]

    if success_df.empty:
        return {"avg_gaze_downward": 0.0, "eye_contact_rate": 0.0}

    # Mean vertical component across both eyes (columns suffixed 0 and 1).
    avg_y_gaze = (success_df['y_0'].mean() + success_df['y_1'].mean()) / 2
    tracking_rate = len(success_df) / len(df_gaze)

    return {
        # Sign convention per the original author's note: negative = looking up,
        # positive = looking down (TODO confirm against the tracker's documentation).
        "avg_gaze_downward": avg_y_gaze,
        "eye_contact_rate": tracking_rate,
    }

## 4. Load the covarep Audio
def process_audio_covarep(covarep_path):
    """Compute per-column mean and standard deviation of a COVAREP matrix.

    Parameters
    ----------
    covarep_path : str or path-like
        Header-less comma-separated file, one row per frame.

    Returns
    -------
    dict
        ``covarep_<i>_mean`` / ``covarep_<i>_std`` for every column ``i``,
        computed over rows that contain at least one non-zero value.
        If no such row exists, 74 zero-valued mean/std pairs are returned.
    """
    frames = pd.read_csv(covarep_path, header=None, sep=",", engine="python")

    # Discard rows that are zero in every column.
    has_any_nonzero = (frames != 0).any(axis=1)
    active = frames[has_any_nonzero]

    features = {}

    if not active.empty:
        for i in active.columns:
            column = active[i]
            features[f"covarep_{i}_mean"] = column.mean()
            features[f"covarep_{i}_std"] = column.std()
        return features

    # Nothing usable: emit the fixed-width all-zero schema.
    for i in range(74):
        features[f"covarep_{i}_mean"] = 0.0
        features[f"covarep_{i}_std"] = 0.0
    return features

# ------------------------------------------------------------
# AUDIO – FORMANT
# ------------------------------------------------------------
def process_formant(path):
    """Compute mean and standard deviation of the first five formant columns.

    Parameters
    ----------
    path : str or path-like
        Header-less comma-separated file; columns 0-4 are read as F1-F5.

    Returns
    -------
    dict
        ``formant_F<k>_mean`` / ``formant_F<k>_std`` for k = 1..5, computed
        over rows that are fully numeric. All values are ``0.0`` when no such
        row remains.
    """
    formant_names = ["F1", "F2", "F3", "F4", "F5"]

    raw = pd.read_csv(path, header=None, sep=",", engine="python")

    # Non-numeric cells become NaN, and any row containing a NaN is discarded.
    numeric = raw.apply(pd.to_numeric, errors="coerce").dropna()

    if numeric.empty:
        return {
            f"formant_{fname}_{stat}": 0.0
            for fname in formant_names
            for stat in ("mean", "std")
        }

    features = {}
    for position, fname in enumerate(formant_names):
        column = numeric[position]
        features[f"formant_{fname}_mean"] = column.mean()
        features[f"formant_{fname}_std"] = column.std()
    return features


# ------------------------------------------------------------
# HEAD POSE
# ------------------------------------------------------------
def process_pose(path):
    """Summarise head-pose translation and rotation over successfully tracked frames.

    Parameters
    ----------
    path : str or path-like
        Comma-separated file whose header (after whitespace stripping) contains
        ``success`` and the six pose columns ``Tx, Ty, Tz, Rx, Ry, Rz``.

    Returns
    -------
    dict
        ``pose_<col>_mean`` for all six pose columns, followed by
        ``pose_<col>_std`` for the three rotation columns. All values are
        ``0.0`` when no usable frame remains.
    """
    pose_cols = ["Tx", "Ty", "Tz", "Rx", "Ry", "Rz"]
    rotation_cols = ["Rx", "Ry", "Rz"]

    df = pd.read_csv(path, sep=",")
    df.columns = df.columns.str.strip()

    # Select successful frames and coerce in one expression. This yields a new
    # frame instead of assigning into a filtered slice (the pattern behind
    # SettingWithCopyWarning).
    pose = (
        df.loc[df["success"] == 1, pose_cols]
        .apply(pd.to_numeric, errors="coerce")
        # Only the pose columns are present here, so a frame is dropped solely
        # for an unusable pose value and never for a NaN in an unrelated column.
        .dropna()
    )

    if pose.empty:
        zeros = {f"pose_{col}_mean": 0.0 for col in pose_cols}
        zeros.update({f"pose_{col}_std": 0.0 for col in rotation_cols})
        return zeros

    features = {f"pose_{col}_mean": pose[col].mean() for col in pose_cols}
    features.update({f"pose_{col}_std": pose[col].std() for col in rotation_cols})
    return features



# ------------------------------------------------------------
# FACE GEOMETRY 2D
# ------------------------------------------------------------
def process_geom2d(path):
    """Average each 2D landmark coordinate over successfully tracked frames.

    Parameters
    ----------
    path : str or path-like
        Delimited text file (delimiter is auto-detected) whose header, after
        whitespace stripping, contains ``success`` and coordinate columns
        starting with ``x`` or ``y``.

    Returns
    -------
    dict
        ``geom2d_<column>`` -> mean of that column over frames with
        ``success == 1`` and fully numeric coordinates. All values are ``0.0``
        when no such frame exists.
    """
    df = pd.read_csv(path, sep=None, engine="python")
    df.columns = df.columns.str.strip()

    geom_cols = [c for c in df.columns if c.startswith(("x", "y"))]

    # One vectorised select-and-coerce step builds a fresh frame. The previous
    # per-column assignment wrote into a filtered slice (SettingWithCopyWarning)
    # and touched the frame once per landmark column.
    landmarks = (
        df.loc[df["success"] == 1, geom_cols]
        .apply(pd.to_numeric, errors="coerce")
        .dropna()
    )

    if landmarks.empty:
        return {f"geom2d_{c}": 0.0 for c in geom_cols}

    return (
        landmarks
        .astype(float)
        .mean()
        .add_prefix("geom2d_")
        .to_dict()
    )



# ------------------------------------------------------------
# FACE GEOMETRY 3D
# ------------------------------------------------------------
def process_geom3d(path):
    """Average each 3D landmark coordinate over successfully tracked frames.

    Parameters
    ----------
    path : str or path-like
        Comma-separated file whose header, after whitespace stripping, contains
        ``success`` and coordinate columns starting with ``X``, ``Y`` or ``Z``.

    Returns
    -------
    dict
        ``geom3d_<column>`` -> mean of that column over frames with
        ``success == 1`` and fully numeric coordinates. All values are ``0.0``
        when no such frame exists.
    """
    df = pd.read_csv(path, sep=",", engine="python")
    df.columns = df.columns.str.strip()

    geom_cols = [c for c in df.columns if c.startswith(("X", "Y", "Z"))]

    # Build a fresh numeric frame in one step rather than assigning column by
    # column into a filtered slice (the pattern behind SettingWithCopyWarning).
    landmarks = (
        df.loc[df["success"] == 1, geom_cols]
        .apply(pd.to_numeric, errors="coerce")
        .dropna()
    )

    if landmarks.empty:
        return {f"geom3d_{c}": 0.0 for c in geom_cols}

    return (
        landmarks
        .mean()
        .add_prefix("geom3d_")
        .to_dict()
    )


# ------------------------------------------------------------
# HOG (WITH PCA)
# ------------------------------------------------------------
def process_hog(hog_path, pose_path, n_components=20):
    """Reduce a binary HOG dump to per-component PCA statistics.

    Parameters
    ----------
    hog_path : str or path-like
        Raw binary file, read as a flat ``float32`` array.
    pose_path : str or path-like
        CSV file whose data-row count is taken as the number of frames.
    n_components : int, default 20
        Number of principal components to keep.

    Returns
    -------
    dict
        ``hog_pca_<i>_mean`` and ``hog_pca_<i>_std`` for each component ``i``.
        Every value is ``0.0`` when the data cannot be reshaped into
        ``(num_frames, hog_dim)`` or is too small for ``n_components``.
    """
    def _zero_features():
        # Same keys, in the same order, as the successful path, so every
        # participant contributes an identical column set.
        zeros = {}
        for i in range(n_components):
            zeros[f"hog_pca_{i}_mean"] = 0.0
            zeros[f"hog_pca_{i}_std"] = 0.0
        return zeros

    num_frames = len(pd.read_csv(pose_path))
    data = np.fromfile(hog_path, dtype=np.float32)

    # num_frames == 0 must be tested first; otherwise the modulo below raises
    # ZeroDivisionError for a pose file with no data rows.
    if num_frames == 0 or data.size == 0 or data.size % num_frames != 0:
        return _zero_features()

    hog_dim = data.size // num_frames

    # PCA cannot extract more components than min(n_samples, n_features).
    if min(num_frames, hog_dim) < n_components:
        return _zero_features()

    hog = data.reshape(num_frames, hog_dim)

    pca = PCA(n_components=n_components, random_state=42)
    hog_pca = pca.fit_transform(hog)

    # NOTE(review): PCA centres its input before projecting, so the mean of each
    # component over the frames it was fitted on is ~0 up to rounding. The
    # *_mean features are kept only to preserve the output schema. The basis is
    # also fitted per call, so components may not be comparable across files.
    feats = {}
    for i in range(n_components):
        component = hog_pca[:, i]
        feats[f"hog_pca_{i}_mean"] = component.mean()
        feats[f"hog_pca_{i}_std"] = component.std()
    return feats

### Transcript Processing Sanity Check (Text Feature Validation)
Runs the text preprocessing function on a single participant’s transcript to verify correct loading, speaker filtering, and sentence aggregation before applying the pipeline to the full dataset. This acts as a quick sanity check to confirm linguistic features are extracted properly.

In [105]:
# Smoke test on participant 301: the returned string is displayed as the cell output.
process_words(BASE_PATH / "DAIC_WOZ_subset/extracted/301_P/301_TRANSCRIPT.csv")

"thank you mmm k i'm doing good thank you i'm from los angeles oh great i live in west los angeles the west side it's alright i xxx no i live alone so i love it i'm from here so i grew up here it's natural the weather um well the weather it's always good it's never it's never bad uh um there's always something to do it's rarely a dull moment the traffic the traffic is horrible well probably traffic is horrible in almost any major city but i hate the traffic not really i mean i have enough things going on here so if i travel it's usually somewhere that's within driving distance i studied uh business i did no i've i've been done for a few years so i haven't gone to school for a while one of these days i'll go back to graduate school but my dream job would be to just work for myself and making lots of money um i don't know i don't really have a dream job just something that i can i can work under my own terms and get paid decently and and be in a creative creative environment um i don't k

### Facial Action Unit Feature Extraction Test (Expression Signals)
Executes the facial Action Unit preprocessing on a single participant file to confirm correct parsing and computation of average facial muscle activation intensities, ensuring expression-based behavioral features are properly generated before full-scale processing.

In [106]:
# Smoke test on participant 301: the returned dict of column means is displayed as the cell output.
process_face_aus(BASE_PATH / "DAIC_WOZ_subset/extracted/301_P/301_CLNF_AUs.txt")

{'AU01_r': 0.13259438837425672,
 'AU02_r': 0.06735483224788641,
 'AU04_r': 0.09583429792484124,
 'AU05_r': 0.029404565834715426,
 'AU06_r': 0.018630478500060676,
 'AU09_r': 0.08655279337405444,
 'AU10_r': 0.03926264774887747,
 'AU12_r': 0.04907692556935399,
 'AU14_r': 0.10778485000606772,
 'AU15_r': 0.012307463249868532,
 'AU17_r': 0.0198868424416488,
 'AU20_r': 0.08869211103919744,
 'AU25_r': 0.5712136212531855,
 'AU26_r': 0.0013668396504995751}

### Eye Gaze Feature Extraction Test (Behavioral Attention Signals)
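Runs the eye gaze preprocessing on a single participant file to verify correct filtering of valid frames and computation of the average vertical gaze direction and the `eye_contact_rate` feature (the fraction of frames in which gaze tracking succeeded, used here as a proxy for eye contact), ensuring reliable behavioral attention features before applying the pipeline to all participants.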

In [107]:
# Smoke test on participant 301: the returned two-entry dict is displayed as the cell output.
process_eyes_carefully(BASE_PATH / "DAIC_WOZ_subset/extracted/301_P/301_CLNF_gaze.txt")

{'avg_gaze_downward': np.float64(0.2383617907058136),
 'eye_contact_rate': 0.9911815865054002}

### Audio Feature Extraction Test (COVAREP Acoustic Descriptors)
Processes a single participant’s COVAREP audio file to compute statistical summaries (mean and standard deviation) of low-level acoustic features, validating that vocal characteristics are correctly extracted before large-scale multimodal integration.

In [108]:
# Smoke test on participant 301. Despite the df_ prefix, the result is a plain dict
# of feature name -> value (what process_audio_covarep returns), not a DataFrame.
df_covarep = process_audio_covarep(BASE_PATH / "DAIC_WOZ_subset/extracted/301_P/301_COVAREP.csv")

In [109]:
# Display the extracted features as the cell output.
df_covarep

{'covarep_0_mean': np.float64(127.25741300416318),
 'covarep_0_std': np.float64(41.00267268920412),
 'covarep_1_mean': np.float64(0.3928922550340458),
 'covarep_1_std': np.float64(0.4883961774485749),
 'covarep_2_mean': np.float64(0.06589149353629005),
 'covarep_2_std': np.float64(0.056274400537690755),
 'covarep_3_mean': np.float64(0.22602470152321305),
 'covarep_3_std': np.float64(0.19343784273331557),
 'covarep_4_mean': np.float64(0.08172966152151379),
 'covarep_4_std': np.float64(4.95747557224423),
 'covarep_5_mean': np.float64(0.15661044655490175),
 'covarep_5_std': np.float64(0.16272447720893676),
 'covarep_6_mean': np.float64(0.09910476905048003),
 'covarep_6_std': np.float64(0.03222231641927864),
 'covarep_7_mean': np.float64(-0.3527951451730207),
 'covarep_7_std': np.float64(0.0734492510601278),
 'covarep_8_mean': np.float64(1.3987305773829033),
 'covarep_8_std': np.float64(0.5078544840944021),
 'covarep_9_mean': np.float64(0.6033904647465075),
 'covarep_9_std': np.float64(0.1

### Participant-Level Multimodal Feature Aggregation (Master Row Builder)
Combines all modality-specific preprocessing functions into a single pipeline that extracts and merges linguistic, facial action unit, gaze, audio (COVAREP and formant), head-pose, facial-geometry (2D and 3D), and HOG appearance features for one participant, producing a unified feature vector (one row per participant) ready for dataset construction and machine learning.

In [110]:
# MASTER FUNCTION (ONE PARTICIPANT)
# ------------------------------------------------------------
def build_master_row(pid, base_path):
    """Assemble the one-row feature table for a single participant.

    Parameters
    ----------
    pid : str
        Participant identifier (e.g. ``"301"``); files are looked up as
        ``<base_path>/<pid>_P/<pid>_<MODALITY>``.
    base_path : str or path-like
        Folder containing the per-participant ``<pid>_P`` directories.

    Returns
    -------
    pandas.DataFrame
        Single-row frame holding ``participant_id``, ``text`` and every
        feature returned by the modality processors.
    """
    p = f"{base_path}/{pid}_P"

    row = {
        "participant_id": pid,
        "text": process_words(f"{p}/{pid}_TRANSCRIPT.csv"),
    }

    # Tuple elements are evaluated top to bottom and merged in that order, so
    # on a key collision the later processor's value wins.
    modality_features = (
        process_face_aus(f"{p}/{pid}_CLNF_AUs.txt"),
        process_eyes_carefully(f"{p}/{pid}_CLNF_gaze.txt"),
        process_audio_covarep(f"{p}/{pid}_COVAREP.csv"),
        process_formant(f"{p}/{pid}_FORMANT.csv"),
        process_pose(f"{p}/{pid}_CLNF_pose.txt"),
        process_geom2d(f"{p}/{pid}_CLNF_features.txt"),
        process_geom3d(f"{p}/{pid}_CLNF_features3D.txt"),
        # The pose file is passed again so the HOG reader can derive the frame count.
        process_hog(
            f"{p}/{pid}_CLNF_hog.bin",
            f"{p}/{pid}_CLNF_pose.txt"
        ),
    )
    for features in modality_features:
        row.update(features)

    return pd.DataFrame([row])

### Full Dataset Construction and Master Feature Matrix Generation
Iterates through all participant folders, applies the multimodal feature aggregation pipeline to each subject, and concatenates the resulting rows into a single participant-level master dataset. The final unified feature matrix is then saved as a CSV file for downstream modeling and experiments.

In [111]:
from pathlib import Path

# Folder that holds one "<pid>_P" directory per participant.
ITERATE_PATH = BASE_PATH / "DAIC_WOZ_subset/extracted"

# iterdir() yields entries in arbitrary order, so sort to make the row order of
# the saved dataset reproducible. Anything that is not a participant folder
# (for example a checkpoint directory) is skipped.
participant_dirs = sorted(
    item for item in ITERATE_PATH.iterdir()
    if item.is_dir() and item.name.endswith("_P")
)
if not participant_dirs:
    raise FileNotFoundError(f"No participant folders (*_P) found in {ITERATE_PATH}")

# Collect the one-row frames in a list and concatenate once at the end.
rows_list = []

for item in participant_dirs:
    df_master_single = build_master_row(item.name.replace("_P", ""), ITERATE_PATH)
    rows_list.append(df_master_single)
    print(item.name)  # progress indicator: one line per processed participant

df_master = pd.concat(rows_list, ignore_index=True)

# NOTE(review): machine-specific absolute path — consider deriving it from a
# configurable output directory.
output_path = "D:/portfolio/IU/Thesis_Final/Coding/master_dataset.csv"

df_master.to_csv(output_path, index=False)

print(f"File saved successfully to: {output_path}")

301_P
302_P
303_P
304_P
305_P
306_P
307_P
308_P
309_P
310_P
312_P
313_P
314_P
315_P
316_P
317_P
318_P
319_P
320_P
321_P
322_P
323_P
324_P
325_P
326_P
327_P
328_P
329_P
330_P
331_P
332_P
333_P
334_P
335_P
336_P
337_P
338_P
339_P
340_P
341_P
343_P
344_P
345_P
346_P
347_P
348_P
349_P
350_P
351_P
352_P
353_P
354_P
355_P
356_P
357_P
358_P
359_P
360_P
361_P
362_P
363_P
364_P
365_P
366_P
367_P
368_P
369_P
370_P
371_P
372_P
373_P
374_P
375_P
376_P
377_P
378_P
379_P
380_P
381_P
382_P
383_P
384_P
385_P
386_P
387_P
388_P
389_P
390_P
391_P
392_P
393_P
395_P
396_P
397_P
399_P
400_P
401_P
402_P
403_P
404_P
405_P
406_P
407_P
408_P
409_P
410_P
411_P
412_P
413_P
414_P
415_P
416_P
417_P
418_P
419_P
420_P
421_P
422_P
423_P
424_P
425_P
426_P
427_P
428_P
429_P
430_P
431_P
432_P
433_P
434_P
435_P
436_P
437_P
438_P
439_P
440_P
441_P
442_P
443_P
444_P
445_P
446_P
447_P
448_P
449_P
450_P
451_P
452_P
453_P
454_P
455_P
456_P
457_P
458_P
459_P
461_P
462_P
463_P
464_P
465_P
466_P
467_P
468_P
469_P
470_P
471_P
472_