In [2]:
import pandas as pd
from pydub import AudioSegment
import os

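Implementation code for patient speech segmentation: for each interview not yet listed in processed.csv, cut the participant's turns out of the recording and join them into a single patient-only WAV file.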

In [None]:
def patient_intervals_ms(transcript, speaker="Participant"):
    """Return the (start_ms, stop_ms) pairs for every turn spoken by `speaker`.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Transcript table with "speaker", "start_time" and "stop_time" columns;
        the times are multiplied by 1000 to obtain milliseconds.
    speaker : str, default "Participant"
        Value of the "speaker" column whose rows are kept.

    Returns
    -------
    list[tuple[int, int]]
        One (start_ms, stop_ms) pair per kept row, in transcript order.
        Rows whose start or stop time is missing or non-numeric are dropped.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    rows = transcript.loc[transcript["speaker"] == speaker]
    starts = pd.to_numeric(rows["start_time"], errors="coerce")
    stops = pd.to_numeric(rows["stop_time"], errors="coerce")

    # A row without both timestamps cannot be cut out of the audio.
    usable = starts.notna() & stops.notna()

    # Round instead of truncating: 1.001 * 1000 is 1000.9999999999999 in
    # floating point, which int() would turn into 1000 ms.
    start_ms = (starts[usable] * 1000).round().astype("int64")
    stop_ms = (stops[usable] * 1000).round().astype("int64")
    return list(zip(start_ms.tolist(), stop_ms.tolist()))


def extract_patient_audio(audio, intervals_ms):
    """Concatenate the slices of `audio` given by `intervals_ms`.

    `audio` is a pydub AudioSegment and `intervals_ms` an iterable of
    (start_ms, stop_ms) pairs. The result is a new AudioSegment.
    """
    # Start from an empty segment: AudioSegment.silent() defaults to one
    # second of silence, which would be prepended to every output file.
    patient_audio = AudioSegment.empty()
    for start_ms, stop_ms in intervals_ms:
        patient_audio += audio[start_ms:stop_ms]
    return patient_audio


# Load processed.csv (the ledger of interviews that are already segmented).
# On a first run the ledger does not exist yet, so start with an empty one.
if os.path.exists("processed.csv"):
    processed_df = pd.read_csv("processed.csv")
else:
    processed_df = pd.DataFrame({"id": pd.Series(dtype="object")})
processed_ids = set(processed_df['id'].astype(str))

# Directory containing interview folders, and directory receiving the output
interview_root = "interviews"
output_root = "Segmented"

if os.path.isdir(interview_root):
    # Sorted so that the processing order (and the log) is reproducible.
    interview_ids = sorted(os.listdir(interview_root))
else:
    print(f"Interview directory '{interview_root}' not found. Nothing to process.")
    interview_ids = []

# Loop through each interview folder
for interview_id in interview_ids:
    if interview_id in processed_ids:
        print(f"Interview {interview_id} already processed. Skipping...")
        continue  # Already processed

    transcript_path = os.path.join(interview_root, interview_id, f"{interview_id}_TRANSCRIPT.csv")
    audio_path = os.path.join(interview_root, interview_id, f"{interview_id}_AUDIO.wav")
    output_audio_path = os.path.join(output_root, f"{interview_id}_PATIENT.wav")

    # Check if both files exist
    if not (os.path.exists(transcript_path) and os.path.exists(audio_path)):
        print(f"Missing files for interview {interview_id}. Skipping...")
        continue

    try:
        # Load transcript (tab-separated despite the .csv extension)
        transcript = pd.read_csv(transcript_path, sep='\t')
        intervals_ms = patient_intervals_ms(transcript)
        if not intervals_ms:
            # Do not export an empty file or mark the interview as done;
            # an empty result usually points at a transcript problem.
            print(f"No patient segments found for interview {interview_id}. Skipping...")
            continue

        # Load full audio, then extract and combine patient segments
        audio = AudioSegment.from_wav(audio_path)
        patient_audio = extract_patient_audio(audio, intervals_ms)

        # Export patient-only audio
        os.makedirs(output_root, exist_ok=True)
        patient_audio.export(output_audio_path, format="wav")
        print(f"Processed interview {interview_id}.")

        # Record the interview in processed.csv right away so an interrupted
        # run can resume. Only the "id" column is filled, which also works
        # when the ledger carries additional columns.
        processed_df = pd.concat(
            [processed_df, pd.DataFrame({"id": [interview_id]})],
            ignore_index=True,
        )
        processed_df.to_csv("processed.csv", index=False)
        processed_ids.add(interview_id)

    except Exception as e:
        # Best-effort batch job: report the failure and move on.
        print(f"Error processing interview {interview_id}: {e}")

Processed interview 373.
Interview 440 already processed. Skipping...
Interview 459 already processed. Skipping...
Interview 483 already processed. Skipping...
Interview 490 already processed. Skipping...
Interview 491 already processed. Skipping...
Interview 492 already processed. Skipping...
Missing files for interview Segmented. Skipping...
