# CANDOR corpus directory preprocessing for analysis
Extract the interview files from the nested directories of the CANDOR corpus and save them in a flat directory, keeping the original names of the interview files across the corpus.

The CANDOR corpus can be downloaded from the official CANDOR download page.
https://betterup-data-requests.herokuapp.com

In [None]:
import os
import shutil


def flatten_directory(root_dir, target_dir):
    """Copy the kept files of a nested directory tree into one flat directory.

    Walks ``root_dir`` recursively and copies every kept file into
    ``target_dir``. The flat file name is the file's directory path relative
    to ``root_dir`` followed by ``_`` and the file name, with every path
    separator replaced by ``_``. Files located directly in ``root_dir`` keep
    their plain name (no ``._`` prefix, which would hide them on Unix).

    Files are skipped when their name ends with ``.DS_Store`` or contains
    ``audiophile``, ``backbiter`` or ``transcribe``.

    Args:
        root_dir: Root of the nested directory tree to read from.
        target_dir: Directory the files are copied into; created if missing.
    """
    # Only the 'cliffhanger' transcript is kept. Change this tuple if you
    # prefer another transcription output.
    skipped_markers = ("audiophile", "backbiter", "transcribe")

    os.makedirs(target_dir, exist_ok=True)
    target_abs = os.path.abspath(target_dir)

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # If target_dir lives inside root_dir, prune it from the walk so that
        # freshly copied files are not picked up and copied again.
        dirnames[:] = [
            name for name in dirnames
            if os.path.abspath(os.path.join(dirpath, name)) != target_abs
        ]

        relative_path = os.path.relpath(dirpath, root_dir)

        for filename in filenames:
            if filename.endswith('.DS_Store'):
                continue
            if any(marker in filename for marker in skipped_markers):
                continue

            if relative_path == os.curdir:
                # File sits directly in root_dir: there is no directory part
                # to encode into the name.
                flattened_name = filename
            else:
                # Create a unique filename by replacing the directory separators
                flattened_name = f"{relative_path}_{filename}".replace(os.sep, '_')

            source_path = os.path.join(dirpath, filename)
            target_path = os.path.join(target_dir, flattened_name)

            # Copy the file (with its metadata) to the flattened structure
            shutil.copy2(source_path, target_path)
            print(f"Copied {source_path} to {target_path}")

def delete_ds_store_files(root_dir):
    """Delete every file named exactly ``.DS_Store`` under ``root_dir``.

    The tree is walked recursively; the path of each removed file is printed.

    Args:
        root_dir: Root of the directory tree to clean.
    """
    unwanted = '.DS_Store'
    for folder, _subfolders, entries in os.walk(root_dir):
        # A directory holds at most one entry with this exact name.
        if unwanted in entries:
            doomed_path = os.path.join(folder, unwanted)
            os.remove(doomed_path)
            print(f"Deleted: {doomed_path}")


### Example usage
# Copy the kept files from the nested 'CANDOR_Corpus/' tree into the flat
# 'CANDOR_flattened' directory (both paths are relative to the working directory).
flatten_directory('CANDOR_Corpus/', 'CANDOR_flattened')

## Modify the content of the audio_video_features file

You can remove the variables that you don't need for your analysis or modify variable values.

Here we are removing several acoustic variables and normalizing the f0 for each speaker (user_id).


In [None]:
import pandas as pd
import os

def remove_columns_from_csv(file_path, columns_to_remove):
    """Drop the given columns from a CSV file and overwrite the file.

    Column names that are not present in the file are ignored. The file is
    written back without the index column and a confirmation is printed.

    Args:
        file_path: Path of the CSV file to rewrite in place.
        columns_to_remove: Column names to drop.
    """
    # Read, trim and write back in one pass over the same path.
    trimmed = pd.read_csv(file_path).drop(columns=columns_to_remove, errors='ignore')
    trimmed.to_csv(file_path, index=False)

    print(f"Updated file saved: {file_path}")


def normalize_f0_per_user(file_path):
    """Add a per-speaker min-max scaled ``f0_norm`` column to a CSV file.

    The file is read, ``f0`` is coerced to numeric (unparseable values become
    NaN), and ``f0_norm`` is computed per ``user_id`` as
    ``(f0 - min) / (max - min)``, i.e. scaled to the range 0..1. When all f0
    values of a speaker are identical the range is zero; those rows get
    ``f0_norm = 0.0`` instead of the NaN that 0/0 would produce. Rows whose
    ``f0`` is NaN keep a NaN ``f0_norm``. The result overwrites the file.

    Args:
        file_path: Path of the CSV file; must contain ``f0`` and ``user_id``
            columns.

    Raises:
        KeyError: If the ``f0`` or ``user_id`` column is missing.
    """
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Ensure f0 column is numeric
    df['f0'] = pd.to_numeric(df['f0'], errors='coerce')

    # Per-speaker minimum and range, broadcast back onto every row.
    f0_by_user = df.groupby('user_id')['f0']
    lowest = f0_by_user.transform('min')
    f0_range = f0_by_user.transform('max') - lowest

    # A zero range means the speaker's f0 is constant: divide by 1 so the
    # numerator (already 0) yields 0.0 rather than 0/0 = NaN.
    df['f0_norm'] = (df['f0'] - lowest) / f0_range.replace(0, 1)

    # Save the updated DataFrame back to the same file
    df.to_csv(file_path, index=False)
    print(f"Data with normalized f0 column (f0_norm) saved to {file_path}")



# Acoustic feature columns dropped from every features file.
columns_to_remove = [
    *(f"mfcc_{index}" for index in range(1, 20)),
    "mfcc_0",
    "poly_features_0",
    "poly_features_1",
    "shimmer",
    "spectral_bandwidth",
    "spectral_centroid",
    *(f"spectral_contrast_{index}" for index in range(7)),
    "spectral_flatness",
    "spectral_rolloff",
    "zero_crossing_rate",
    "intensity",
    "jitter",
    "log_energy",
    "onset_strength",
]

# Flat directory holding the files to process.
INPUT_DIRECTORY = "CANDOR_flattened/"

files = sorted(os.listdir(INPUT_DIRECTORY))
print(len(files))

for file in files:
    # Only the files whose name ends in "features.csv" are rewritten.
    if not file.endswith("features.csv"):
        continue

    file_path = os.path.join(INPUT_DIRECTORY, file)

    # First drop the unused columns, then scale f0 per user from 0 to 1;
    # both steps overwrite the file in place.
    remove_columns_from_csv(file_path, columns_to_remove)
    normalize_f0_per_user(file_path)



# Rename files

You can use this code to rename files for simpler handling.

In [None]:
import os
import re


# Rename every file in INPUT_DIRECTORY so that its leading interview
# identifier (the text before the first underscore) is replaced by a short
# sequential name: intrvw_1, intrvw_2, ...
#
# NOTE: run this once per directory. After a run every file name starts with
# "intrvw", so a second run would treat all files as one interview and
# rename them again.
INPUT_DIRECTORY = "CANDOR_flattened/"

files = sorted(os.listdir(INPUT_DIRECTORY))
print(len(files))

# Maps each original interview identifier to its new short name.
interview_dict = {}
# Original interview identifiers, in the order they were first seen.
interview_list = []
interview_count = 1

for file in files:

    interview_name = file.split('_')[0]
    if interview_name not in interview_dict:
        new_interview_name = "intrvw_" + str(interview_count)
        interview_dict[interview_name] = new_interview_name
        interview_list.append(interview_name)
        interview_count += 1

    file_path = os.path.join(INPUT_DIRECTORY, file)

    # Replace only the leading identifier of the file name. A regex
    # substitution over the whole path would interpret the identifier as a
    # pattern and would also rewrite any later occurrence of it, including
    # one inside the directory part of the path.
    new_filename = interview_dict[interview_name] + file[len(interview_name):]
    new_filepath = os.path.join(INPUT_DIRECTORY, new_filename)

    print(file_path, new_filepath)
    os.rename(file_path, new_filepath)


# Number of distinct interviews that were renamed.
print(len(interview_list))
