# Frame-level data set management script

This script is used to clean up the data extracted by the `extract_frame_features.py` and `prep_data.py` scripts.
* Invalid feature values are corrected, and additional information in the form of accent labels and start timestamps is added to the main feature DataFrame for each recording.
* Word labels that are neither lexical words nor sentence delimiters are removed from the DataFrame.

## Import necessary packages

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from pathlib import Path
from glob import glob
from promdetect.prep import process_annotations
from itertools import groupby

## Load and process accent and word label files

In [3]:
# Paths of all accent annotation files (*.accents) in the DIRNDL-prosody source directory
accents_pattern = "/home/lukas/Dokumente/Uni/ma_thesis/quelldaten/DIRNDL-prosody/*.accents"
label_files = glob(accents_pattern)

In [4]:
# Map each recording ID (file name without its extension) to the annotation
# data that AnnotationReader returns for that recording's accent file.
labels = {
    Path(accent_file).stem: process_annotations.AnnotationReader(accent_file).get_annotation_data()
    for accent_file in label_files
}

In [5]:
# Paths of all word annotation files (*.words) in the DIRNDL-prosody source directory
words_pattern = "/home/lukas/Dokumente/Uni/ma_thesis/quelldaten/DIRNDL-prosody/*.words"
word_files = glob(words_pattern)

In [6]:
# Map each recording ID (file name without its extension) to the annotation
# data that AnnotationReader returns for that recording's word file.
words = {
    Path(word_file).stem: process_annotations.AnnotationReader(word_file).get_annotation_data()
    for word_file in word_files
}

## Collect frame-level feature DataFrames and pipe each through a number of preprocessing steps

In [7]:
# Relative paths used from here on (e.g. "raw/...") resolve against this directory
os.chdir("/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/features/frame_based")
# glob returns its matches in arbitrary order; sorting keeps the processing
# order (and therefore the key order of anything built from it) reproducible
recordings = sorted(glob("raw/dlf*"))
print("Number of recordings:", len(recordings)) # should be 55

Number of recordings: 55


In [8]:
# For every raw frame-level feature file: clean invalid feature values, attach
# word and accent labels to the frames, flag the frames of accented words and
# drop all frames that do not belong to a word that is kept.
data = {}  # Dictionary in which all the processed data will be stored (recording ID => data)
# Types of word labels to be dropped; NaN stands for frames that lie outside every word interval
NONWORD_TYPES = ["[@]", "[t]", "[n]", "[f]", "[h]", np.nan]

for idx, recording in enumerate(recordings, start=1):
    # .stem is applied twice so that up to two trailing suffixes are stripped from the file name
    recording_id = str(Path(Path(recording).stem).stem)
    df = pd.read_csv(recording)

    # Harmonics-to-noise ratio of -200.0 means that it could not be determined
    df.loc[df["hnr"] == -200.0, "hnr"] = np.nan
    # Pitch above 500 Hz is outside of the pitch contour limits, so it is reduced to NaN
    df.loc[df["f0"] > 500, "f0"] = np.nan
    df = df.drop(columns="Unnamed: 0")

    # Prepare columns for labels. They are created with object dtype right away:
    # a float64 NaN column would have to be silently upcast when the string/bool
    # labels are written into it, which newer pandas versions warn about.
    df["word"] = pd.Series(np.nan, index=df.index, dtype="object")
    df["accent"] = pd.Series(np.nan, index=df.index, dtype="object")
    df["has_accent"] = pd.Series(np.nan, index=df.index, dtype="object")

    # Locate recording in dictionary of word DataFrames,
    # then label every frame whose timestamp lies within a word's [start_est, end] span.
    # Both bounds are inclusive, so on a shared boundary the later row wins.
    words_df = words[recording_id]
    for row in words_df.itertuples():
        df.loc[(row.start_est <= df["time"]) & (df["time"] <= row.end), "word"] = row.label

    # Locate recording in dictionary of accent label DataFrames,
    # then attach each label to the frame(s) whose [time, time + 0.01] window contains it
    # NOTE(review): the 0.01 window presumably equals the frame step in seconds - confirm
    labels_df = labels[recording_id]
    for row in labels_df.itertuples():
        df.loc[(df["time"] <= row.time) & (row.time <= df["time"] + 0.01), "accent"] = row.label

    # Turn ToBI accent labels into binary labels for 'has_accent' column.
    # A run of consecutive frames with the same word label is treated as one word;
    # all of its frames are set to True if any of them carries an accent label.
    # Frames of runs without an accent keep NaN (they are not set to False).
    # NaN != NaN, so every unlabelled frame forms a run of its own.
    word_run = (df["word"] != df["word"].shift()).cumsum()
    accented = df["accent"].notna().groupby(word_run).transform("any")
    df.loc[accented, "has_accent"] = True

    # Filter out frames within the boundaries of nonword labels
    df_words_delimiters = df.loc[~df["word"].isin(NONWORD_TYPES)]

    data[recording_id] = df_words_delimiters

    print("Finished processing recording", idx, end="\r")

Finished processing recording 55

## Store the cleaned data set in a single .pickle file

In [10]:
# Serialise the complete recording ID => DataFrame dictionary into one pickle file
output_path = "sets/main.pickle"
with open(output_path, "wb") as pickle_file:
    pickle.dump(data, pickle_file)