# Manage and clean word-level feature extraction output

This script cleans and consolidates the feature data extracted by the scripts `extract_word_features.py`, `segmentation.py` and `prep_data.py`:
* remove some unwanted information
* align word-level features with their corresponding accent labels

## Import necessary libraries

In [3]:
import pandas as pd
import numpy as np
import os
import pickle
from pathlib import Path
from glob import glob
from promdetect.prep import process_annotations

## Collect all label data recording by recording

In [4]:
# Map each recording ID (the .accents file name without its extension) to the
# annotation data that AnnotationReader returns for that file.
label_files = glob("/home/lukas/Dokumente/Uni/ma_thesis/quelldaten/DIRNDL-prosody/*.accents")
labels = {
    str(Path(accent_path).stem): process_annotations.AnnotationReader(
        accent_path
    ).get_annotation_data()
    for accent_path in label_files
}

## Collect all feature data recording by recording

In [5]:
# Switch into the word-based feature directory so that the relative paths used
# below ("raw/...", "sets/...") resolve against it.
feature_root = "/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/features/word_based"
os.chdir(feature_root)

# Every raw feature file whose name starts with "dlf" counts as one recording.
recordings = glob("raw/dlf*")
n_recordings = len(recordings)
print("Number of recordings:", n_recordings)  # should be 55

Number of recordings: 55


## Iterate over recordings, clean feature data and align it with the label DataFrames

In [6]:
# For every raw feature file: load it, attach the accent label whose time stamp
# falls inside each row's [start, end] span, flag all rows of words that carry
# an accent, and store the resulting DataFrame under the recording's ID.
data = {}

for idx, recording in enumerate(recordings, start=1):
    # Two nested .stem calls strip up to two file extensions from the name.
    recording_id = str(Path(Path(recording).stem).stem)
    df = pd.read_csv(recording)
    # "Unnamed: 0" is presumably a saved index column from an earlier
    # to_csv call -- TODO confirm. A missing column raises KeyError here.
    df = df.drop(columns="Unnamed: 0")

    # Create the column with object dtype from the start: it receives string
    # labels below, and writing strings into a float64 (NaN) column relies on
    # an implicit upcast that recent pandas versions deprecate.
    df["accent"] = pd.Series(np.nan, index=df.index, dtype=object)

    labels_df = labels[recording_id]

    # Interval bounds are inclusive on both sides. If several labels fall into
    # the same row's span, the last one in labels_df wins.
    for row in labels_df.itertuples():
        in_span = (df["start"] <= row.time) & (row.time <= df["end"])
        df.loc[in_span, "accent"] = row.label

    # Same dtype consideration as for "accent": the column will hold True.
    # Rows of words without any accent deliberately stay NaN (not False).
    df["has_accent"] = pd.Series(np.nan, index=df.index, dtype=object)

    # Consecutive rows sharing the same "label" value form one run; a new run
    # starts whenever the value differs from the previous row.
    run_id = (df["label"] != df["label"].shift()).cumsum()
    # Broadcast "any row in this run has an accent" back to every row of the run.
    run_is_accented = df["accent"].notna().groupby(run_id).transform("any")
    df.loc[run_is_accented, "has_accent"] = True

    data[recording_id] = df

    # "\r" keeps the progress message on a single, continuously updated line.
    print("Finished processing recording", idx, end="\r")

Finished processing recording 55

## Save to .pickle file

In [7]:
# Serialize the {recording_id: DataFrame} mapping into one pickle file
# (path is relative to the current working directory).
with Path("sets/main.pickle").open("wb") as sink:
    pickle.dump(data, sink)