# Data table management & advanced preprocessing steps
---


This notebook contains utilities to further pre-process the frame-level feature data with the following steps:
* Imputing missing values (NA values => mean column value or zero)
* Scaling values in numeric columns (min-max scaling to [0, 1]; `hnr` is scaled to [-1, 1])
* Consolidating data from all recordings and dividing up features and labels
    * Reshaping into sentence-level frame sequences
* Exporting to NumPy arrays

The NumPy arrays then serve as the input to the frame-level neural network.

## Importing necessary packages

In [52]:
import pandas as pd
import numpy as np
import os
import pickle
from glob import glob
from sklearn import preprocessing, impute
from promdetect.prep import process_annotations
from functools import reduce
import torch
from torch.nn.utils.rnn import pad_sequence

### Get ordered list of recordings

In [53]:
# Read the recording identifiers, one per line, preserving the order of the file.
with open("/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/dirndl/list_recordings.txt") as recordings:
    # rstrip() strips the trailing newline (and any other trailing whitespace)
    LIST_RECS = [recording.rstrip() for recording in recordings]

### Read feature data (output of `manage_extraction_output.ipynb`)

In [54]:
# Make the feature-set directory the working directory: the relative file
# names used for loading here (and for saving further down) resolve against it.
os.chdir("/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/features/frame_based/sets")

# NOTE: unpickling can execute arbitrary code -- only load pickles from a trusted source.
# `data` is indexed by recording name further down.
with open("main.pickle", mode="rb") as file:
    data = pickle.load(file)

## Define functions for pre-processing steps

### Impute missing values (using recording-internal mean value for the feature)

In [55]:
def impute_missing(df):
    """
    Fill missing values in the acoustic measurement columns of one table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the measurement columns "f0", "voicing_pr", "rms",
        "loudness", "zcr", "hnr" and the columns "time", "word", "accent",
        "has_accent".

    Returns
    -------
    pandas.DataFrame
        A new table with the same index as `df`: the six imputed measurement
        columns followed by "time", "word", "accent" and "has_accent", which
        are copied over unchanged. `df` itself is not modified.

    Notes
    -----
    The imputer is a scikit-learn SimpleImputer with default settings and is
    fitted on `df` only, so the fill values come from the table passed in.
    """
    measurement_cols = ["f0", "voicing_pr", "rms", "loudness", "zcr", "hnr"]
    passthrough_cols = ["time", "word", "accent", "has_accent"]

    measurements = df[measurement_cols]

    # fit_transform returns a bare array, so column names and index are restored by hand
    filled_values = impute.SimpleImputer().fit_transform(measurements)
    df_imp = pd.DataFrame(filled_values)
    df_imp.columns = measurements.columns
    df_imp.index = measurements.index

    # Re-attach the columns that were not imputed
    df_imp[passthrough_cols] = df[passthrough_cols]

    return df_imp

### Standardize values using sklearn.preprocessing functions

In [56]:
def standardize(df):
    """
    Rescale the acoustic measurement columns of one table with min-max scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "f0", "voicing_pr", "rms", "loudness", "zcr"
        and "hnr".

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which "f0", "voicing_pr", "rms", "loudness" and "zcr"
        are scaled with MinMaxScaler's default range and "hnr" is scaled to
        [-1, 1]. All other columns are left untouched; `df` is not modified.

    Notes
    -----
    Both scalers are fitted on the table passed in, so the scaling is relative
    to the minimum and maximum found in that table only.
    """
    df_standard = df.copy()

    cols_abs = ["f0", "voicing_pr", "rms", "loudness", "zcr"]
    cols_neg_pos = ["hnr"]

    abs_scaler = preprocessing.MinMaxScaler()
    # feature_range is documented as a tuple; scikit-learn releases that
    # validate constructor parameters reject a list here.
    neg_pos_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

    df_standard[cols_abs] = abs_scaler.fit_transform(df_standard[cols_abs].values)
    df_standard[cols_neg_pos] = neg_pos_scaler.fit_transform(df_standard[cols_neg_pos].values)

    return df_standard

### Remove all but one frame for each sentence delimiter

In [57]:
def drop_dup_p(df):
    """
    Collapse every run of consecutive "<P>" frames to its last frame, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame table with a "word" column. It is modified in place: within each
        run of consecutive rows whose word is "<P>", all rows except the last
        one are removed. Rows with any other word (or a missing word) are kept.

    Returns
    -------
    None

    Notes
    -----
    Rows are removed by index label, so the index is expected to be unique.
    """
    is_delimiter = df["word"] == "<P>"

    # A delimiter frame is redundant when the frame right after it is also a
    # delimiter; only the last frame of each run has no delimiter following it.
    followed_by_delimiter = is_delimiter.shift(-1, fill_value=False)
    redundant = (is_delimiter & followed_by_delimiter).to_numpy()

    # Single in-place drop instead of one drop per run
    df.drop(df.index[redundant], inplace=True)

---
### Run the functions defined above and consolidate data into unified array

In [58]:
simple_dfs = []
idx = 0  # remains 0 when LIST_RECS is empty

# Run the pre-processing pipeline on every recording, in the order of LIST_RECS,
# and collect the resulting tables as NumPy arrays.
for idx, recording in enumerate(LIST_RECS, start=1):
    df = data[recording]

    df_imp = impute_missing(df)
    df_standard = standardize(df_imp)

    # drop_dup_p works in place on df_standard and returns nothing
    drop_dup_p(df_standard)

    simple_dfs.append(df_standard.to_numpy())

    # "\r" keeps the progress message on a single line
    print("Finished processing recording", idx, end="\r")

Finished processing recording 55

In [59]:
# Stack the per-recording arrays row-wise into one (frames x columns) array.
simple_data = np.concatenate(simple_dfs, axis=0)

### Reshape array to sentence-level (sentence x frames x features) 

In [61]:
# Column -3 is the word label (impute_missing orders the columns as
# measurements, time, word, accent, has_accent).
word_labels = simple_data[:, -3]
delimiter_rows = np.where(word_labels == "<P>")[0]

# Start a new utterance at every "<P>" row except the first one.
# NOTE(review): skipping the first delimiter presumably relies on the data
# starting with a "<P>" frame -- confirm.
simple_utts = np.split(simple_data, delimiter_rows[1:])

#### Record the first and last frame index of each word (to be able to find corresponding words later)

In [63]:
# For every utterance, build an array of (first frame, last frame) index pairs,
# one pair per run of consecutive frames carrying the same word label.
timetables = []
for utt in simple_utts:
    df = pd.DataFrame(utt[:, -4:-2], columns=["time", "word"])

    # The run id increases by one whenever the word label differs from the previous frame
    run_ids = (df.word != df.word.shift()).cumsum()

    # Runs that end at frame 0 (i.e. a single frame at the very start) are left out
    timetable = [
        (g.index[0], g.index[-1])
        for i, g in df.groupby([run_ids])
        if g.index[-1] != 0
    ]

    # Fall back to a single (0, 0) entry so that no utterance ends up without a span
    if not timetable:
        timetable.append((0, 0))

    timetable = np.array(timetable)
    timetables.append(timetable)

### Separate labels from features

In [64]:
# One binary label per frame, derived from the last column (has_accent):
# NaN -> 0, anything else -> 1.
# NOTE(review): a non-NaN value of 0 would also be labelled 1 -- confirm that
# has_accent is NaN (not 0) for unaccented frames.
simple_labels = [
    np.where(np.isnan(utt[:, -1].astype("float64")), 0, 1)
    for utt in simple_utts
]

### Remove unnecessary columns from feature tables

In [65]:
# Strip the last three columns (word, accent, has_accent) from every utterance;
# the measurement columns and the time column remain.
simple_utts_cleaned = [
    np.delete(utt, np.s_[-3:], axis=1)
    for utt in simple_utts
]

## Store as NumPy arrays

In [66]:
# Each list is wrapped in an object-dtype array so that sequences of differing
# length can be stored in a single .npy file (written to the current working
# directory). Object arrays are pickled by np.save, so loading them requires
# np.load(..., allow_pickle=True).
for filename, sequences in (
    ("frame_features.npy", simple_utts_cleaned),
    ("frame_labels.npy", simple_labels),
    ("frame_times.npy", timetables),
):
    np.save(filename, np.array(sequences, dtype=object))