# Data table management & advanced pre-processing steps

This notebook applies further processing steps to the data produced by `manage_extraction_output.ipynb`:
* Imputing missing values (NA values => mean column value or zero)
* Standardizing values in numeric columns (3 different standardizers)
* Consolidating data from all recordings and dividing up features and labels
* Exporting to NumPy arrays

The NumPy arrays then serve as the input to the combined frame-word neural net

## Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from glob import glob
from sklearn import preprocessing, impute
from promdetect.prep import process_annotations
from functools import reduce
import torch
from torch.nn.utils.rnn import pad_sequence

### Get ordered list of recordings

In [3]:
# Recording names, one per line of the list file, kept in file order.
LIST_RECS = []
with open("/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/dirndl/list_recordings.txt") as recordings:
    # rstrip() removes the trailing newline (and any other trailing whitespace).
    LIST_RECS.extend(recording.rstrip() for recording in recordings)

## Define functions for pre-processing steps

### Impute missing values (using record-internal mean value)

In [4]:
def impute_missing(df):
    """Fill missing values in the measurement columns of one recording.

    The measurement columns listed below are run through a
    ``SimpleImputer`` with default settings, fitted on ``df`` itself, so
    the fill values are computed from this frame only. The columns
    ``has_accent``, ``label``, ``start`` and ``end`` are copied over from
    ``df`` without imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every measurement column listed below plus
        ``has_accent``, ``label``, ``start`` and ``end``.

    Returns
    -------
    pandas.DataFrame
        New frame with the index of ``df``: imputed measurement columns
        first, followed by the four carried-over columns.
    """
    measurement_cols = [
        "dur_normed", "int_rms",
        "int_min", "int_max", "int_mean",
        "int_std", "int_min_pos", "int_max_pos",
        "f0_min", "f0_max", "f0_mean", "f0_std",
        "f0_slope", "f0_exc_ip", "f0_exc_utt",
        "f0_min_pos", "f0_max_pos", "tilt_min",
        "tilt_max", "tilt_mean", "tilt_range",
        "cog", "h1_h2",
    ]
    carried_cols = ["has_accent", "label", "start", "end"]

    measurements = df[measurement_cols]

    # NOTE(review): scikit-learn's SimpleImputer by default discards columns
    # that contain only missing values; in that case the column assignment
    # below would fail with a length mismatch -- confirm no recording has an
    # entirely empty measurement column.
    filled = impute.SimpleImputer().fit_transform(measurements)

    # Wrap the imputer output in a frame and restore the column names and
    # row index of the input.
    df_imp = pd.DataFrame(filled)
    df_imp.columns = measurements.columns
    df_imp.index = measurements.index

    # Re-attach the columns that were not imputed (aligned on the index).
    df_imp[carried_cols] = df[carried_cols]

    return df_imp

### Standardize values using sklearn.preprocessing functions

In [5]:
def standardize(df):
    """Scale the numeric feature columns of one recording.

    Three groups of columns are rescaled, each with its own scaler that is
    fitted on ``df`` itself (i.e. per call, not across recordings):

    * ``MinMaxScaler()`` with its default range for the duration,
      intensity, most f0, most tilt and ``cog`` columns,
    * ``MinMaxScaler(feature_range=(-1, 1))`` for ``tilt_min``,
    * ``RobustScaler()`` for ``f0_std``, ``f0_slope`` and ``h1_h2``.

    Every column that is not listed in one of the groups is passed through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in the three groups below.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns replaced by their scaled
        values; ``df`` itself is not modified.
    """
    df_standard = df.copy()

    # (scaler, columns) pairs, applied in this order.
    scaling_plan = [
        (
            preprocessing.MinMaxScaler(),
            ["dur_normed", "int_rms", "int_min", "int_max", "int_mean",
             "int_std", "f0_min", "f0_max", "f0_mean", "f0_exc_ip",
             "f0_exc_utt", "tilt_max", "tilt_mean", "tilt_range", "cog"],
        ),
        (
            # feature_range is documented as a tuple; recent scikit-learn
            # releases validate the parameter type and reject a list.
            preprocessing.MinMaxScaler(feature_range=(-1, 1)),
            ["tilt_min"],
        ),
        (
            preprocessing.RobustScaler(),
            ["f0_std", "f0_slope", "h1_h2"],
        ),
    ]

    for scaler, cols in scaling_plan:
        # Pass the raw values so the scaler works on a plain NumPy array.
        df_standard[cols] = scaler.fit_transform(df_standard[cols].values)

    return df_standard

### Remove all but one element for each series of sentence delimiters

In [6]:
def drop_dup_p(df):
    """Collapse every run of consecutive ``"<P>"`` rows to its last row.

    Rows are removed from ``df`` in place. Only the sentence delimiter
    ``"<P>"`` is affected; repeated ordinary labels are kept.

    The previous implementation tested ``g["label"].any() == "<P>"``, which
    only works when ``Series.any()`` hands back an element of the column.
    With pandas versions that return a plain ``bool`` the comparison is
    always False and nothing is dropped. The check below compares the
    labels directly instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column. As before, rows are dropped by index
        label, so the index is expected to be unique.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    is_delim = df["label"] == "<P>"

    # A delimiter row is redundant when the row directly after it is a
    # delimiter too; the last row of each run has no delimiter after it and
    # is therefore kept.
    next_is_delim = is_delim.shift(-1, fill_value=False)
    redundant = (is_delim & next_is_delim).to_numpy(dtype=bool)

    # Drop all redundant rows in one step rather than while iterating.
    df.drop(index=df.index[redundant], inplace=True)

### Run the functions defined above and consolidate data into a unified array

In [7]:
# NOTE: pickle.load can execute arbitrary code, so only load a file you trust.
with open("/home/lukas/Dokumente/Uni/ma_thesis/promdetect/data/features/word_based/sets/main.pickle", "rb") as file:
    # `main` is indexed by recording name in the loop below.
    main = pickle.load(file)

# One processed array per recording, in the order given by LIST_RECS.
data = []
for recording in LIST_RECS:
    df = main[recording]
    
    # Fill gaps in the measurement columns of this recording.
    df_imp = impute_missing(df)
    
    # Scale the measurement columns; the scalers are fitted on this recording only.
    df_standard = standardize(df_imp)    
    
    # Turn the accent annotation into a binary target: 1 if a value is present, 0 if it is missing.
    df_standard["has_accent"] = np.where(df_standard["has_accent"].notna(), 1, 0)
    
    # drop_dup_p works in place and returns nothing, hence no assignment.
    drop_dup_p(df_standard)
    
    # Column order after to_numpy(): measurement columns, then has_accent, label, start, end.
    arr = df_standard.to_numpy()
    
    # Discard every row that still contains a missing value in any column.
    arr = arr[~pd.isnull(arr).any(axis=1)]
    
    data.append(arr) 
    

In [8]:
# Stack the per-recording arrays row-wise into a single array.
data_all = np.concatenate(data, axis=0)

#### Reshape array to sentence-level

In [9]:
# Column -3 holds the label; rows labelled "<P>" are the sentence delimiters.
is_delimiter = data_all[:, -3] == "<P>"
delimiter_rows = np.where(is_delimiter)[0]

# Cut the array in front of every delimiter row except the first one, so each
# chunk after the first starts with a "<P>" row.
utts_all = np.split(data_all, delimiter_rows[1:])

#### Separate labels from features

In [10]:
# Per utterance: drop the first row when the utterance has more than one row,
# then keep only column -4 (has_accent) as the label vector.
labels = [
    (np.delete(utt, 0, axis=0) if len(utt) > 1 else utt)[:, -4]
    for utt in utts_all
]

#### Remove unnecessary columns from the feature array

In [11]:
# Per utterance: strip the last four columns (has_accent, label, start, end),
# then drop the first row when more than one row is left.
utts_all_cleaned = []
for utt in utts_all:
    feature_rows = np.delete(utt, np.s_[-4:], axis=1)
    utts_all_cleaned.append(
        np.delete(feature_rows, 0, axis=0) if len(feature_rows) > 1 else feature_rows
    )

### Store as NumPy arrays

In [12]:
# Switch to the directory the arrays are written to.
# NOTE: the path is relative to the current working directory, so running this
# cell a second time in the same kernel resolves it from the new location.
os.chdir("../../data/features/word_based/sets")

In [14]:
# Write features and labels as object-dtype arrays into the current working
# directory (object arrays are pickled, so np.load needs allow_pickle=True).
for filename, utterances in (
    ("word_features.npy", utts_all_cleaned),
    ("word_labels.npy", labels),
):
    np.save(filename, np.array(utterances, dtype=object))