# Data table management & preprocessing
---

This notebook contains code to manage the output of the extraction scripts `extract_features.py`, `find_syllable_nuclei.py` and `prepare_data.py`.
* Cleans out duplicates found by the syllable nucleus detection algorithm
* Imputes missing data by replacing NAs with mean feature values
* Scales features with three scikit-learn scalers (`MinMaxScaler` to [0, 1], `MinMaxScaler` to [-1, 1], and `RobustScaler`)
* Consolidates DataFrames
* (optional: Balances classes using SMOTENC)
* Reshapes DataFrames to sequences of nuclei

## Importing necessary packages

In [1]:
import pandas as pd
import numpy as np
import os
from glob import glob
from sklearn import preprocessing, impute, decomposition
from promdetect.prep import process_annotations
from functools import reduce
from imblearn.over_sampling import SMOTENC

---
## Import stored feature DataFrames


In [2]:
# Location of the extracted feature tables, relative to the directory the
# notebook was started from.
features_dir = "../../data/features/nucleus_based/"

# Remember the start directory the first time this cell runs, so that
# re-running the cell does not apply the relative path a second time.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_DIR, features_dir))

# glob() returns paths in arbitrary order; sort them so that anything built
# per recording from this list has a reproducible order.
recordings = sorted(glob("raw/dlf*"))
print("Number of recordings:", len(recordings)) # should be 55

Number of recordings: 55


## Define pre-processing functions

### Read data and clean out duplicates within each recording

In [3]:
def read_reformat(recording):
    """Load one recording's feature table and add metadata and binary columns.

    Parameters
    ----------
    recording : str
        Path of the feature CSV. The same value is also handed to
        ``process_annotations.AnnotationReader`` to look up speaker info.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with rows de-duplicated on ``(start_est, end)`` and
        four added columns: ``recording`` (the path), ``speaker_gender``
        (element 1 of the speaker info), ``has_accent`` (1 where
        ``accent_label`` is not NA, else 0) and ``gender`` (1 where
        ``speaker_gender`` is ``"m"``, else 0).
    """
    table = pd.read_csv(recording)
    reader = process_annotations.AnnotationReader(recording)
    table = table.assign(
        recording=recording,
        speaker_gender=reader.get_speaker_info()[1],
    )

    # Drop repeated nuclei, i.e. rows sharing the same start and end values.
    unique_rows = table.drop_duplicates(subset=['start_est', 'end']).copy()

    # Encode the label and the speaker gender as 0/1 integers.
    unique_rows["has_accent"] = unique_rows["accent_label"].notna().astype(int)
    unique_rows["gender"] = unique_rows["speaker_gender"].eq("m").astype(int)

    return unique_rows


### Impute missing values (using recording-internal mean value for the feature)

In [4]:
def impute_missing(df):
    """Fill missing feature values with the column mean computed on ``df``.

    Metadata columns are removed first; every remaining column is passed
    through ``SimpleImputer`` with its default settings. The ``gender`` and
    ``has_accent`` columns are then restored from ``df`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the metadata columns listed below, the feature
        columns, and the ``gender`` / ``has_accent`` columns.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the same index as ``df`` and without the metadata
        columns.

    Raises
    ------
    KeyError
        If one of the metadata columns is absent from ``df``.
    ValueError
        If a feature column contains no observed value at all, so no mean
        can be computed for it.
    """
    # Columns that describe the nucleus rather than measure it.
    metadata_cols = ["recording", "speaker_gender", "accent_label", "Unnamed: 0",
                     "index", "nucl_time", "phone", "word", "bound_tone",
                     "start_est", "end", "word_start", "word_end", "ip_start",
                     "ip_end", "accent_time"]
    # drop() already returns a new frame, so the input is left untouched.
    df_measurements = df.drop(columns=metadata_cols)

    # SimpleImputer silently discards columns without any observed value, which
    # would leave fewer output columns than names; fail with a clear message.
    empty_cols = df_measurements.columns[df_measurements.isna().all()].tolist()
    if empty_cols:
        raise ValueError(
            f"Cannot impute columns without any observed value: {empty_cols}"
        )

    imputer = impute.SimpleImputer()
    df_imp = pd.DataFrame(
        imputer.fit_transform(df_measurements),
        columns=df_measurements.columns,
        index=df_measurements.index,
    )

    # Restore the two binary columns from the input frame.
    df_imp[["gender", "has_accent"]] = df[["gender", "has_accent"]]

    return df_imp

### Standardize values using sklearn.preprocessing functions

In [5]:
def standardize(df):
    """Scale the feature columns of ``df`` and drop the label column.

    Three groups of columns are scaled, each with a scaler fitted on the rows
    of ``df`` itself:

    * ``cols_abs``     -> ``MinMaxScaler`` with its default range
    * ``cols_neg_pos`` -> ``MinMaxScaler`` with range (-1, 1)
    * ``cols_robust``  -> ``RobustScaler``

    Columns not named in any group (e.g. ``gender``) are passed through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the three groups plus
        ``has_accent``.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df`` without the ``has_accent`` column. ``df`` itself
        is not modified.
    """
    df_standard = df.copy()

    # "f0_range_nuclei" used to be listed here as well as in cols_robust and was
    # therefore scaled twice. It is kept only in cols_robust: the robust scaler
    # ran last, and its median/IQR scaling cancels a preceding min-max rescale.
    cols_abs = ["duration_est", "rms", "duration_normed", "min_intensity_nuclei",
                "max_intensity_nuclei", "intensity_std_nuclei", "mean_intensity_nuclei",
                "min_intensity_pos", "max_intensity_pos", "f0_max_nuclei",
                "f0_min_nuclei", "f0_mean_nuclei", "f0_std_nuclei", "f0_min_pos",
                "f0_max_pos"]
    cols_neg_pos = ["excursion_word", "excursion_ip"]
    cols_robust = ["pitch_slope", "f0_range_nuclei", "spectral_tilt_mean",
                   "spectral_tilt_range", "min_spectral_tilt", "max_spectral_tilt",
                   "spectral_cog", "h1_h2"]

    abs_scaler = preprocessing.MinMaxScaler()
    # feature_range must be a tuple; recent scikit-learn releases reject a list.
    neg_pos_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    robust_scaler = preprocessing.RobustScaler()

    df_standard[cols_abs] = abs_scaler.fit_transform(df_standard[cols_abs].values)
    df_standard[cols_neg_pos] = neg_pos_scaler.fit_transform(df_standard[cols_neg_pos].values)
    df_standard[cols_robust] = robust_scaler.fit_transform(df_standard[cols_robust].values)

    # The label must not be part of the feature matrix.
    df_standard = df_standard.drop(columns="has_accent")

    return df_standard

---
### Run the functions defined above and consolidate data into unified arrays, separate for features and labels

#### Without SMOTENC:

In [9]:
# Run the per-recording pipeline and collect one feature matrix and one label
# vector per recording.
data, labels = [], []
for recording in recordings:
    df = read_reformat(recording)
    df_imp = impute_missing(df)

    # Labels come from the imputed frame because standardize() drops "has_accent".
    labels.append(df_imp["has_accent"].to_numpy())

    df_standard = standardize(df_imp)
    data.append(df_standard.to_numpy())

In [10]:
# Pack the per-recording label vectors and feature matrices into NumPy arrays
# with dtype=object.
# NOTE(review): presumably recordings differ in their number of nuclei, so each
# result is a 1-D array holding one array per recording — confirm. Inputs of
# identical shape would instead be stacked into one multi-dimensional object array.
labels_np = np.array(labels, dtype=object)
data_np = np.array(data, dtype=object)

##### Save the feature and label arrays to files

In [11]:
# np.save does not create missing directories, so make sure the target exists.
os.makedirs("sets", exist_ok=True)

# Object arrays are stored via pickle; read them back with
# np.load(..., allow_pickle=True).
np.save("sets/data_standard.npy", data_np)
np.save("sets/data_standard_labels.npy", labels_np)

#### With SMOTENC

In [6]:
# Run the per-recording pipeline, then oversample the minority class within
# each recording with SMOTENC before collecting features and labels.
data_smotenc = []
labels_smotenc = []
for recording in recordings:

    df = read_reformat(recording)

    df_imp = impute_missing(df)

    df_standard = standardize(df_imp)

    # SMOTENC needs the position of every categorical column. Look "gender" up
    # by name instead of hard-coding an index: the position depends on how many
    # feature columns the CSV holds, and a wrong index makes SMOTENC treat a
    # continuous feature as categorical (and gender as continuous).
    gender_idx = df_standard.columns.get_loc("gender")

    # Fixed seed so that the synthetic samples are reproducible across runs.
    smote = SMOTENC(categorical_features=[gender_idx], random_state=42)
    data_balanced, labels_balanced = smote.fit_resample(df_standard.to_numpy(), df_imp["has_accent"].to_numpy())

    data_smotenc.append(data_balanced)
    labels_smotenc.append(labels_balanced)

In [7]:
# Pack the resampled per-recording label vectors and feature matrices into
# NumPy arrays with dtype=object.
# NOTE(review): presumably the resampled recordings differ in length, so each
# result is a 1-D array holding one array per recording — confirm. Inputs of
# identical shape would instead be stacked into one multi-dimensional object array.
labels_np_smotenc = np.array(labels_smotenc, dtype=object)
data_np_smotenc = np.array(data_smotenc, dtype=object)

In [8]:
# np.save does not create missing directories, so make sure the target exists.
os.makedirs("sets", exist_ok=True)

# Object arrays are stored via pickle; read them back with
# np.load(..., allow_pickle=True).
np.save("sets/data_standard_smotenc.npy", data_np_smotenc)
np.save("sets/data_standard_labels_smotenc.npy", labels_np_smotenc)