# Data Processing

In [1]:
"""
The regions of interest are labeled and the training sets (70% of the data set) 
and test (30% of the data set) are created.

@author: Juan Felipe Latorre Gil - jflatorreg@unal.edu.co
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def class_matching(df_features):
    
    """
    Regions of interest are labeled.
    
    Parameters:
    ----------
        df_features: DataFrame 
            DataFrame with the information and characteristics of the regions of interest.          
    Returns:
    -------
        df_features_labeled: DataFrame 
        DataFrame with the information, labels and characteristics of the regions of interest.
            
    """ 

    df = pd.concat([df_features,
                    lab_wname,
                    lab_bin],
                    axis=1)
    df.reset_index(inplace=True, drop=True)
    df.dropna(axis=0, inplace=True)
    df = df.loc[~df.lab_wname.isin(['1_vfar','1_ago']),:]
    df['lab_gt'] = df['lab_gt'].astype(int)
    return df

In [3]:
def split(X, y):
    
    """
    Split the dataset into 70% for training and 30% for testing.
    
    Parameters:
    ----------
        X: numpy.array 
           Array with the characteristics of the regions of interest. 
        y: numpy.array  
           Array with labels of regions of interest.
    Returns:
    -------
        X_train: numpy.array 
            Array with the characteristics of the regions of interest for training.
        X_test: numpy.array
            Array with the characteristics of the regions of interest for test.
        y_train: numpy.array
            Array with labels of regions of interest for training.
        y_test: numpy.array
            Array with labels of regions of interest for test.
    """ 

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        stratify=y,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

In [4]:
path_mannot = './data/trainds_mannot.txt'
path_low  = './results/df_features_low.csv'
path_med  = './results/df_features_med.csv'
path_high = './results/df_features_high.csv'
path_times = './results/df_times.csv'
Path_save = './results'

# Load Data

In [5]:
df_features_low  = pd.read_csv(path_low)
df_features_med  = pd.read_csv(path_med)
df_features_high = pd.read_csv(path_high)
df_times = pd.read_csv(path_times)

In [6]:
df_features_low.head()

Unnamed: 0,min_f,min_t,max_f,max_t,shp_001,shp_002,shp_003,shp_004,shp_005,shp_006,...,shp_009,shp_010,shp_011,shp_012,shp_013,shp_014,shp_015,shp_016,frequency,fname
0,1033.59375,13.641723,3962.109375,13.943583,0.215264,0.19099,0.116268,0.126183,0.12912,0.193653,...,0.131789,0.240307,0.086261,0.134829,0.152529,0.332851,0.09393,0.1701,2540.917969,BETA-_20161029_070000.wav
1,1033.59375,89.327166,3962.109375,89.629025,0.182053,0.154664,0.11454,0.134009,0.10807,0.155816,...,0.071819,0.152941,0.116785,0.257436,0.094424,0.18564,0.192523,0.398138,2540.917969,BETA_20161115_160000.wav
2,1033.59375,40.449161,3962.109375,40.70458,0.233314,0.220245,0.126293,0.154195,0.179288,0.242303,...,0.205265,0.352204,0.231775,0.318858,0.330626,0.514024,0.231017,0.287843,2454.785156,BETA_20161128_032000.wav
3,1033.59375,81.88517,3962.109375,82.198639,0.16995,0.15895,0.107573,0.136715,0.111021,0.158293,...,0.0955,0.20659,0.143788,0.264087,0.150681,0.37541,0.226039,0.351922,2540.917969,BETA-_20161109_070000.wav
4,1033.59375,10.10068,3962.109375,10.42576,0.210391,0.20108,0.115546,0.134888,0.151931,0.202441,...,0.14589,0.203887,0.124151,0.240277,0.122903,0.306885,0.164721,0.344301,2497.851562,BETA_20161112_100000.wav


In [7]:
df_features_med.head()

Unnamed: 0,min_f,min_t,max_f,max_t,shp_001,shp_002,shp_003,shp_004,shp_005,shp_006,...,shp_041,shp_042,shp_043,shp_044,shp_045,shp_046,shp_047,shp_048,frequency,fname
0,1033.59375,13.641723,3962.109375,13.943583,0.215264,0.19099,0.181079,0.124634,0.116268,0.126183,...,0.288658,0.432378,0.096456,0.133245,0.081844,0.245924,0.134406,0.140101,2540.917969,BETA-_20161029_070000.wav
1,1033.59375,89.327166,3962.109375,89.629025,0.182053,0.154664,0.157382,0.121428,0.11454,0.134009,...,0.262315,0.525691,0.239708,0.344254,0.154582,0.156503,0.243071,0.351401,2540.917969,BETA_20161115_160000.wav
2,1033.59375,40.449161,3962.109375,40.70458,0.233314,0.220245,0.186017,0.152007,0.126293,0.154195,...,0.418175,0.353023,0.119964,0.19496,0.096618,0.138694,0.248574,0.250988,2454.785156,BETA_20161128_032000.wav
3,1033.59375,81.88517,3962.109375,82.198639,0.16995,0.15895,0.167527,0.131663,0.107573,0.136715,...,0.194589,0.295191,0.090208,0.060823,0.040825,0.083359,0.078811,0.067376,2540.917969,BETA-_20161109_070000.wav
4,1033.59375,10.10068,3962.109375,10.42576,0.210391,0.20108,0.181107,0.14418,0.115546,0.134888,...,0.252186,0.202034,0.066697,0.036928,0.082994,0.13533,0.089679,0.080279,2497.851562,BETA_20161112_100000.wav


In [8]:
df_features_high.head()

Unnamed: 0,min_f,min_t,max_f,max_t,shp_001,shp_002,shp_003,shp_004,shp_005,shp_006,...,shp_185,shp_186,shp_187,shp_188,shp_189,shp_190,shp_191,shp_192,frequency,fname
0,1033.59375,13.641723,3962.109375,13.943583,0.069621,0.215264,0.230303,0.19099,0.255856,0.232728,...,0.090906,0.134406,0.148826,0.140101,0.168458,0.262424,0.28865,0.246016,2540.917969,BETA-_20161029_070000.wav
1,1033.59375,89.327166,3962.109375,89.629025,0.059781,0.182053,0.189366,0.154664,0.233387,0.206645,...,0.122027,0.243071,0.314078,0.351401,0.357138,0.309597,0.323889,0.325886,2540.917969,BETA_20161115_160000.wav
2,1033.59375,40.449161,3962.109375,40.70458,0.077478,0.233314,0.252418,0.220245,0.280293,0.261196,...,0.219052,0.248574,0.26548,0.250988,0.335723,0.374718,0.339385,0.24226,2454.785156,BETA_20161128_032000.wav
3,1033.59375,81.88517,3962.109375,82.198639,0.058568,0.16995,0.181569,0.15895,0.235549,0.213036,...,0.06351,0.078811,0.083724,0.067376,0.132558,0.129177,0.159821,0.13619,2540.917969,BETA-_20161109_070000.wav
4,1033.59375,10.10068,3962.109375,10.42576,0.070442,0.210391,0.230359,0.20108,0.253394,0.234836,...,0.110958,0.089679,0.083098,0.080279,0.167874,0.170719,0.143566,0.126976,2497.851562,BETA_20161112_100000.wav


In [9]:
df_times

Unnamed: 0,Time Low (s),Time Med (s),Time High (s)
0,534.24422,1033.823215,3332.736162


# Get Labels

In [10]:
gt = pd.read_csv(path_mannot, header=None, usecols=[0,1,2], sep='\t', 
                 names=['onset','offset','label'])
idx_annotated = (gt.label.str[1]=='_')
lab_wname = gt['label']
lab_wname.loc[~idx_annotated] = np.nan
lab_bin = lab_wname.str[0]
lab_bin.name = 'lab_gt'
lab_wname.name = 'lab_wname'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


# Label Regions of Interest

In [11]:
df_features_low = class_matching(df_features_low)
df_features_med = class_matching(df_features_med)
df_features_high = class_matching(df_features_high)

In [12]:
X_low = df_features_low.loc[:,df_features_low.columns.str.startswith('shp')].values
y_low = df_features_low.loc[:,'lab_gt'].values

In [13]:
X_low.shape

(1110, 16)

In [14]:
X_med = df_features_med.loc[:,df_features_med.columns.str.startswith('shp')].values
y_med = df_features_med.loc[:,'lab_gt'].values

In [15]:
X_med.shape

(1110, 48)

In [16]:
X_high = df_features_high.loc[:,df_features_high.columns.str.startswith('shp')].values
y_high = df_features_high.loc[:,'lab_gt'].values

In [17]:
X_high.shape

(1110, 192)

# Split Data

In [18]:
X_train_low, X_test_low, y_train_low, y_test_low = split(X_low, y_low)

In [19]:
X_train_med, X_test_med, y_train_med, y_test_med = split(X_med, y_med)

In [20]:
X_train_high, X_test_high, y_train_high, y_test_high = split(X_high, y_high)

## Save Data

In [21]:
np.save('./results/X_train_low.npy', X_train_low)
np.save('./results/X_test_low.npy', X_test_low)
np.save('./results/y_train_low.npy', y_train_low)
np.save('./results/y_test_low.npy', y_test_low)

In [22]:
np.save('./results/X_train_med.npy', X_train_med)
np.save('./results/X_test_med.npy', X_test_med)
np.save('./results/y_train_med.npy', y_train_med)
np.save('./results/y_test_med.npy', y_test_med)

In [23]:
np.save('./results/X_train_high.npy', X_train_high)
np.save('./results/X_test_high.npy', X_test_high)
np.save('./results/y_train_high.npy', y_train_high)
np.save('./results/y_test_high.npy', y_test_high)