# Data Augmentation Data Processing

In [1]:
"""
The regions of interest of augmented data are labeled and the training and test sets are created.

@author: Juan Felipe Latorre Gil - jflatorreg@unal.edu.co
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def class_matching(df_features):
    
    """
    Regions of interest are labeled.
    
    Parameters:
    ----------
        df_features: DataFrame 
            DataFrame with the information and characteristics of the regions of interest.          
    Returns:
    -------
        df_features_labeled: DataFrame 
        DataFrame with the information, labels and characteristics of the regions of interest.
            
    """ 
    
    df = pd.concat([df_features,
                    lab_wname,
                    lab_bin],
                    axis=1)
    df.reset_index(inplace=True, drop=True)
    df.dropna(axis=0, inplace=True)
    df = df.loc[~df.lab_wname.isin(['1_vfar','1_ago']),:]
    df['lab_gt'] = df['lab_gt'].astype(int)
    return df

In [3]:
def split(X, y):
    
    """
    Split the dataset into 70% for training and 30% for testing.
    
    Parameters:
    ----------
        X: numpy.array 
           Array with the characteristics of the regions of interest. 
        y: numpy.array  
           Array with labels of regions of interest.
    Returns:
    -------
        X_train: numpy.array 
            Array with the characteristics of the regions of interest for training.
        X_test: numpy.array
            Array with the characteristics of the regions of interest for test.
        y_train: numpy.array
            Array with labels of regions of interest for training.
        y_test: numpy.array
            Array with labels of regions of interest for test.
    """ 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        stratify=y,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

In [4]:
path_mannot = './data/trainds_mannot.txt'
gt = pd.read_csv(path_mannot, header=None, usecols=[0,1,2], sep='\t', 
                 names=['onset','offset','label'])
idx_annotated = (gt.label.str[1]=='_')
lab_wname = gt['label']
lab_wname.loc[~idx_annotated] = np.nan
lab_bin = lab_wname.str[0]
lab_bin.name = 'lab_gt'
lab_wname.name = 'lab_wname'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
path_low  = './results/df_features_low.csv'
path_med  = './results/df_features_med.csv'
path_high = './results/df_features_high.csv'

In [6]:
df_features_low  = pd.read_csv(path_low)
df_features_med  = pd.read_csv(path_med)
df_features_high = pd.read_csv(path_high)

In [7]:
df_features_low = class_matching(df_features_low)
df_features_med = class_matching(df_features_med)
df_features_high = class_matching(df_features_high)

In [8]:
df_features_low.head()

Unnamed: 0,min_f,min_t,max_f,max_t,shp_001,shp_002,shp_003,shp_004,shp_005,shp_006,...,shp_011,shp_012,shp_013,shp_014,shp_015,shp_016,frequency,fname,lab_wname,lab_gt
0,1033.59375,13.641723,3962.109375,13.943583,0.215264,0.19099,0.116268,0.126183,0.12912,0.193653,...,0.086261,0.134829,0.152529,0.332851,0.09393,0.1701,2540.917969,BETA-_20161029_070000.wav,0_bio,0
1,1033.59375,89.327166,3962.109375,89.629025,0.182053,0.154664,0.11454,0.134009,0.10807,0.155816,...,0.116785,0.257436,0.094424,0.18564,0.192523,0.398138,2540.917969,BETA_20161115_160000.wav,0_bio,0
2,1033.59375,40.449161,3962.109375,40.70458,0.233314,0.220245,0.126293,0.154195,0.179288,0.242303,...,0.231775,0.318858,0.330626,0.514024,0.231017,0.287843,2454.785156,BETA_20161128_032000.wav,0_bio,0
3,1033.59375,81.88517,3962.109375,82.198639,0.16995,0.15895,0.107573,0.136715,0.111021,0.158293,...,0.143788,0.264087,0.150681,0.37541,0.226039,0.351922,2540.917969,BETA-_20161109_070000.wav,0_bio,0
4,1033.59375,10.10068,3962.109375,10.42576,0.210391,0.20108,0.115546,0.134888,0.151931,0.202441,...,0.124151,0.240277,0.122903,0.306885,0.164721,0.344301,2497.851562,BETA_20161112_100000.wav,0_bio,0


In [9]:
df_features_low['lab_wname'].value_counts()

0_bio      387
0_geo      325
1_med      123
1_cl       107
1_ovlp     100
1_far       45
0_dog       17
0_voice      4
0_ovlp       2
Name: lab_wname, dtype: int64

In [10]:
id_1_cl = list(df_features_low[df_features_low['lab_wname']=='1_cl'].index)
id_1_med = list(df_features_low[df_features_low['lab_wname']=='1_med'].index)
id_1_ovlp = list(df_features_low[df_features_low['lab_wname']=='1_ovlp'].index)
id_1_far = list(df_features_low[df_features_low['lab_wname']=='1_far'].index)

In [11]:
id_0 = list(df_features_low[df_features_low['lab_gt']== 0].index)

In [12]:
np.random.seed(24)

data_percent = 0.3

id_1_cl = np.random.choice(id_1_cl, int(len(id_1_cl)*data_percent), replace=False)
id_1_med = np.random.choice(id_1_med, int(len(id_1_med)*data_percent), replace=False)
id_1_ovlp = np.random.choice(id_1_ovlp, int(len(id_1_ovlp)*data_percent), replace=False)
id_1_far = np.random.choice(id_1_far, int(len(id_1_far)*data_percent), replace=False)
id_0 = np.random.choice(id_0, int(len(id_0)*data_percent), replace=False)

In [13]:
len(id_0)

220

In [14]:
s_1_cl = pd.Series(id_1_cl, index=id_1_cl)
s_1_med = pd.Series(id_1_med, index=id_1_med)
s_1_ovlp = pd.Series(id_1_ovlp, index=id_1_ovlp)
s_1_far = pd.Series(id_1_far, index=id_1_far)

s_0 = pd.Series(id_0, index=id_0)

In [15]:
s_id = s_1_cl.append([s_1_med,s_1_ovlp,s_1_far,s_0])
#s_id = s_1_cl.append([s_1_med,s_1_ovlp,s_1_far])



s_id_0 = s_1_cl.append([s_1_med,s_1_ovlp,s_1_far,s_0])

In [16]:
s_id 

415      415
431      431
1060    1060
845      845
1064    1064
445      445
422      422
803      803
1057    1057
1678    1678
1674    1674
1839    1839
1694    1694
823      823
1129    1129
1116    1116
843      843
1110    1110
1061    1061
1050    1050
1637    1637
1863    1863
1056    1056
825      825
1672    1672
1068    1068
1474    1474
805      805
1089    1089
842      842
        ... 
528      528
324      324
326      326
416      416
867      867
1341    1341
1213    1213
1890    1890
1588    1588
206      206
235      235
1464    1464
1350    1350
1775    1775
1386    1386
1445    1445
536      536
727      727
1380    1380
1364    1364
129      129
744      744
106      106
872      872
1381    1381
970      970
325      325
1593    1593
1369    1369
979      979
Length: 331, dtype: int64

In [17]:
print('Datos Anotados Categoria 1:', len(s_id) - len(id_0))

Datos Anotados Categoria 1: 111


In [18]:
id_selc = s_id.tolist()
id_selc_0 = s_id_0.tolist()

In [19]:
len(id_selc)

331

In [20]:
len(id_selc_0 )

331

In [21]:
X_low = df_features_low.loc[:,df_features_low.columns.str.startswith('shp')]
y_low = df_features_low.loc[:,'lab_gt']

X_med = df_features_med.loc[:,df_features_med.columns.str.startswith('shp')]
y_med = df_features_med.loc[:,'lab_gt']

X_high = df_features_high.loc[:,df_features_high.columns.str.startswith('shp')]
y_high = df_features_high.loc[:,'lab_gt']

In [22]:
X_train_low, X_test_low, y_train_low, y_test_low = split(X_low, y_low)

X_train_med, X_test_med, y_train_med, y_test_med = split(X_med, y_med)

X_train_high, X_test_high, y_train_high, y_test_high = split(X_high, y_high)

In [23]:
X_train_low = X_train_low.reindex(id_selc_0).dropna()
y_train_low = y_train_low.reindex(id_selc_0).dropna()

X_train_med = X_train_med.reindex(id_selc_0).dropna()
y_train_med = y_train_med.reindex(id_selc_0).dropna()

X_train_high = X_train_high.reindex(id_selc_0).dropna()
y_train_high = y_train_high.reindex(id_selc_0).dropna()

In [24]:
len(y_train_low)

229

In [25]:
len(y_train_low)

229

In [26]:
path_low_set_1  = './data_augmentation/df_features_low_set_1.csv'
path_med_set_1  = './data_augmentation/df_features_med_set_1.csv'
path_high_set_1 = './data_augmentation/df_features_high_set_1.csv'

In [27]:
df_features_low_set_1  = pd.read_csv(path_low_set_1)
df_features_med_set_1  = pd.read_csv(path_med_set_1)
df_features_high_set_1 = pd.read_csv(path_high_set_1)

In [28]:
df_features_low_set_1 = class_matching(df_features_low_set_1)
df_features_med_set_1 = class_matching(df_features_med_set_1)
df_features_high_set_1 = class_matching(df_features_high_set_1)

In [29]:
path_low_set_2  = './data_augmentation/df_features_low_set_2.csv'
path_med_set_2  = './data_augmentation/df_features_med_set_2.csv'
path_high_set_2 = './data_augmentation/df_features_high_set_2.csv'

In [30]:
df_features_low_set_2  = pd.read_csv(path_low_set_2)
df_features_med_set_2  = pd.read_csv(path_med_set_2)
df_features_high_set_2 = pd.read_csv(path_high_set_2)

In [31]:
df_features_low_set_2 = class_matching(df_features_low_set_2)
df_features_med_set_2 = class_matching(df_features_med_set_2)
df_features_high_set_2 = class_matching(df_features_high_set_2)

In [32]:
path_low_set_3  = './data_augmentation/df_features_low_set_3.csv'
path_med_set_3  = './data_augmentation/df_features_med_set_3.csv'
path_high_set_3 = './data_augmentation/df_features_high_set_3.csv'

In [33]:
df_features_low_set_3  = pd.read_csv(path_low_set_3)
df_features_med_set_3  = pd.read_csv(path_med_set_3)
df_features_high_set_3 = pd.read_csv(path_high_set_3)

In [34]:
df_features_low_set_3 = class_matching(df_features_low_set_3)
df_features_med_set_3 = class_matching(df_features_med_set_3)
df_features_high_set_3 = class_matching(df_features_high_set_3)

In [35]:
X_low_set_1 = df_features_low_set_1.loc[:,df_features_low_set_1.columns.str.startswith('shp')]
y_low_set_1 = df_features_low_set_1.loc[:,'lab_gt']

X_med_set_1 = df_features_med_set_1.loc[:,df_features_med_set_1.columns.str.startswith('shp')]
y_med_set_1 = df_features_med_set_1.loc[:,'lab_gt']

X_high_set_1 = df_features_high_set_1.loc[:,df_features_high_set_1.columns.str.startswith('shp')]
y_high_set_1 = df_features_high_set_1.loc[:,'lab_gt']

In [36]:
X_low_set_2 = df_features_low_set_2.loc[:,df_features_low_set_2.columns.str.startswith('shp')]
y_low_set_2 = df_features_low_set_2.loc[:,'lab_gt']

X_med_set_2 = df_features_med_set_2.loc[:,df_features_med_set_2.columns.str.startswith('shp')]
y_med_set_2 = df_features_med_set_2.loc[:,'lab_gt']

X_high_set_2 = df_features_high_set_2.loc[:,df_features_high_set_2.columns.str.startswith('shp')]
y_high_set_2 = df_features_high_set_2.loc[:,'lab_gt']

In [37]:
X_low_set_3 = df_features_low_set_3.loc[:,df_features_low_set_3.columns.str.startswith('shp')]
y_low_set_3 = df_features_low_set_3.loc[:,'lab_gt']

X_med_set_3 = df_features_med_set_3.loc[:,df_features_med_set_3.columns.str.startswith('shp')]
y_med_set_3 = df_features_med_set_3.loc[:,'lab_gt']

X_high_set_3 = df_features_high_set_3.loc[:,df_features_high_set_3.columns.str.startswith('shp')]
y_high_set_3 = df_features_high_set_3.loc[:,'lab_gt']

In [38]:
len(y_high_set_3)

1110

In [39]:
X_train_low_set_1, X_test_low_set_1, y_train_low_set_1, y_test_low_set_1 = split(X_low_set_1, y_low_set_1)

X_train_med_set_1, X_test_med_set_1, y_train_med_set_1, y_test_med_set_1 = split(X_med_set_1, y_med_set_1)

X_train_high_set_1, X_test_high_set_1, y_train_high_set_1, y_test_high_set_1 = split(X_high_set_1, y_high_set_1)

In [40]:
X_train_low_set_1 = X_train_low_set_1.reindex(id_selc).dropna()
y_train_low_set_1 = y_train_low_set_1.reindex(id_selc).dropna()

X_train_med_set_1 = X_train_med_set_1.reindex(id_selc).dropna()
y_train_med_set_1 = y_train_med_set_1.reindex(id_selc).dropna()

X_train_high_set_1 = X_train_high_set_1.reindex(id_selc).dropna()
y_train_high_set_1 = y_train_high_set_1.reindex(id_selc).dropna()

In [41]:
X_train_low_set_2, X_test_low_set_2, y_train_low_set_2, y_test_low_set_2 = split(X_low_set_2, y_low_set_2)

X_train_med_set_2, X_test_med_set_2, y_train_med_set_2, y_test_med_set_2 = split(X_med_set_2, y_med_set_2)

X_train_high_set_2, X_test_high_set_2, y_train_high_set_2, y_test_high_set_2 = split(X_high_set_2, y_high_set_2)

In [42]:
X_train_low_set_2 = X_train_low_set_2.reindex(id_selc).dropna()
y_train_low_set_2 = y_train_low_set_2.reindex(id_selc).dropna()

X_train_med_set_2 = X_train_med_set_2.reindex(id_selc).dropna()
y_train_med_set_2 = y_train_med_set_2.reindex(id_selc).dropna()

X_train_high_set_2 = X_train_high_set_2.reindex(id_selc).dropna()
y_train_high_set_2 = y_train_high_set_2.reindex(id_selc).dropna()

In [43]:
X_train_low_set_3, X_test_low_set_3, y_train_low_set_3, y_test_low_set_3 = split(X_low_set_3, y_low_set_3)

X_train_med_set_3, X_test_med_set_3, y_train_med_set_3, y_test_med_set_3 = split(X_med_set_3, y_med_set_3)

X_train_high_set_3, X_test_high_set_3, y_train_high_set_3, y_test_high_set_3 = split(X_high_set_3, y_high_set_3)

In [44]:
X_train_low_set_3 = X_train_low_set_3.reindex(id_selc).dropna()
y_train_low_set_3 = y_train_low_set_3.reindex(id_selc).dropna()

X_train_med_set_3 = X_train_med_set_3.reindex(id_selc).dropna()
y_train_med_set_3 = y_train_med_set_3.reindex(id_selc).dropna()

X_train_high_set_3 = X_train_high_set_3.reindex(id_selc).dropna()
y_train_high_set_3 = y_train_high_set_3.reindex(id_selc).dropna()

In [45]:
len(y_high_set_3)

1110

In [46]:
np.save('./data_augmentation/X_train_low.npy', X_train_low)
np.save('./data_augmentation/X_test_low.npy', X_test_low)
np.save('./data_augmentation/y_train_low.npy', y_train_low)
np.save('./data_augmentation/y_test_low.npy', y_test_low)

np.save('./data_augmentation/X_train_med.npy', X_train_med)
np.save('./data_augmentation/X_test_med.npy', X_test_med)
np.save('./data_augmentation/y_train_med.npy', y_train_med)
np.save('./data_augmentation/y_test_med.npy', y_test_med)

np.save('./data_augmentation/X_train_high.npy', X_train_high)
np.save('./data_augmentation/X_test_high.npy', X_test_high)
np.save('./data_augmentation/y_train_high.npy', y_train_high)
np.save('./data_augmentation/y_test_high.npy', y_test_high)

In [47]:
np.save('./data_augmentation/X_train_low_set_1.npy', X_train_low_set_1)
np.save('./data_augmentation/X_test_low_set_1.npy', X_test_low_set_1)
np.save('./data_augmentation/y_train_low_set_1.npy', y_train_low_set_1)
np.save('./data_augmentation/y_test_low_set_1.npy', y_test_low_set_1)

np.save('./data_augmentation/X_train_med_set_1.npy', X_train_med_set_1)
np.save('./data_augmentation/X_test_med_set_1.npy', X_test_med_set_1)
np.save('./data_augmentation/y_train_med_set_1.npy', y_train_med_set_1)
np.save('./data_augmentation/y_test_med_set_1.npy', y_test_med_set_1)

np.save('./data_augmentation/X_train_high_set_1.npy', X_train_high_set_1)
np.save('./data_augmentation/X_test_high_set_1.npy', X_test_high_set_1)
np.save('./data_augmentation/y_train_high_set_1.npy', y_train_high_set_1)
np.save('./data_augmentation/y_test_high_set_1.npy', y_test_high_set_1)

In [48]:
np.save('./data_augmentation/X_train_low_set_2.npy', X_train_low_set_2)
np.save('./data_augmentation/X_test_low_set_2.npy', X_test_low_set_2)
np.save('./data_augmentation/y_train_low_set_2.npy', y_train_low_set_2)
np.save('./data_augmentation/y_test_low_set_2.npy', y_test_low_set_2)

np.save('./data_augmentation/X_train_med_set_2.npy', X_train_med_set_2)
np.save('./data_augmentation/X_test_med_set_2.npy', X_test_med_set_2)
np.save('./data_augmentation/y_train_med_set_2.npy', y_train_med_set_2)
np.save('./data_augmentation/y_test_med_set_2.npy', y_test_med_set_2)

np.save('./data_augmentation/X_train_high_set_2.npy', X_train_high_set_2)
np.save('./data_augmentation/X_test_high_set_2.npy', X_test_high_set_2)
np.save('./data_augmentation/y_train_high_set_2.npy', y_train_high_set_2)
np.save('./data_augmentation/y_test_high_set_2.npy', y_test_high_set_2)

In [49]:
np.save('./data_augmentation/X_train_low_set_3.npy', X_train_low_set_3)
np.save('./data_augmentation/X_test_low_set_3.npy', X_test_low_set_3)
np.save('./data_augmentation/y_train_low_set_3.npy', y_train_low_set_3)
np.save('./data_augmentation/y_test_low_set_3.npy', y_test_low_set_3)

np.save('./data_augmentation/X_train_med_set_3.npy', X_train_med_set_3)
np.save('./data_augmentation/X_test_med_set_3.npy', X_test_med_set_3)
np.save('./data_augmentation/y_train_med_set_3.npy', y_train_med_set_3)
np.save('./data_augmentation/y_test_med_set_3.npy', y_test_med_set_3)

np.save('./data_augmentation/X_train_high_set_3.npy', X_train_high_set_3)
np.save('./data_augmentation/X_test_high_set_3.npy', X_test_high_set_3)
np.save('./data_augmentation/y_train_high_set_3.npy', y_train_high_set_3)
np.save('./data_augmentation/y_test_high_set_3.npy', y_test_high_set_3)