In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def class_matching(df_features):
    df = pd.concat([df_features,
                    lab_wname,
                    lab_bin],
                    axis=1)
    df.reset_index(inplace=True, drop=True)
    df.dropna(axis=0, inplace=True)
    df = df.loc[~df.lab_wname.isin(['1_vfar','1_ago']),:]
    df['lab_gt'] = df['lab_gt'].astype(int)
    return df

In [3]:
def split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        stratify=y,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

In [4]:
path_mannot = './data/trainds_mannot.txt'
path_low  = './results/df_features_low.csv'
path_med  = './results/df_features_med.csv'
path_high = './results/df_features_high.csv'
path_times = './results/df_times.csv'
Path_save = './results'

# Data Processing

In [5]:
df_features_low  = pd.read_csv(path_low)
df_features_med  = pd.read_csv(path_med)
df_features_high = pd.read_csv(path_high)
df_times = pd.read_csv(path_times)

In [6]:
gt = pd.read_csv(path_mannot, header=None, usecols=[0,1,2], sep='\t', 
                 names=['onset','offset','label'])
idx_annotated = (gt.label.str[1]=='_')
lab_wname = gt['label']
lab_wname.loc[~idx_annotated] = np.nan
lab_bin = lab_wname.str[0]
lab_bin.name = 'lab_gt'
lab_wname.name = 'lab_wname'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
df_features_low = class_matching(df_features_low)
df_features_med = class_matching(df_features_med)
df_features_high = class_matching(df_features_high)

In [8]:
X_low = df_features_low.loc[:,df_features_low.columns.str.startswith('shp')].values
y_low = df_features_low.loc[:,'lab_gt'].values

In [9]:
X_med = df_features_med.loc[:,df_features_med.columns.str.startswith('shp')].values
y_med = df_features_med.loc[:,'lab_gt'].values

In [10]:
X_high = df_features_high.loc[:,df_features_high.columns.str.startswith('shp')].values
y_high = df_features_high.loc[:,'lab_gt'].values

# Data Split

In [11]:
X_train_low, X_test_low, y_train_low, y_test_low = split(X_low, y_low)

In [12]:
X_train_med, X_test_med, y_train_med, y_test_med = split(X_med, y_med)

In [13]:
X_train_high, X_test_high, y_train_high, y_test_high = split(X_high, y_high)

In [14]:
np.save('./results/X_train_low.npy', X_train_low)
np.save('./results/X_test_low.npy', X_test_low)
np.save('./results/y_train_low.npy', y_train_low)
np.save('./results/y_test_low.npy', y_test_low)

In [15]:
np.save('./results/X_train_med.npy', X_train_med)
np.save('./results/X_test_med.npy', X_test_med)
np.save('./results/y_train_med.npy', y_train_med)
np.save('./results/y_test_med.npy', y_test_med)

In [16]:
np.save('./results/X_train_high.npy', X_train_high)
np.save('./results/X_test_high.npy', X_test_high)
np.save('./results/y_train_high.npy', y_train_high)
np.save('./results/y_test_high.npy', y_test_high)