# Prepare data for modelling: initial feature selection

Most algorithms in scikit-learn cannot handle a matrix of roughly 70k instances × 7k features. Therefore we use simple filters as a first step to reduce the number of features.

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import pickle
import pathlib
import tqdm
import numpy as np
import numpy.random
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
# Locations of the pickled inputs/outputs, all derived from one data root.
# NOTE(review): the root is an absolute, machine-specific path.
base_path = "/home/jsimon/Documents/thesis/gesture-analysis/data/"
time_groups_path_corrected_pickl = f"{base_path}transformed/time_added/all/time-and-groups-corrected-all.pkl"
stats_added_base_path = f"{base_path}transformed/stats_added/all/"
stats_added_path_pickl = f"{stats_added_base_path}raw_stats-added-all.pkl"
gyro_calibration_path = f"{base_path}../scripts/gestureanalysis/gyro_offset.txt"

In [4]:
# Display the kernel's current working directory.
import os
os.getcwd()

'/home/jsimon/Documents/thesis/gesture-analysis/scripts'

In [5]:
# Load the time/group-corrected recordings.
# NOTE(review): `users` is not referenced again in the cells visible here.
with open(time_groups_path_corrected_pickl, "rb") as pickle_in:
    users = pickle.load(pickle_in)

In [5]:
# Load the pickled training split.
with open(stats_added_base_path + 'train.pkl', "rb") as pickle_in:
    traindata = pickle.load(pickle_in)

In [6]:
# Load the pickled validation split.
with open(stats_added_base_path + 'validation.pkl', "rb") as pickle_in:
    validdata = pickle.load(pickle_in)

In [7]:
# Pull the feature frame and the label frame out of the training container.
train_data_df, train_labels_df = (
    traindata['train'][part] for part in ('data', 'labels')
)

## Prepare the Training set

In [8]:
# Summarise row count, column count, dtypes and memory footprint.
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77610 entries, 0 to 77609
Columns: 7201 entries, 0_0_Thumb_base_mean to 7200_xcorr_8_10
dtypes: float64(7201)
memory usage: 4.2 GB


In [9]:
# Fill NaN values with the next valid observation in the same column.
# `fillna(method=...)` is deprecated in recent pandas releases; `bfill` is the
# direct equivalent. A forward fill afterwards covers NaNs at the very end of
# a column, which a backward fill alone cannot reach.
train_data_df.bfill(inplace=True)
train_data_df.ffill(inplace=True)

In [10]:
# Keep the feature names so they can be filtered alongside the data later.
headers = train_data_df.columns.tolist()
print(headers[:3])

['0_0_Thumb_base_mean', '0_1_Thumb_base_std', '0_2_Thumb_base_min']


In [11]:
# Turn the names into a single-row 2-D array of shape (1, nh).
# NOTE(review): this rebinds `headers`, so the cell is not safe to run twice.
nh = len(headers)
headers = np.array(headers)[np.newaxis, :]

In [12]:
# Expose the training features as a NumPy array for scikit-learn.
# NOTE(review): whether this is a view or a copy of the frame's data is not
# visible here.
X = train_data_df.values
print(X.shape)

(77610, 7201)


In [13]:
# Sanity check: list the coordinates of remaining NaN cells; zero rows in the
# resulting shape means none are left.
np.argwhere(np.isnan(X)).shape

(0, 2)

In [14]:
def simple_label_vec(labels_df, gestures):
    """Collapse a per-gesture label frame into one class code per row.

    Parameters
    ----------
    labels_df : pandas.DataFrame
        One column per gesture; a row whose values sum to 0 carries no gesture.
    gestures : list
        Ordered gesture names. The code of a gesture is its position in this
        list plus one, so 0 stays reserved for "no gesture".

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(labels_df)``: 0 where the row sums to 0,
        otherwise the code of the column holding the row maximum (first column
        wins on ties).

    Raises
    ------
    ValueError
        If a winning column name is not contained in ``gestures``.
    """
    n_rows = len(labels_df)
    y = np.zeros((n_rows,))
    if n_rows == 0 or labels_df.shape[1] == 0:
        return y

    # Work positionally: the frame's index labels are not required to be
    # 0..n-1, so they must never be used to address `y`.
    active = labels_df.values.sum(axis=1) != 0
    if not active.any():
        return y

    winners = labels_df[active].idxmax(axis=1)
    # Resolve each distinct winning column once instead of once per row.
    code_of = {name: gestures.index(name) + 1 for name in winners.unique()}
    y[active] = winners.map(code_of).values
    return y

In [15]:
# Alphabetically ordered gesture names; the position in this list defines the
# class code used by simple_label_vec.
gestures = sorted(train_labels_df.columns)
print(gestures)

['(1) One', '(2) Two', '(3) Three', '(4) Four', '(5) Five', 'Calling', 'Come here', 'Continue', 'Cutthroat', 'Down', 'Go away', 'Grasp 1', 'Grasp 2', 'Knocking', 'Money', 'Never mind', 'Point', 'Point to self', 'Push away', 'Scissor', 'Shoot', 'Shoulder pat', 'Swipe left', 'Swipe right', 'Swipe up', 'Talking', 'Thumbs down', 'Thumbs up', 'Turn', 'Up', 'Walking', 'Waving', 'Zoom']


In [16]:
# Encode the training labels as one class code per row.
y = simple_label_vec(labels_df=train_labels_df, gestures=gestures)

In [17]:
# Class codes are whole numbers; store them as integers. The builtin `int`
# replaces the `np.int` alias, which was removed in NumPy 1.24.
y = y.astype(int)

In [18]:
# Sanity check: expect one label per training row.
y.shape

(77610,)

## Prepare the validation set

In [19]:
# Pull the feature frame and the label frame out of the validation container.
valid_data_df, valid_labels_df = (
    validdata['valid'][part] for part in ('data', 'labels')
)

In [20]:
# Same NaN handling as for the training frame: backward fill (`bfill`
# replaces the deprecated `fillna(method='backfill')`), then a forward fill
# for NaNs at the end of a column that a backward fill cannot reach.
valid_data_df.bfill(inplace=True)
valid_data_df.ffill(inplace=True)

In [21]:
# Expose the validation features as a NumPy array for scikit-learn.
Xval = valid_data_df.values
print(Xval.shape)

(6186, 7201)


In [22]:
# Encode the validation labels with the same gesture ordering as training.
yval = simple_label_vec(labels_df=valid_labels_df, gestures=gestures)

In [23]:
# Store the validation class codes as integers. The builtin `int` replaces the
# `np.int` alias, which was removed in NumPy 1.24.
yval = yval.astype(int)

In [24]:
# Sanity check: expect one label per validation row.
yval.shape

(6186,)

## Scale the Data

In [25]:
# Fit the scaler on the training matrix only, then apply the same fitted
# scaling to both the validation and the training matrix.
transformer = RobustScaler()
transformer.fit(X)
Xval = transformer.transform(Xval)
X = transformer.transform(X)

In [29]:
# Persist the fitted scaler next to the data.
with open(stats_added_base_path + 'scaler.pkl', "wb") as pickle_out:
    pickle.dump(transformer, pickle_out)

In [30]:
# Read the fitted scaler back from disk.
with open(stats_added_base_path + 'scaler.pkl', "rb") as pickle_in:
    transformer = pickle.load(pickle_in)

## Clear Memory

In [26]:
# Drop every reference to the raw frames and containers so the garbage
# collector can reclaim them; only the NumPy matrices are needed from here on.
del train_data_df, train_labels_df, valid_data_df, valid_labels_df
gc.collect()
transformer = traindata = validdata = None
gc.collect()
# Rebind the frame names to empty DataFrames.
train_data_df, train_labels_df, valid_data_df, valid_labels_df = (
    pd.DataFrame() for _ in range(4)
)

## Initial Feature selection

With 7201 features and > 70k instances, our training set is too complex for most algorithms to fit well. Therefore I perform a simple, model-free initial feature selection to remove features that, for example, take only a single value or are clearly redundant.

In [27]:
# With its default settings VarianceThreshold removes constant columns.
# The selector is fitted on the training matrix and then applied to the
# header row and to both data matrices so that all three stay aligned.
sel = VarianceThreshold()
sel.fit(X)
print(headers.shape)
headers = sel.transform(headers)
print(headers.shape)
Xval = sel.transform(Xval)
X = sel.transform(X)

(1, 7201)
(1, 6368)


In [28]:
# Shape of the training matrix after removing constant features.
X.shape

(77610, 6368)

In [29]:
def subsample(X, y, times, random_state=None):
    """Return a stratified subsample obtained by halving the data `times` times.

    Each round keeps the training half of a stratified 50/50 shuffle split,
    so class proportions of `y` are approximately preserved.

    Parameters
    ----------
    X : numpy.ndarray
        Sample matrix, indexable along axis 0 with an integer index array.
    y : numpy.ndarray
        Class labels, one per row of `X`.
    times : int
        Number of halving rounds; 0 returns copies of the inputs.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for reproducible splits. An integer seeds a single
        generator shared by all rounds. None (the default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected rows of `X` and the matching entries of `y`.

    Raises
    ------
    ValueError
        If `X` and `y` differ in length, or if scikit-learn cannot build a
        stratified split (e.g. a class with too few members).
    """
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same number of samples: %d != %d" % (len(X), len(y))
        )

    if isinstance(random_state, (int, np.integer)):
        # One shared generator: every round draws fresh numbers while the whole
        # procedure stays reproducible.
        random_state = np.random.RandomState(random_state)

    # Track surviving row positions instead of copying and re-slicing the
    # (potentially multi-gigabyte) matrix in every round.
    keep = np.arange(len(y))
    for _ in range(times):
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.5, random_state=random_state
        )
        # Only y drives stratification; a zero vector stands in for X.
        placeholder = np.zeros(len(keep))
        train_index, _ = next(splitter.split(placeholder, y[keep]))
        keep = keep[train_index]
    return X[keep], y[keep]

In [30]:
# Seed NumPy's global random generator, which scikit-learn falls back to when
# no random_state is passed, so that the subsample (and the feature selection
# fitted on it) is reproducible across kernel restarts.
np.random.seed(42)
X_samples, y_samples = subsample(X,y,5)

In [31]:
# Size of the subsample used to fit the feature selectors.
X_samples.shape

(2425, 6368)

In [37]:
# Release the previous selector before fitting the next ones.
sel = sampler = None
gc.collect()

1150

In [33]:
# ANOVA F-test filter: fitted on the subsample only, keeps a bit more than
# half of the features (percentile=60), then applied to the full matrices and
# to the header row.
sel = SelectPercentile(f_classif, percentile=60)
sel.fit(X_samples, y_samples)
candidate1, candidateVal1, candidateHeaders1 = (
    sel.transform(block) for block in (X, Xval, headers)
)

In [34]:
# Shape of the training matrix after the ANOVA F-test filter.
candidate1.shape

(77610, 3820)

In [35]:
# Mutual-information filter: fitted on the subsample only, keeps a bit more
# than half of the features (percentile=60), then applied to the full
# matrices and to the header row.
sel = SelectPercentile(mutual_info_classif, percentile=60)
sel.fit(X_samples, y_samples)
candidate2, candidateVal2, candidateHeaders2 = (
    sel.transform(block) for block in (X, Xval, headers)
)

In [36]:
# Shape of the training matrix after the mutual-information filter.
candidate2.shape

(77610, 3821)

In [38]:
# Save the scaled data set with constant features removed.
with open(stats_added_base_path + 'train-data-scaled-noconst.pkl', "wb") as pickle_out:
    pickle.dump(
        {
            'X': X,
            'y': y,
            'Xval': Xval,
            'yval': yval,
            'gestures': gestures,
            'headers': headers,
        },
        pickle_out,
    )

In [39]:
# Save the data set reduced by the ANOVA F-test filter.
with open(stats_added_base_path + 'train-data-scaled-f-ANOVA60.pkl', "wb") as pickle_out:
    pickle.dump(
        {
            'X': candidate1,
            'y': y,
            'Xval': candidateVal1,
            'yval': yval,
            'gestures': gestures,
            'headers': candidateHeaders1,
        },
        pickle_out,
    )

In [40]:
# Save the data set reduced by the mutual-information filter.
with open(stats_added_base_path + 'train-data-scaled-mutual-inf60.pkl', "wb") as pickle_out:
    pickle.dump(
        {
            'X': candidate2,
            'y': y,
            'Xval': candidateVal2,
            'yval': yval,
            'gestures': gestures,
            'headers': candidateHeaders2,
        },
        pickle_out,
    )