In [59]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
df_raw = pd.read_csv("CoST.csv")
# df = pd.read_csv("https://data.4tu.nl/articles/dataset/Corpus_of_Social_Touch_CoST_/12696869?file=24044075", sep='\t')


# Preprocessing

frame = 135 - 1 second

In [2]:
df = df_raw.copy()
df.columns = df.columns.str.strip(" ")
df = df.set_index(['subject', 'variant', 'gesture'])
df['frame'].value_counts()

1       7805
7       7805
10      7805
9       7805
8       7805
        ... 
1580       1
1581       1
1582       1
1583       1
1747       1
Name: frame, Length: 1747, dtype: int64

In [3]:
df['observation'] = np.nan
df = df.reset_index()
values = df['frame'].values
i = 1
for index, element in tqdm(enumerate(values)):
    if values[index + 1] < values[index]:
        df.loc[index, "observation"] = i
        i += 1

0it [00:00, ?it/s]

IndexError: index 1496855 is out of bounds for axis 0 with size 1496855

In [4]:
df['observation'] = df['observation'].fillna(method='bfill')
df['observation'] = df['observation'].fillna(df['observation'].max() + 1)
df['observation'] = df['observation'].astype(int)

In [5]:
# Create ML dataset

data = pd.DataFrame(index=np.arange(1, df['observation'].max() + 1).astype(int), columns=['duration'])

# Add gesture (y)
data['gesture'] = pd.Series(df.drop_duplicates(['observation'], keep='last')['gesture'].values)

dict_gesture = {1: "grab", 2: "hit", 3: "massage", 4: "pat", 5: "pinch",
                6: "poke", 7: "press", 8: "rub", 9: "scratch", 10: "slap", 11: "squeeze",
                12: "stroke", 13: "tap", 14: "tickle"}

data['gesture'] = data['gesture'].map(dict_gesture)

data['variant'] = pd.Series(df.drop_duplicates(['observation'], keep='last')['variant'].values)
dict_variant = {1: "gentle", 2: "normal", 3: "rough"}
data['variant'] = data['variant'].map(dict_variant)


# Add duration for every observation
data['duration'] = (df.drop_duplicates(['observation'], keep='last')['frame'] / 135).values

Some unique combination of subject, variant, gesture have 5 repetitions. 7805 gesture captures

# Feature engineering

– Mean pressure is the mean over channels and time (1).

– Maximum pressure is the maximum value over channels
and time (2).

– Pressure variability is the mean over time of the sum over
channels of the absolute value of difference between two
consecutive frames (3).

– Mean pressure per row is the mean over columns and time
resulting in one feature per row which are in the direction
of the mannequin arm’s length (from top to bottom, 4–
11).

– Mean pressure per column is the mean over rows and
time resulting in one feature per column which are in
the direction of the mannequin arm’s width (from left to
right, 12–19).

– Contact area per frame is the fraction of channels with a
value above 50 % of the maximum value. Mean contact
area is the mean over time of contact area (20) and the
maximum pressure contact area is the contact area of the
frame with the highest mean pressure over channels (21).
The size of the contact area indicated whether the whole
hand was used for a touch gesture, as would be expected

In [6]:
# Mean pressure
ch_cols = [i for i in df.columns if i.startswith("ch")]
data['mean_pressure'] = df.groupby('observation')[ch_cols].mean().mean(axis=1)

In [7]:
data.groupby(['gesture'])['mean_pressure'].mean()

gesture
grab       341.931508
hit        127.462967
massage    207.492889
pat        128.959427
pinch      148.889580
poke       118.970094
press      201.823134
rub        164.400707
scratch    138.925870
slap       120.755853
squeeze    285.747048
stroke     149.573673
tap        117.651540
tickle     127.242386
Name: mean_pressure, dtype: float64

In [8]:
# Maximum pressure
data['maximum_pressure'] = df.groupby('observation')[ch_cols].max().max(axis=1)

In [9]:
data.groupby(['gesture'])['maximum_pressure'].mean() * 2

gesture
grab       1708.850987
hit        1689.304659
massage    1693.558348
pat        1513.917415
pinch      1691.086022
poke       1534.254480
press      1703.835125
rub        1595.583483
scratch    1539.315412
slap       1579.863799
squeeze    1756.057451
stroke     1539.482014
tap        1491.870968
tickle     1444.904847
Name: maximum_pressure, dtype: float64

In [10]:
# Variance over channels and time (44)
data['variance'] = df.groupby('observation')[ch_cols].var().var(axis=1)

In [11]:
# Contact area per frame
# Attention! It's really time and memory expensive
df['contact_area'] = df[ch_cols].apply(lambda x: np.mean(x > x.max() * 0.5), axis=1)
data['mean_contact_area'] = df.groupby('observation')['contact_area'].mean()

In [14]:
data.to_csv("final_dataset.csv")

# Modeling

In [24]:
# dict_gesture_inverse
dict_gesture_inverse = {value: key for key, value in dict_gesture.items()}
data = data.dropna()
data['gesture'] = data['gesture'].map(dict_gesture_inverse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['gesture'] = data['gesture'].astype(int)


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['gesture', 'variant']),
                                                    data['gesture'],
                                                    test_size=0.3)

In [45]:
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, f1_score)

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer

pipe_svm = Pipeline([
    ('scal', StandardScaler()),
    ('model', SVC(decision_function_shape='ovo'))
])
pipe_svm.fit(X_train, y_train)

Pipeline(steps=[('scal', StandardScaler()),
                ('model', SVC(decision_function_shape='ovo'))])

In [44]:
from sklearn.ensemble import RandomForestClassifier
pipe_forest = Pipeline([
    ('scal', StandardScaler()),
    ('model', RandomForestClassifier())
])
pipe_forest.fit(X_train, y_train)

Pipeline(steps=[('scal', StandardScaler()),
                ('model', RandomForestClassifier())])

In [49]:
def metrics(y_true, model):
    dict_metrics = {}
    dict_metrics['accuracy'] = accuracy_score(y_true, model.predict(X_test))
    dict_metrics['precision'] = precision_score(y_true, model.predict(X_test), average='micro')
    dict_metrics['recall'] = recall_score(y_true, model.predict(X_test), average='micro')
    return dict_metrics
metrics(y_test, pipe_forest)

{'accuracy': 0.4039282664389411,
 'precision': 0.4039282664389411,
 'recall': 0.4039282664389411}

## Add complex preprocessing and cross validation

In [51]:
%%time
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_std=False, with_mean=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X_train.columns),
        ])

pipe_forest = Pipeline([('scl', preprocessor),
                        ('clf', RandomForestClassifier(n_jobs=-1, oob_score=False))
                        ])  # pipeline with all steps
param_dist_forest = {'clf__max_depth': [3, 5, 6, 10, 15],
                     'clf__n_estimators': [100, 200, 300, 400, 500],
                     'clf__max_features': ['sqrt', 'log2'],
                     'clf__min_samples_leaf': np.arange(1, 30)
                     }  # parameters for Grid
# Cross-validation choice
skf = KFold(5)
forest_randomized_pipe = RandomizedSearchCV(estimator=pipe_forest,
                                            param_distributions=param_dist_forest,
                                            cv=skf, n_iter=30, n_jobs=-1)
forest_randomized_pipe.fit(X_train, y_train)

CPU times: user 3.21 s, sys: 433 ms, total: 3.64 s
Wall time: 28.9 s


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('scl',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler(with_mean=False,
                                                                                                               with_std=False))]),
                                                                               Index(['duration', 'mean_pressure', 'maximum_pressure', 'variance',
       'mean_contact_area'],
      d

In [52]:
metrics(y_test, forest_randomized_pipe)

{'accuracy': 0.40905209222886424,
 'precision': 0.40905209222886424,
 'recall': 0.40905209222886424}

## Analyze quality of predictions depending on gesture name

In [64]:
data_predictions = data.iloc[X_test.index]
data_predictions['y_predict'] = forest_randomized_pipe.predict(X_test)
data_predictions['y_true'] = y_test.values
data_predictions['correct'] = np.where(data_predictions['y_true'] == data_predictions['y_predict'], 1, 0)

gesture_correcter = data_predictions.groupby(['gesture'])['correct'].mean().sort_values(ascending=False).to_frame()\
    .join(data_predictions['gesture'].value_counts(normalize=True).to_frame("gesture_num_obs"))
gesture_correcter.index = gesture_correcter.index.map(dict_gesture)