In [2]:
from knn.discoverer import KnnDiscovery
import os
import numpy as np

#### load features

Our unsupervised term discovery (UTD) algorithm searches for motifs over a dictionary of feature arrays.  
The keys are *sequence names (identifiers)* and the values are `numpy` arrays of shape `[T x d]`,  
where `T` is the sequence length and `d` is the feature dimension.

In [3]:
feats_dir = './data/phoenix_Signer03_deephand/'

# Map each sequence name (file name without the '.npy' suffix) to its feature array.
# NOTE: os.listdir returns entries in arbitrary order, so dictionary order (and
# which sequence ends up as the last `seq_name`) may differ between machines.
feats_dict = {}
for fname in os.listdir(feats_dir):
    # Skip anything that is not a feature file (e.g. '.DS_Store' or
    # '.ipynb_checkpoints'); such entries are not loadable feature arrays.
    if not fname.endswith('.npy'):
        continue
    # Strip only the trailing extension; str.replace would also remove a
    # '.npy' that happens to occur elsewhere in the file name.
    seq_name = fname[:-len('.npy')]
    feats_dict[seq_name] = np.load(os.path.join(feats_dir, fname))

# Display the shape of the last loaded array and the number of sequences loaded.
feats_dict[seq_name].shape, len(feats_dict)

((183, 61), 470)

#### define parameters

The definitions of the parameters are as follows:
- `a` : Step size for temporal segmentation
- `lmin,lmax` : Min and max lengths for segment cuts 
- `dim_fix` : Dimension of fixed-length temporal embeddings
- `r,s` : Parameters for Gaussian kernel embedding
- `k` : Number of nearest neighbours to retrieve for each segment
- `metric` : Distance metric for KNN
- `norm` : Normalize features or not (FAISS parameter)
- `pca` : FAISS PCA parameter; e.g. `PCAW40` applies whitened PCA and reduces the features to 40 dimensions
- `olapthr_m` : Max allowed overlap ratio between pairs
- `top_delta` : Pick top $\delta \%$ of pairs with lowest distance

In [4]:
# Discovery settings, grouped under the 'disc' key and passed to KnnDiscovery.
params = dict(
    disc=dict(
        a=4,                      # step size for temporal segmentation
        dim_fix=10,               # dimension of the fixed-length temporal embeddings
        emb_type='gauss_kernel',  # embedding type; presumably the Gaussian kernel embedding that r and s configure -- TODO confirm
        k=100,                    # nearest neighbours searched for each segment
        lmax=28,                  # maximum length of a segment cut
        lmin=4,                   # minimum length of a segment cut
        metric='L2',              # distance metric for the KNN search
        norm=False,               # whether to normalize features (FAISS parameter)
        olapthr_m=0.25,           # maximum allowed overlap ratio between pairs
        pca='PCAW40',             # FAISS PCA setting: whitened PCA down to 40 dimensions
        r=0.2,                    # Gaussian kernel embedding parameter
        s=0.2,                    # Gaussian kernel embedding parameter
        seg_type='uniform',       # segmentation type; presumably uniform cuts with step `a` -- TODO confirm
        top_delta=0.02,           # share of lowest-distance pairs that is kept
        use_gpu=True,             # presumably runs the search on a GPU -- TODO confirm
    )
)

### run

In [5]:
# Build the discoverer from the feature dictionary and the parameter dict.
knndisc = KnnDiscovery(feats_dict, params)
# Run the discovery and keep the result as `matches_df`. The stages logged in the
# cell output are: computing embeddings, building the index, searching the index,
# and selecting good pairs.
matches_df = knndisc.run()

Computing Embeddings
Building index of size 208739x610
Searching index
Selecting good pairs


### output format

The output is a `pandas` DataFrame in which each row corresponds to a discovered pair of segments.  

Column names refer to:
- `seg_id1, seg_id2` --> IDs of the two segments in each pair, among the many segments that are extracted from the input sequences
- `f1,f2` --> Names of the sequences to which the discovered segments belong
- `f1_id,f2_id` --> IDs of the sequences to which the discovered segments belong
- `f1_start, f1_end` --> Onset and offset indices for the 1st segment of the discovered pair
- `f2_start, f2_end` --> Onset and offset indices for the 2nd segment of the discovered pair
- `cost` --> Distance between segment embedding vectors, computed during KNN search

In [7]:
# Preview the first three rows of the discovered pairs.
matches_df.head(3)

Unnamed: 0,seg_id1,seg_id2,f1,f2,f1_id,f2_id,f1_start,f1_end,f2_start,f2_end,cost
0,7874,165462,11November_2009_Wednesday_tagesschau_default-0,01October_2009_Thursday_tagesschau_default-0,15,374,24,48,28,56,2.5e-05
1,8744,69505,30November_2009_Monday_tagesschau_default-0,29April_2010_Thursday_tagesschau_default-0,19,159,12,28,16,32,2.5e-05
2,8744,172734,30November_2009_Monday_tagesschau_default-0,30September_2009_Wednesday_tagesschau_default-0,19,390,12,28,16,44,2.8e-05
