In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local packages take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os
import shlex
import traceback
from os.path import join

import numpy as np

from knn.discoverer import KnnDiscovery



#### load features

Our UTD algorithm searches for motifs over a dictionary of feature arrays.  
The keys are *sequence names (identifiers)* and values are `numpy` arrays of shape `[T x d]`  
where `T` is sequence length and `d` is the features' dimension.

In [3]:
# Directory expected to hold one `<sequence name>.npy` feature array per sequence.
feats_dir = './data/phoenix_Signer03_deephand/'

# Maps sequence name -> feature array loaded from the matching .npy file.
feats_dict = {}
for fname in os.listdir(feats_dir):
    # Split off only the trailing extension: str.replace('.npy', '') would also
    # delete a '.npy' that happens to occur in the middle of a file name.
    stem, ext = os.path.splitext(fname)
    if ext != '.npy':
        # Not a feature file (e.g. a notebook checkpoint directory); np.load
        # would fail on it, so skip it.
        continue
    seq_name = stem
    feats_dict[seq_name] = np.load(os.path.join(feats_dir, fname))

# Sanity check: shape of the last loaded array and the number of sequences.
feats_dict[seq_name].shape, len(feats_dict)

((183, 61), 470)

#### define parameters

The definitions of the parameters are as follows:
- `a` : Step size for temporal segmentation
- `lmin,lmax` : Min and max lengths for segment cuts 
- `dim_fix` : Dimension of fixed-length temporal embeddings
- `r,s` : Parameters for Gaussian kernel embedding
- `k` : Number of nearest neighbours to retrieve for each segment
- `metric` : Distance metric for KNN
- `norm` : Normalize features or not (FAISS parameter)
- `pca` :  FAISS PCA parameter, `PCAW40` applies whitened PCA, transforms into 40 dimensions
- `olapthr_m` : Max allowed overlap ratio between pairs
- `top_delta` : Pick top $\delta \%$ of pairs with lowest distance

In [4]:
# Parameter set for the discovery stage; everything lives under the 'disc' key.
params = dict(
    disc=dict(
        a=4,
        dim_fix=10,
        emb_type='gauss_kernel',
        k=100,
        lmax=28,
        lmin=4,
        metric='L2',
        norm=False,
        olapthr_m=0.25,
        pca='PCAW40',
        r=0.2,
        s=0.2,
        seg_type='uniform',
        top_delta=0.02,
        use_gpu=True,
    ),
)

### run

In [8]:
# Build the discovery object from the feature dictionary and the parameter set.
knndisc = KnnDiscovery(feats_dict, params)
# Run the discovery; the result (one row per discovered pair) is kept for the
# clustering stage further down.
matches_df = knndisc.run()

Computing Embeddings
Building index of size 208739x610
Searching index
Selecting good pairs


### output format

A `pandas` dataframe, where each row corresponds to a discovered pair of segments.  

Column names refer to:
- `seg_id1,seg_id2` --> IDs of the two segments in each pair, among all segments extracted from the input sequences
- `f1,f2` --> Names of the sequences to which the discovered segments belong
- `f1_id,f2_id` --> IDs of the sequences to which the discovered segments belong
- `f1_start, f1_end` --> Onset and offset indices for the 1st segment of the discovered pair
- `f2_start, f2_end` --> Onset and offset indices for the 2nd segment of the discovered pair
- `cost` --> Distance between segment embedding vectors, computed during KNN search

In [9]:
# Preview the first three discovered pairs.
matches_df.head(3)

Unnamed: 0,seg_id1,seg_id2,f1,f2,f1_id,f2_id,f1_start,f1_end,f2_start,f2_end,cost
0,7874,165462,11November_2009_Wednesday_tagesschau_default-0,01October_2009_Thursday_tagesschau_default-0,15,374,24,48,28,56,2.5e-05
1,8744,69505,30November_2009_Monday_tagesschau_default-0,29April_2010_Thursday_tagesschau_default-0,19,159,12,28,16,32,2.5e-05
2,8744,172734,30November_2009_Monday_tagesschau_default-0,30September_2009_Wednesday_tagesschau_default-0,19,390,12,28,16,44,2.8e-05


## clustering

In [11]:
from clustering.comm_detection import run_clustering_Modularity
from clustering.wrappers import run_clustering

In [12]:
# Layer the experiment-level settings on top of the existing parameters.
# dict(mapping, **extra) builds a new dict, so the previous `params` object is
# not mutated.
params = dict(
    params,
    config_file=join(os.getcwd(), 'config/config_phoenix.json'),
    dataset='phoenix',
    disc_method='knn',
    exp_root=join(os.getcwd(), 'results'),
    njobs=2,
)

# Settings for the clustering stage.
params['clustering'] = dict(
    method='modularity',
    cost_thr=0.5,
    peak_thr=0.5,
    modularity_thr=0,
    clus_alg='fastgreedy',
    min_cluster_size=2,
    dedupthr=0.5,
)

# Experiment name.
params['expname'] = 'deneme'

# Sequence names in sorted order (iterating a dict yields its keys).
seq_names = sorted(feats_dict)


In [13]:
# Cluster the discovered pairs; the call returns the nodes dataframe, the list
# of clusters and the name of the post-discovery configuration.
nodes_df, clusters_list, postdisc_name = run_clustering(seq_names, matches_df, params)

# Report how many segments and clusters came out of the post-discovery step.
n_segments, n_clusters = len(nodes_df), len(clusters_list)
summary_msg = '*** post disc completed, found {} segments from {} clusters ***'
print(summary_msg.format(n_segments, n_clusters))

*** mean edge weight is 0.557 after cost threshold ***
*** similarity profile computed ***
*** node centers found ***
*** graph constructed ***
*** vertex mapping done ***
*** graph clustering done !! ***
*** intervals found ***
*** post disc completed, found 24 segments from 8 clusters ***


In [14]:
# Display the segments (nodes) returned by the clustering step.
nodes_df

Unnamed: 0,filename,idx,start,end
1,01October_2009_Thursday_tagesschau_default-0,32,24,47
2,06April_2010_Tuesday_tagesschau_default-0,21,8,36
3,08June_2010_Tuesday_tagesschau_default-0,19,9,30
4,09February_2010_Tuesday_tagesschau_default-0,19,10,28
5,10February_2010_Wednesday_tagesschau_default-0,39,8,72
6,11February_2010_Thursday_tagesschau_default-0,21,16,36
7,11November_2009_Wednesday_tagesschau_default-0,35,24,48
8,12January_2010_Tuesday_heute_default-1,43,36,52
9,12January_2010_Tuesday_tagesschau_default-0,15,8,26
10,14June_2010_Monday_tagesschau_default-0,15,8,28


In [22]:
# Each inner list is one cluster; the numbers refer to row indices of the
# `nodes_df` dataframe shown above.
clusters_list

[[1, 3, 7, 10, 18],
 [2, 9, 15, 21],
 [4, 5, 17, 22, 23],
 [6, 13],
 [8, 19],
 [11, 14],
 [12, 24],
 [16, 20]]

## evaluation

In [15]:
# adjust library paths

# Root of your tde build. Set the TDEROOT environment variable to override it;
# the fallback is the path this notebook was originally written with.
TDEROOT = os.environ.get('TDEROOT', '/home/korhan/Desktop/tez/tdev2/tdev2')

# conda.sh is used to activate the conda env from the bash script for
# evaluation. Set the CONDA_SH environment variable to point at it directly;
# otherwise the original search pattern is used.
CONDA_SH_PATTERN = '/home/korhan/*/etc/profile.d/conda.sh'
SOURCE = os.environ.get('CONDA_SH')
if not SOURCE:
    conda_sh_matches = glob.glob(CONDA_SH_PATTERN)
    if not conda_sh_matches:
        # Fail with an actionable message instead of a bare IndexError.
        raise FileNotFoundError(
            'No conda.sh found matching {!r}; set the CONDA_SH environment '
            'variable to the full path of your conda.sh'.format(CONDA_SH_PATTERN)
        )
    SOURCE = conda_sh_matches[0]


In [16]:
import subprocess
from utils.helper_fncs import load_json

# Take dataset name and job count from params so the evaluation cannot drift
# away from the configuration used for clustering.
dataset = params['dataset']
jobs = params['njobs']

# Output directory for this experiment / post-discovery configuration.
outdir = join(params['exp_root'], params['expname'], postdisc_name)
os.makedirs(outdir, exist_ok=True)

# Write the sequence names, one per line, into the output directory.
with open(join(outdir, 'seq_names.txt'), 'w') as f:
    f.write('\n'.join(seq_names))

# Positional arguments for run_tde.sh, in the order the command line uses.
cmd_args = [
    './run_tde.sh', TDEROOT, outdir, dataset,
    'sdtw', join(outdir, 'scores.json'),
    jobs, params['config_file'], SOURCE,
]
# Quote every argument so that a path containing whitespace stays one token
# when the command string is split again; safe strings are left unchanged.
cmd = ' '.join(shlex.quote(str(arg)) for arg in cmd_args)


In [19]:
# Run the evaluation script. shlex.split honours shell-style quoting, so an
# argument that contains whitespace is passed as a single token.
return_code = subprocess.call(shlex.split(cmd))
if return_code != 0:
    # Surface a failed run instead of silently continuing to the score file.
    print('evaluation command exited with status {}'.format(return_code))

try:
    scores = load_json(join(outdir, 'scores.json'))

except Exception as exc:
    # Best effort: report the problem and fall back to worst-case scores so the
    # notebook can continue. `traceback` must be imported for this handler to
    # work; without it the handler itself raises NameError and hides `exc`.
    print(traceback.format_exc())
    print(exc)
    scores = {'ned': 100.0, 'coverageNS': 0.0}

# Make sure both metrics exist even when the score file lacks them.
scores.setdefault('ned', 100.0)
scores.setdefault('coverageNS', 0.0)

In [20]:
# Display the evaluation scores (worst-case defaults are filled in for 'ned'
# and 'coverageNS' when they could not be read).
scores

{'ned': 28.49,
 'coverage': 0.86,
 'coverageNS': 0.97,
 'coverageNS_f': 1.02,
 'grouping_F': 60.42,
 'grouping_P': 54.17,
 'grouping_R': 68.42,
 'token_F': 0.19,
 'token_P': 20.83,
 'token_R': 0.09,
 'type_F': 1.44,
 'type_P': 30.0,
 'type_R': 0.74,
 'boundary_F': 1.64,
 'boundary_P': 100.0,
 'boundary_R': 0.83,
 'n_clus': 8,
 'n_node': 24,
 'exp_path': '/home/korhan/Desktop/knn_utd/results/deneme/post_cost0.5_peak0.5_q0_fastgreedyAlg_mc2'}