In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from knn.discoverer import KnnDiscovery
import os
import numpy as np
from os.path import join
import glob



ModuleNotFoundError: No module named 'knn'

#### load features

Our UTD algorithm searches for motifs over a dictionary of feature arrays.  
The keys are *sequence names (identifiers)* and values are `numpy` arrays of shape `[T x d]`  
where `T` is sequence length and `d` is the features' dimension.

In [6]:
feats_dir = './data/sample/features/phoenix_Signer03_deephand/'

feats_dict = {}
for fname in os.listdir(feats_dir):
    seq_name = fname.replace('.npy','')
    feats_dict[seq_name] = np.load(os.path.join(feats_dir, fname))
    
feats_dict[seq_name].shape, len(feats_dict)

((183, 61), 470)

#### define parameters

The definitions of the parameters are as follows:
- `a` : stepsize for temporal segmentations
- `lmin,lmax` : Min and max lengths for segment cuts 
- `dim_fix` : Dimension of fixed-length temporal embeddings
- `r,s` : Parameters for Gaussian kernel embedding
- `k` : Search for top K nearest neighbours for each segment
- `metric` : Distance metric for KNN
- `norm` : Normalize features or not (FAISS parameter)
- `pca` :  FAISS PCA parameter, `PCAW40` applies whitened PCA, transforms into 40 dimensions
- `olapthr_m` : Max allowed overlap ratio between pairs
- `top_delta` : Pick top $\delta \%$ of pairs with lowest distance

In [7]:
params = {'disc': {
                    'a': 3,
                    'dim_fix': 4,
                    'emb_type': 'gauss_kernel',
                    'k': 150,
                    'lmax': 15,
                    'lmin': 2,
                    'metric': 'L2',
                    'norm': False,
                    'olapthr_m': 0.2,
                    'pca': '',
                    'r': 0.21,
                    's': 0.6,
                    'seg_type': 'uniform',
                    'top_delta': 0.05,
                    'use_gpu': True
                    }
         }

### run

In [8]:
knndisc = KnnDiscovery(feats_dict, params)
matches_df = knndisc.run()

Computing Embeddings
Building index of size 238937x244
Searching index
Selecting good pairs


### output format

A `pandas` dataframe, where each row corresponds to a discovered pair of segments.  

Column names refer to:
- `seg_id` --> The ID's for each pair, among many segments that are extracted from input sequences
- `f1,f2` --> Names of the sequences to which the discovered segments belong
- `f1_id,f2_id` --> ID's of sequences to which the discovered segments belong
- `f1_start, f1_end` --> Onset and offset indices for the 1st segment of the discovered pair
- `f2_start, f2_end` --> Onset and offset indices for the 2nd segment of the discovered pair
- `cost` --> Distance between segment embedding vectors, computed during KNN search

In [9]:
matches_df.head(3)

Unnamed: 0,seg_id1,seg_id2,f1,f2,f1_id,f2_id,f1_start,f1_end,f2_start,f2_end,cost
0,5,50197,22April_2010_Thursday_heute_default-13,30April_2010_Friday_tagesschau_default-7,0,97,6,27,144,159,2.2e-05
1,56,60963,22April_2010_Thursday_heute_default-13,25August_2010_Wednesday_heute_default-2,0,118,18,24,105,114,2.3e-05
2,58,224191,22April_2010_Thursday_heute_default-13,21May_2010_Friday_tagesschau_default-14,0,440,18,30,24,36,2.4e-05


## clustering

In [18]:
# from clustering.comm_detection import run_clustering_Modularity
from clustering.wrappers import run_clustering

In [19]:
params = {**params, 
                 'config_file': join(os.getcwd(), 'config/config_phoenix.json'),
                 'dataset': 'phoenix',
                 'disc_method': 'knn',
                 'exp_root': join(os.getcwd(), 'results'),
                 'njobs': 2}
 

# params['clustering'] =     {
#                        'method': 'modularity',
#                        'cost_thr': 0.5,
#                        'peak_thr': 0.5,
#                        'modularity_thr': 0,
#                        'clus_alg': 'fastgreedy',
#                        'min_cluster_size':2,
#                        'dedupthr': 0.5,
#                        }


params['clustering'] = {'cost_thr': 0.05, 'method': 'pairwise', 'olapthr_m': 0.5}

params['expname'] = 'example'

seq_names = sorted(feats_dict.keys())


In [20]:
nodes_df, clusters_list, postdisc_name = run_clustering(seq_names, matches_df, params)
print('*** post disc completed, found {} segments from {} clusters ***'.format(len(nodes_df), len(clusters_list)))

*** pairwise clustering ***
*** post disc completed, found 804 segments from 402 clusters ***


In [21]:
nodes_df

Unnamed: 0,filename,start,end
1,28August_2010_Saturday_tagesschau_default-8,60,66
2,28August_2010_Saturday_tagesschau_default-8,66,72
3,25January_2010_Monday_heute_default-2,135,141
4,25January_2010_Monday_heute_default-2,141,147
5,20July_2010_Tuesday_heute_default-7,21,27
6,20July_2010_Tuesday_heute_default-7,27,33
7,11February_2010_Thursday_tagesschau_default-9,39,48
8,11February_2010_Thursday_tagesschau_default-9,48,54
9,04April_2010_Sunday_tagesschau_default-0,18,24
10,04April_2010_Sunday_tagesschau_default-0,24,30


In [22]:
# the numbers refer to row indices of nodes dataframe above
clusters_list

[[1, 3, 7, 10, 18],
 [2, 9, 15, 21],
 [4, 5, 17, 22, 23],
 [6, 13],
 [8, 19],
 [11, 14],
 [12, 24],
 [16, 20]]

## evaluation

Call Term Discovery Evaluation (TDE) toolkit sh script using `subprocess`

In [22]:
# adjust library paths

# change according to your tde build
TDEROOT='/home/korhan/Desktop/tez/tdev2/tdev2'    

# to activate conda env from bash script for evaluation, change according to your conda env
SOURCE = glob.glob('/home/korhan/*/etc/profile.d/conda.sh')[0] 


In [23]:
import subprocess
from utils.helper_fncs import load_json

dataset= "phoenix"
jobs=2
outdir = join(params['exp_root'],params['expname'], postdisc_name)
os.makedirs(outdir, exist_ok=1)

with open(join(outdir, 'seq_names.txt'), 'w') as f: f.write('\n'.join(seq_names))


cmd = './run_tde.sh {} {} {} {} {} {} {} {}'.format(TDEROOT, outdir, dataset, 
                                                    'sdtw', join(outdir, 'scores.json'), 
                                                    jobs, params['config_file'], SOURCE )




if you encounter permission error, use `chmod u+x run_tde.sh` in your terminal to give permission to TDE shell script

In [27]:
subprocess.call(cmd.split())   

try:
    scores = load_json(outdir + '/scores.json')

except Exception as exc:

    # print(traceback.format_exc())
    print(exc)
    scores = {'ned':100.0, 'coverageNS': 0.0}

if 'ned' not in scores.keys(): scores['ned'] = 100.0
if 'coverageNS' not in scores.keys(): scores['coverageNS'] = 0.0

/home/korhan/miniconda3/etc/profile.d/conda.sh
/home/korhan/Desktop/knn_utd/results/deneme/postpairwise_cost0.05_olap0.5 phoenix sdtw /home/korhan/Desktop/knn_utd/results/deneme/postpairwise_cost0.05_olap0.5/scores.json
Reading gold
*** Config file read, ovth 50.0 ***
/home/korhan/Desktop/knn_utd/config/config_phoenix.json
Generating discovered -class- file
Reading discovered classes
Discovered Class file read

297 unique intervals, 130 clusters with 260 nodes found
Computing scores..
Computing Boundary...
Computing Grouping...
Number of grouping jobs: 2
Computing Token and Type...
Computing Coverage...
Computing Coverage...
Computing Coverage No Single...
*** Config file read, ovth 50.0 ***
Computing NED...
*** Config file read, ovth 50.0 ***
scores saved to  /home/korhan/Desktop/knn_utd/results/deneme/postpairwise_cost0.05_olap0.5/scores.json


In [28]:
scores

{'ned': 45.38,
 'coverage': 4.15,
 'coverageNS': 4.58,
 'coverageNS_f': 6.39,
 'grouping_F': 52.49,
 'grouping_P': 52.07,
 'grouping_R': 53.01,
 'token_F': 2.38,
 'token_P': 22.56,
 'token_R': 1.26,
 'type_F': 9.64,
 'type_P': 32.86,
 'type_R': 5.65,
 'boundary_F': 11.68,
 'boundary_P': 100.0,
 'boundary_R': 6.21,
 'n_clus': 130,
 'n_node': 260,
 'exp_path': '/home/korhan/Desktop/knn_utd/results/deneme/postpairwise_cost0.05_olap0.5'}

## Run with 10% coverage

if you want to get results with 10% coverage use the higher level function below that iteratively optimizes `cost_thr` to adjust coverage

In [32]:
from run.pipeline import run_exp

ModuleNotFoundError: No module named 'utils.sdtw_funcs'

In [None]:
scores = run_exp(feats_dict, params)