In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pkg_resources 
from tdev2.readers.gold_reader import *
from tdev2.readers.disc_reader import *

In [3]:
from tdev2.measures.ned import *
from tdev2.measures.boundary import *
from tdev2.measures.grouping import *
from tdev2.measures.coverage import *
from tdev2.measures.token_type import *

In [4]:
def prf2dict(dct, measurename, obj):
    """Store the precision, recall and f-score of a measure object in ``dct``.

    The values are read from ``obj.precision``, ``obj.recall`` and
    ``obj.fscore`` and written under the keys ``<measurename>_P``,
    ``<measurename>_R`` and ``<measurename>_F``.  ``dct`` is updated in
    place and is also returned.
    """
    for suffix, attribute in (('_P', 'precision'), ('_R', 'recall'), ('_F', 'fscore')):
        dct[measurename + suffix] = getattr(obj, attribute)
    return dct


def compute_scores(gold, disc, measures=None):
    """Run the selected evaluation measures and gather their scores in a dict.

    Parameters
    ----------
    gold
        Object passed to ``Boundary``, ``TokenType``, ``Coverage`` and
        ``Coverage_NoSingleton`` (built with ``Gold(...)`` in this notebook).
    disc
        Object passed to every measure (built with ``Disc(...)`` in this
        notebook); ``Grouping`` and ``Ned`` receive only this argument.
    measures : sequence of str, optional
        Any of ``"boundary"``, ``"grouping"``, ``"token/type"``,
        ``"coverage"``, ``"coverageNS"`` and ``"ned"``.  ``None`` or an empty
        sequence runs all of them.

    Returns
    -------
    dict
        ``<name>_P`` / ``<name>_R`` / ``<name>_F`` entries for boundary,
        grouping, token and type, plus ``coverage``, ``coverageNS`` and
        ``ned``, restricted to the measures that were run.

    A progress line is printed before each measure is computed.
    """
    # ``None`` replaces the former mutable ``[]`` default; an empty selection
    # still means "run everything".
    if measures is None:
        measures = ()
    run_all = len(measures) == 0

    def selected(name):
        # A measure runs when nothing was selected or when it was named.
        return run_all or name in measures

    scores = dict()

    if selected("boundary"):
        print('Computing Boundary...')
        boundary = Boundary(gold, disc)
        boundary.compute_boundary()
        scores = prf2dict(scores, 'boundary', boundary)

    if selected("grouping"):
        print('Computing Grouping...')
        grouping = Grouping(disc)
        grouping.compute_grouping()
        scores = prf2dict(scores, 'grouping', grouping)

    if selected("token/type"):
        print('Computing Token and Type...')
        token_type = TokenType(gold, disc)
        token_type.compute_token_type()
        # Index 0 of each attribute is stored as the token score, index 1 as
        # the type score.
        scores['token_P'], scores['token_R'], scores['token_F'] = (
            token_type.precision[0], token_type.recall[0], token_type.fscore[0])
        scores['type_P'], scores['type_R'], scores['type_F'] = (
            token_type.precision[1], token_type.recall[1], token_type.fscore[1])

    if selected("coverage"):
        print('Computing Coverage...')
        coverage = Coverage(gold, disc)
        coverage.compute_coverage()
        scores['coverage'] = coverage.coverage

    if selected("coverageNS"):
        print('Computing Coverage No Single...')
        # Separate name so the plain coverage object is not overwritten.
        coverage_ns = Coverage_NoSingleton(gold, disc)
        coverage_ns.compute_coverage()
        scores['coverageNS'] = coverage_ns.coverage

    if selected("ned"):
        print('Computing NED...')
        ned = Ned(disc)
        ned.compute_ned()
        scores['ned'] = ned.ned

    return scores

In [5]:
# NOTE(review): no visible cell imports `os` by name; presumably one of the
# star imports above provides it — confirm, or add an explicit `import os`.
os.chdir('build/lib/')

In [6]:
# Move the working directory two levels up (the inverse of entering
# 'build/lib/' from its parent's parent).
os.chdir('../../')

In [26]:
# Make a local script folder importable.
# NOTE(review): absolute, machine-specific path — it will not exist elsewhere.
import sys
sys.path.append('/home/korhan/Dropbox/tez_scripts/')
# Only sdtw2tde is called in this cell; read_config appears only in
# commented-out lines.
from tdev2.utils import zrexp2tde, sdtw2tde, narrow_gold, read_config

# Annotation files later passed to Gold(wrd_path=..., phn_path=...), and the
# config file name.
wrd_path = 'tdev2/share/phoenix.wrd'
phn_path = 'tdev2/share/phoenix.phn'
cnf_path = 'config.json'

# read_config(cnf_path)

# exp_path = '/home/korhan/Desktop/zerospeech2017/track2/src/ZRTools/exp/'
# exp_path = '/home/korhan/Desktop/tez/sdtw_exps/sdtw_pairs_A_0_341_c3right_PCA_Wh_diag_olapFalse_w8_L2_extend_r0.3_diag_thr0.7_loss_funceuclid_end_cut0/postpairwise_cost0.001_olap0.25/'
# Experiment directory to evaluate (note that it ends with a double slash).
exp_path = '/home/korhan/Desktop/tez/zr_exps/zr_mDGS_r_c3r_PCA40_Wh_rhothr0.3_T0.5_D10_dx18_dy9_medthr0.4_Tscore0.5_R12_castthr8_trimthr1.25_B50_P8/postpairwise_cost0.0006181300604703248_olap0.25//'

# Per the recorded output, the returned value is the path of
# 'master_graph.class' inside exp_path.
disc_clsfile = sdtw2tde(exp_path)

print(disc_clsfile)

/home/korhan/Desktop/tez/zr_exps/zr_mDGS_r_c3r_PCA40_Wh_rhothr0.3_T0.5_D10_dx18_dy9_medthr0.4_Tscore0.5_R12_castthr8_trimthr1.25_B50_P8/postpairwise_cost0.0006181300604703248_olap0.25//master_graph.class


In [24]:
from tdev2.eval_sign import try_compute_scores

In [31]:
# Keyword arguments unpacked into both Gold(...) and try_compute_scores(...).
# NOTE(review): 'config_file' is an absolute path on the author's machine.
kwargs = {
#     'config_file' : 'config.json',
    'config_file' : '/home/korhan/Dropbox/config/config_mdgs.json',
         'njobs': 1}

In [126]:
# Point exp_path at another experiment directory (different
# postpairwise_cost value) and derive the class file from it again.
exp_path = '/home/korhan/Desktop/tez/zr_exps/zr_mDGS_r_c3r_PCA40_Wh_rhothr0.3_T0.5_D10_dx18_dy9_medthr0.4_Tscore0.5_R12_castthr8_trimthr1.25_B50_P8/postpairwise_cost0.010509214885954384_olap0.25/'
disc_clsfile = sdtw2tde(exp_path)

# Switch the annotation files from the phoenix set to mdgsClean_both.
wrd_path = 'tdev2/share/mdgsClean_both.wrd'
phn_path = 'tdev2/share/mdgsClean_both.phn'


In [127]:
# read_config(cnf_path)
# Build the Gold object from the annotation files and kwargs, then the Disc
# object from the class file; Disc also receives the Gold object.
gold = Gold(wrd_path=wrd_path, phn_path=phn_path, **kwargs) 
disc = Disc(disc_clsfile, gold) 


*** Config file read, ovth 100.0 ***
/home/korhan/Dropbox/config/config_mdgs.json
Discovered Class file read

966 unique intervals, 494 clusters with 971 nodes found


In [128]:
# An empty `measures` list is passed explicitly; the recorded log shows every
# measure being computed.
try_compute_scores(gold, disc, measures=[], **kwargs)

Computing Boundary...
Computing Grouping...
Number of grouping jobs: 1
Computing Token and Type...
Computing Coverage...
Computing Coverage...
Computing Coverage No Single...
*** Config file read, ovth 100.0 ***
Computing NED...


{'boundary_P': 1.0,
 'boundary_R': 0.0222,
 'boundary_F': 0.0435,
 'n_clus': 494,
 'n_node': 971,
 'grouping_P': 0.0221,
 'grouping_R': 0.2239,
 'grouping_F': 0.0401,
 'token_P': 0.0704,
 'token_R': 0.003,
 'token_F': 0.0058,
 'type_P': 0.046,
 'type_R': 0.0093,
 'type_F': 0.0154,
 'coverage': 0.046,
 'coverageNS': 0.0456,
 'coverageNS_f': 0.029,
 'ned': 0.8644}

In [46]:
# Display the class-file path currently held in the kernel.
disc_clsfile

'/home/korhan/Desktop/tez/zr_exps/zr_mDGS_r_c3r_PCA40_Wh_rhothr0.3_T0.5_D10_dx18_dy9_medthr0.4_Tscore0.5_R12_castthr8_trimthr1.25_B50_P8/postpairwise_cost0.0006181300604703248_olap0.25//master_graph.class'

In [25]:
# Score with the notebook-local compute_scores (no measures named, so all of
# them run), then add the number of clusters and of intervals.
scores = compute_scores(gold, disc)
scores['n_clus'] = len(disc.clusters)
scores['n_node'] = len(disc.intervals)
scores

Computing Boundary...
Computing Grouping...
Number of grouping jobs: 1
Computing Token and Type...
Computing Coverage...
Computing Coverage No Single...
Computing NED...


{'boundary_P': 0.5131578947368421,
 'boundary_R': 0.005875086619866831,
 'boundary_F': 0.011594830287040826,
 'grouping_P': 0.5307443365695792,
 'grouping_R': 0.45054945054945056,
 'grouping_F': 0.48687383012976415,
 'token_P': 0.20609579100145137,
 'token_R': 0.0023389501078881917,
 'token_F': 0.004625407166123778,
 'type_P': 0.4444444444444444,
 'type_R': 0.02924451665312754,
 'type_F': 0.054878048780487805,
 'coverage': 0.005818596691386195,
 'coverageNS': 0.006113361801525603,
 'ned': 0.5161290322580645,
 'n_clus': 521,
 'n_node': 689}

### save disc intervals and clusters

In [47]:
# Displays the whole clusters dict; the recorded output is cut off.
# NOTE(review): consider showing only a few entries instead.
disc.clusters

{'1': [('1420216_B_both_208220_210800',
   1.0,
   80.0,
   ((0.0, 6.0, 'ICH1'),
    (15.0, 21.0, '$NUM-EINER1A:1d'),
    (29.0, 38.0, 'RASIEREN2'),
    (39.0, 44.0, 'ICH1'),
    (58.0, 62.0, 'VOR1G'),
    (69.0, 75.0, '$NUM-EINER1A:1d')),
   ('ICH1',
    '$NUM-EINER1A:1d',
    'RASIEREN2',
    'ICH1',
    'VOR1G',
    '$NUM-EINER1A:1d')),
  ('1420216_B_both_208220_210800',
   50.0,
   128.0,
   ((58.0, 62.0, 'VOR1G'),
    (69.0, 75.0, '$NUM-EINER1A:1d'),
    (83.0, 91.0, 'RASIEREN2'),
    (96.0, 98.0, 'ICH1'),
    (107.0, 114.0, 'KLEIN3'),
    (121.0, 129.0, '$GEST-OFF')),
   ('VOR1G', '$NUM-EINER1A:1d', 'RASIEREN2', 'ICH1', 'KLEIN3', '$GEST-OFF'))],
 '2': [('1184756_A_both_283520_285920',
   0.0,
   71.0,
   ((0.0, 4.0, 'GUT1'), (27.0, 36.0, 'SCHÖN1A'), (41.0, 120.0, 'SEHR-VIEL1')),
   ('GUT1', 'SCHÖN1A', 'SEHR-VIEL1')),
  ('1184756_A_both_283520_285920',
   50.0,
   119.0,
   ((41.0, 120.0, 'SEHR-VIEL1'),),
   ('SEHR-VIEL1',))],
 '3': [('1419931_B_both_187880_189900',
   1.0,
   52.

In [39]:
import json
import os

# Join with os.path.join so the file lands inside exp_path whether or not
# exp_path ends with a path separator (plain string concatenation would
# otherwise glue the file name onto the last directory name).
with open(os.path.join(exp_path, 'clusters_tde.json'), 'w') as f:
    json.dump(disc.clusters, f)

### visualize clusters

In [119]:
def find_commons(labelslist):
    """Summarise the labels shared by at least one pair of label sets.

    Parameters
    ----------
    labelslist : list of set
        One set of labels per cluster member.

    Returns
    -------
    str
        ``'none'`` when no pair shares a label, otherwise the string form of
        the sorted list of shared labels, e.g. ``"['ICH1', 'SEHEN-AUF1']"``.

    A pair is skipped entirely when either of its sets contains the literal
    label ``'$'``.
    """
    shared = set()
    for i in range(len(labelslist)):
        if '$' in labelslist[i]:
            # Every pair involving this set would be skipped anyway.
            continue
        for j in range(i + 1, len(labelslist)):
            if '$' not in labelslist[j]:
                shared |= labelslist[i] & labelslist[j]

    if not shared:
        return 'none'
    # Sort so the text does not depend on set iteration order; key=str keeps
    # this from raising if labels of different types are ever mixed.
    return str(sorted(shared, key=str))

In [129]:
# Build one DataFrame per cluster (file, start, end, labels), tag it with the
# cluster id and the labels its members share, then stack all of them.
# NOTE(review): `pd` is not imported by name in any visible cell — confirm it
# is in scope (e.g. through a star import).
clusters = disc.clusters

all_clus = []
for cid, clus in clusters.items():
    clus_within = []
    # `tokens` is unpacked but not used; only the ngram label tuple is kept.
    for fname, start, end, tokens, ngram in clus:
        clus_within.append((fname,start,end,ngram))
    tmp = pd.DataFrame(clus_within, columns=['file','start','end','labels'])
    # find_commons receives one set of labels per cluster member.
    common_labels = find_commons(list(tmp.labels.apply(lambda x: set(x))))
    tmp['common_labels'] = common_labels
    tmp['cluster_id'] = cid
    tmp = tmp[['cluster_id', 'file','start','end', 'common_labels', 'labels']]
    all_clus.append(tmp)
# Per-cluster row indices are kept, so the index repeats (0, 1, 0, 1, ...)
# in the displayed frame.
pd.concat(all_clus)

Unnamed: 0,cluster_id,file,start,end,common_labels,labels
0,1,1420216_B_both_208220_210800,1.0,80.0,none,"(ICH1, $, RASIEREN2, ICH1, VOR1G, $)"
1,1,1420216_B_both_208220_210800,50.0,128.0,none,"(VOR1G, $, RASIEREN2, ICH1, KLEIN3, $)"
0,2,1184756_A_both_283520_285920,0.0,71.0,none,"(GUT1, SCHÖN1A)"
1,2,1184756_A_both_283520_285920,50.0,119.0,none,"(SEHR-VIEL1,)"
0,3,1419931_B_both_187880_189900,1.0,52.0,none,"($,)"
...,...,...,...,...,...,...
1,493,1184756_B_both_155020_292700,5365.0,5470.0,none,"(GLAUBEN2A, AB2, GLAUBEN2A, $, LOS-START1)"
0,494,1431690_B_both_113760_272000,2784.0,2886.0,none,"(ICH2, SEHEN1, FÜHREN1, ICH1, SEHEN-AUF1, AUF-..."
1,494,1431690_B_both_113760_272000,5314.0,5402.0,none,"($, NICHT3B)"
0,495,1184756_B_both_155020_292700,3132.0,3235.0,none,"($, SCHÖN1B, THEATER6, BEREICH1A, GUT1, $)"


In [130]:
# Keep only the rows whose cluster has at least one shared label, i.e. where
# find_commons did not return 'none'.
alldf = pd.concat(all_clus)
alldf.loc[alldf.common_labels != 'none']

Unnamed: 0,cluster_id,file,start,end,common_labels,labels
0,75,1177918_B_both_32720_40900,1.0,84.0,['SAGEN1'],"(NEIN1A, MOMENT2, SAGEN1)"
1,75,1177918_B_both_32720_40900,56.0,139.0,['SAGEN1'],"(SAGEN1, AUCH3A, DAZU1)"
0,121,1177918_A_both_112820_172780,1.0,77.0,['JA2'],"(JA2,)"
1,121,1177918_A_both_112820_172780,56.0,132.0,['JA2'],"(JA2,)"
0,158,1184756_B_both_691540_694720,31.0,110.0,['SCHWEIZ1A'],"(SCHWEIZ1A,)"
1,158,1184756_B_both_691540_694720,80.0,158.0,['SCHWEIZ1A'],"(SCHWEIZ1A,)"
0,238,1431690_B_both_113760_272000,2787.0,2891.0,['SEHEN-AUF1'],"(SEHEN1, FÜHREN1, ICH1, SEHEN-AUF1, AUF-PERSON1)"
1,238,1431690_B_both_113760_272000,3224.0,3346.0,['SEHEN-AUF1'],"(SEHEN-AUF2, SEHR6, ARMER1, ICH2, SEHEN-AUF1, ..."
0,253,1431690_B_both_113760_272000,2784.0,2885.0,"['ICH1', 'SEHEN-AUF1']","(ICH2, SEHEN1, FÜHREN1, ICH1, SEHEN-AUF1, AUF-..."
1,253,1431690_B_both_113760_272000,2917.0,3013.0,"['ICH1', 'SEHEN-AUF1']","(ICH1, SEHEN-AUF2, ARMER1, SEHEN-AUF1)"


In [110]:
# NOTE(review): scratch check; it depends on whatever `common_labels` was
# left in the kernel by an earlier cell run.
str(common_labels)

"['SEHR-VIEL1']"

### Write and read the config file

In [28]:
# Settings dict that the following cell writes to config.json.
conf = {'overlap_th': 5. ,
'excluded_units' : ['SIL','__ON__','__OFF__','__EMOTION__','SPN'] ,
'discoverable_th' : 1}
conf

{'overlap_th': 5.0,
 'excluded_units': ['SIL', '__ON__', '__OFF__', '__EMOTION__', 'SPN'],
 'discoverable_th': 1}

In [29]:
import json
config_pth = 'config.json'

# Serialise the conf dict to config.json, relative to the current working
# directory.
with open(config_pth, 'w') as f:
    json.dump(conf, f)

In [32]:
# Read the file back (rebinding `conf`) and show one value to confirm the
# round trip.
with open(config_pth, 'r') as f:
    conf = json.load(f)
    
conf['overlap_th']

5.0

In [1]:
import editdistance

In [2]:
# Quick check of editdistance.eval on an empty sequence versus a
# one-element sequence; the recorded result is 1.
s1 = []
s2 = ['s']
editdistance.eval(s1, s2)

1