/*
AFFIXTRAIN - supervised learning of affix rules for CSTLEMMA
Copyright (C) 2012 Center for Sprogteknologi, University of Copenhagen
This file is part of AFFIXTRAIN.
AFFIXTRAIN is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
AFFIXTRAIN is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with AFFIXTRAIN; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "comp.h"
#include "affixtrain.h"
#include "graph.h"
#include "optionaff.h"
#include <float.h>
#define ZIGGURAT 1
#if ZIGGURAT
#include "rnorrexp.c"
#endif
#define NPARMS parms.ROWPARMS
/*
ACL 2009 paper:
Icelandic 71.3 1.5 even_better (71,30 1,51 according to D:\dokumenter\tvärsök\even_better\icelandic.xls) peen 71,51 1,65 sugar 70,93 1,86 affiksFEW3 71,02 2,16 no pruning
Danish 92.8 0.2 peen sugar: 92,72 0,19 no pruning
Norwegian 87.6 0.3 affiksFEW2 sugar: 86,67 0,68
Greek 90.4 0.4 sugar no pruning
Slovene 86.7 0.3 affiksFEW3 affiksFEW2: 86,23 0,58 sugar: 86,27 0,41 peen:86,13 0,55 0,4
Swedish 92.3 0.3 sugar pruning 1
German 91.46 0.17 sugar no pruning
English 89.0 1.3 sugar pruning 2
Dutch 90.4 0.5 affiksFEW2 sugar: 90,17 0,31 0,3 no pruning
Polish 93.88 0.08 peen sugar: 93,88 0,08 (?) no pruning
*/
#if _NA
// IMPORTANT (20090511) R__NA and W__NA are not updated as sibling rules are
// added and eat up the training pairs that earlier siblings did not handle.
// This error was detected only after the weight functions had been used for
// the ACL paper.
static int comp_fairly_good(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//fairly good, Icelandic 71.270883
//AMBI:
// French ok 85.767516 ambi1 1.156051 ambi2 0.955414 diff 12.121019 rules 7337.500000 2.731849% cutoff 2
int A1 = a->W__R + a->R__R;
int B1 = b->W__R + b->R__R;
int A2 = a->W__R + a->W__W + a->R__NA;
int B2 = b->W__R + b->W__W + b->R__NA;
int A3 = a->W__R + a->R__R + a->R__NA;
int B3 = b->W__R + b->R__R + b->R__NA;
/* int A2 = a->R__NA - a->W__NA;
int B2 = b->R__NA - b->W__NA;
int A3 = a->W__R - a->R__W;
int B3 = b->W__R - b->R__W;
*/
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
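#if 0 /* Illustration only, not part of the original code. */
// Every comp_* weight function in this file follows the same idiom: derive
// three scores per vertex and compare them lexicographically, highest value
// first, so that sorting puts the most promising rule candidate in front.
// A minimal sketch of that idiom (the function name is hypothetical):
static int lexicographic3(int A1,int B1,int A2,int B2,int A3,int B3)
{
if(A1 != B1) return (A1 > B1) ? -1 : 1; // primary score, descending
if(A2 != B2) return (A2 > B2) ? -1 : 1; // first tie-breaker
return (A3 > B3) ? -1 : (A3 < B3) ? 1 : 0; // second tie-breaker
}
#endif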
#if _NA
static int comp_even_better(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//even better, Icelandic 71.300716
// BEST Icelandic 71.535870 +/- 1.919590 at 0.9856 of dataset, 17 iterations, 23209.882353 = 40.477646% rules, cutoff = 0
// Icelandic 71.283167 +/- 1.714260 at 0.9856 of dataset, 17 iterations, 22719.470588 = 39.622376% rules, cutoff = 0, RECURSE == 4
//AMBI:
// French ok 85.487261 ambi1 1.283439 ambi2 1.050955 diff 12.178344 rules 7360.125000 2.740283% cutoff 2
int A1 = a->W__R + a->R__R;// wr + rr
int B1 = b->W__R + b->R__R;
int A2 = a->W__R + a->R__R + a->R__NA;// wr + rr + rn - r = wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__W + a->R__NA;// wr + ww + rn - w = -wn + rn
int B3 = b->W__R + b->W__W + b->R__NA;
// int A2 = a->W__R - a->R__W;
// int B2 = b->W__R - b->R__W;
// int A3 = a->R__NA - a->W__NA;
// int B3 = b->R__NA - b->W__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_affiksFEW3(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
// Icelandic 65.781623, cutoff 1 (old lemmatizer 73.329356, cutoff 0)
// Icelandic 66.544995 +/- 1.943469 at 0.9856 of dataset, 17 iterations, 11134.176471 = 19.417817% rules, cutoff = 1
// English 87.863636, cutoff 2 (old 87.954545, cutoff 1)
// English 87.806061 +/- 1.009323 at 0.9856 of dataset, 15 iterations, 1619.133333 = 2.152101% rules (cutoff = 2)
// BEST Slovene 86.669776 +/- 0.331106 at 0.9856 of dataset, 9 iterations, 5650.777778 = 2.888237% rules (cutoff = 2)
// Slovene-ambi (4.23%) 83.165661, cutoff 3 (3550 rules!) (old 82.017103, cutoff 1, 9377 rules) Better than _affiksFEW2, 82.780013, 6656 rules.
// Danish 90.942165 +/- 0.589437 at 0.9856 of dataset, 5 iterations, 32327.400000 = 5.925881% rules, cutoff = 1
// German 90.266461 +/- 0.509202 at 0.9856 of dataset, 7 iterations, 21539.428571 = 6.930653% rules, cutoff = 1
// Greek 89.640779 +/- 0.402079 at 0.9856 of dataset, 5 iterations, 13377.200000 = 2.472132% rules, cutoff = 2
// Dutch 87.817059 +/- 0.366236 at 0.9856 of dataset, 7 iterations, 23493.571429 = 7.895486% rules, cutoff = 1
// Norwegian 85.788507 +/- 0.484921 at 0.9856 of dataset, 6 iterations, 14904.000000 = 3.157580% rules, cutoff = 2
// Polish 93.203365 +/- 0.175436 at 0.9856 of dataset, 2 iterations, 50597.500000 = 1.491153% rules, cutoff = 2
// Swedish 91.709042 +/- 0.170094 at 0.9856 of dataset, 6 iterations, 4407.666667 = 0.935737% rules, cutoff = 3
//AMBI:
// French ok 82.754777 ambi1 2.353503 ambi2 1.805732 diff 13.085987 rules 7360.125000 2.740283% cutoff 2
/* Interesting because it generates far fewer rules than the variants
above, only 20 % more than the old lemmatizer.
Also interesting is that there are not many leaves with only one
supporting training pair.
Yet, the leaves with only one supporter are detrimental to the overall
result (the cutoff has to be 1 or even 2).
*/
#if 1
int A1 = a->W__R + a->R__R + a->R__NA; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R + b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->W__R + a->W__W + a->R__NA; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->W__R + b->W__W + b->R__NA; // Bad: previously wrong words that didn't match. They must be handled by siblings.
#else
int A1 = a->W__R - a->R__W; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R - b->R__W;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->W__R + a->W__W - a->R__R - a->R__W; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->W__R + b->W__W - b->R__R - b->R__W; // Bad: previously wrong words that didn't match. They must be handled by siblings.
#endif
/*
int A1 = a->W__R - a->R__W; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R - b->R__W;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->R__NA - a->W__NA; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->R__NA - b->W__NA; // Bad: previously wrong words that didn't match. They must be handled by siblings.
*/
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_affiksFEW(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
/**/
//_affiksFEW
// Dutch 88.138224, 39943.5 flexrules cutoff 1 (old 89.656164, 47277.75 flexrules, cutoff 1)
// German 90.266461 +/- 0.509202 at 0.9856 of dataset, 7 iterations, 21539.428571 = 6.930653% rules, cutoff = 1
//AMBI:
// French ok 82.617834 ambi1 2.455414 ambi2 1.872611 diff 13.054140 rules 7360.125000 2.740283% cutoff 2
int N = a->W__W + a->W__R + a->W__NA + a->R__W + a->R__R + a->R__NA;
int A1;
int B1;
int A2;
int B2;
int A3;
int B3;
// good for small numbers:
if(N < 3)
{
A1 = a->W__R + a->R__R;
B1 = b->W__R + b->R__R;
A2 = a->W__R + a->R__R + a->R__NA;
B2 = b->W__R + b->R__R + b->R__NA;
A3 = a->W__R + a->W__W + a->R__NA;
B3 = b->W__R + b->W__W + b->R__NA;
/* A1 = a->W__R + a->R__R;
B1 = b->W__R + b->R__R;
A2 = a->W__R - a->R__W;
B2 = b->W__R - b->R__W;
A3 = a->R__NA - a->W__NA;
B3 = b->R__NA - b->W__NA;
*/
}
// good for big numbers:
else
{
A1 = a->W__R + a->R__R + a->R__NA; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
B1 = b->W__R + b->R__R + b->R__NA;
A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
B2 = b->W__R + b->R__R;
A3 = a->R__NA + a->W__R + a->W__W; // Good: previously right words that didn't match. They may return to the parent.
B3 = b->R__NA + b->W__R + b->W__W; // Bad: previously wrong words that didn't match. They must be handled by siblings.
/* A1 = a->W__R - a->R__W; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
B1 = b->W__R - b->R__W;
A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
B2 = b->W__R + b->R__R;
A3 = a->R__NA - a->W__NA; // Good: previously right words that didn't match. They may return to the parent.
B3 = b->R__NA - b->W__NA; // Bad: previously wrong words that didn't match. They must be handled by siblings.
*/
}
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_affiksFEW2(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//_affiksFEW2
// (OK) BEST Dutch 90.452096 +/- 0.655431 at 0.9856 of dataset, 7 iterations, 53607.714286 = 18.015948% rules, cutoff = 0
// (OK) Norwegian 86.776860 +/- 0.642621 at 0.9856 of dataset, 6 iterations, 112374.000000 = 23.807698% rules, cutoff = 0
// (OK) English 88.424242 +/- 1.191106 at 0.9856 of dataset, 15 iterations, 1383.000000 = 1.838240% rules, cutoff = 2
// (OK) Icelandic 71.304226 +/- 1.453643 at 0.9856 of dataset, 17 iterations, 25635.000000 = 44.707011% rules, cutoff = 0
// (OK) German 91.156762 +/- 0.348391 at 0.9856 of dataset, 7 iterations, 48816.571429 = 15.707506% rules, cutoff = 0
// (OK) Slovene 86.537639 +/- 0.559484 at 0.9856 of dataset, 9 iterations, 40643.444444 = 20.773759% rules, cutoff = 0
// (OK) Swedish 91.907598 +/- 0.224888 at 0.9856 of dataset, 6 iterations, 27958.000000 = 5.935415% rules, cutoff = 1
// (OK) Greek 90.741209 +/- 0.312526 at 0.9856 of dataset, 5 iterations, 125306.400000 = 23.156860% rules, cutoff = 0
// (OK) Danish 92.994605 +/- 0.210674 at 0.9856 of dataset, 5 iterations, 67278.800000 = 12.332763% rules, cutoff = 0
// (?)ALMOST BEST Polish 93.398015 +/- 0.045642 at 0.9856 of dataset, 2 iterations, 165511.500000 = 4.877770% rules, cutoff = 1
//AMBI:
// French ok 84.194268 ambi1 2.277070 ambi2 1.576433 diff 11.952229 rules 6453.250000 2.402640% cutoff 2
/* 20140922,
nohup nice /home/zgk261/bin/testrules -I -D /home/zgk261/sandkasse/nl/dict_nl_non_ambiguous -L nl -C affiksFEW2 -A >/home/zgk261/sandkasse/nl/testout 2>/home/zgk261/sandkasse/nl/testerr &
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 297719.000000 41163.857143 50127.285714 13.826413 16.837113 3830.428571 0.000000 0.000000 0.000000 519.571429 3922.428571 0.000000 0.000000 0.000000 427.571429 0.539105 0.000000 0.000000 0.000000 0.539105 0.440368 0.000000 0.000000 0.000000 0.440368 88.055829 0.000000 0.000000 0.000000 11.944171 0.000000 90.170772 0.000000 0.000000 0.000000 9.829228 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
1 0.985600 7 297719.000000 11178.571429 22834.000000 3.754739 7.669648 3803.285714 0.000000 0.000000 0.000000 546.714286 3914.285714 0.000000 0.000000 0.000000 435.714286 0.587252 0.000000 0.000000 0.000000 0.587252 0.570207 0.000000 0.000000 0.000000 0.570207 87.431856 0.000000 0.000000 0.000000 12.568144 0.000000 89.983580 0.000000 0.000000 0.000000 10.016420 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
2 0.985600 7 297719.000000 6855.000000 8014.285714 2.302507 2.691896 3757.857143 0.000000 0.000000 0.000000 592.142857 3895.857143 0.000000 0.000000 0.000000 454.142857 0.549072 0.000000 0.000000 0.000000 0.549072 0.530304 0.000000 0.000000 0.000000 0.530304 86.387521 0.000000 0.000000 0.000000 13.612479 0.000000 89.559934 0.000000 0.000000 0.000000 10.440066 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
3 0.985600 7 297719.000000 5109.142857 5404.142857 1.716096 1.815182 3726.285714 0.000000 0.000000 0.000000 623.714286 3868.571429 0.000000 0.000000 0.000000 481.428571 0.586802 0.000000 0.000000 0.000000 0.586802 0.407759 0.000000 0.000000 0.000000 0.407759 85.661741 0.000000 0.000000 0.000000 14.338259 0.000000 88.932677 0.000000 0.000000 0.000000 11.067323 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
4 0.985600 7 297719.000000 4082.571429 4193.857143 1.371283 1.408663 3694.285714 0.000000 0.000000 0.000000 655.714286 3844.714286 0.000000 0.000000 0.000000 505.285714 0.695342 0.000000 0.000000 0.000000 0.695342 0.391416 0.000000 0.000000 0.000000 0.391416 84.926108 0.000000 0.000000 0.000000 15.073892 0.000000 88.384236 0.000000 0.000000 0.000000 11.615764 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
5 0.985600 7 297719.000000 3462.714286 3482.571429 1.163081 1.169751 3667.000000 0.000000 0.000000 0.000000 683.000000 3827.000000 0.000000 0.000000 0.000000 523.000000 0.788006 0.000000 0.000000 0.000000 0.788006 0.407141 0.000000 0.000000 0.000000 0.407141 84.298851 0.000000 0.000000 0.000000 15.701149 0.000000 87.977011 0.000000 0.000000 0.000000 12.022989 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
cutoff 0 Affix a 0.676984 b 0.804745: N(rules)= 1.967934*N(trainpairs)^0.804745
Suffix a 0.181309 b 0.828157: N(rules)= 1.198786*N(trainpairs)^0.828157
cutoff 1 Affix a 0.020304 b 0.794314: N(rules)= 1.020512*N(trainpairs)^0.794314
Suffix a -1.577537 b 0.865510: N(rules)= 0.206483*N(trainpairs)^0.865510
cutoff 2 Affix a -1.037201 b 0.799804: N(rules)= 0.354446*N(trainpairs)^0.799804
Suffix a -2.496919 b 0.900882: N(rules)= 0.082338*N(trainpairs)^0.900882
cutoff 3 Affix a -1.918028 b 0.840841: N(rules)= 0.146896*N(trainpairs)^0.840841
Suffix a -2.960479 b 0.913754: N(rules)= 0.051794*N(trainpairs)^0.913754
cutoff 4 Affix a -2.344658 b 0.853928: N(rules)= 0.095880*N(trainpairs)^0.853928
Suffix a -3.219691 b 0.916661: N(rules)= 0.039967*N(trainpairs)^0.916661
cutoff 5 Affix a -2.716219 b 0.868745: N(rules)= 0.066124*N(trainpairs)^0.868745
Suffix a -3.525562 b 0.927406: N(rules)= 0.029435*N(trainpairs)^0.927406
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 0
fraction 9856.000000
iterations 7
trainlines 297719.000000
rules 50127.285714 ( 41163.857143)
rules% 16.837113 ( 13.826413)
same%stdev 0.440368
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.440368
same% 90.170772 ( 88.055829)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.829228 ( 11.944171)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 0.000000 ( 0.000000)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 100.000000 ( 100.000000)
precision 0.000000 ( 0.000000)
recall -nan ( -nan)
*/
/* 20140922, same as above, except that data are ambiguous.
nohup nice /home/zgk261/bin/testrules -I -D /home/zgk261/sandkasse/nl/dict_nl_without_doubles_UTF8 -L nl -C affiksFEW2 -A >/home/zgk261/sandkasse/nl/testout 2>/home/zgk261/sandkasse/nl/testerr &
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 50311.000000 23.833299 16.396066 3635.857143 78.000000 117.000000 4.714286 648.428571 3843.428571 0.000000 0.000000 0.000000 640.571429 0.368578 0.192276 0.216604 0.027956 0.556177 0.484660 0.000000 0.000000 0.000000 0.484660 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 85.714286 0.000000 0.000000 0.000000 14.285714 0.000000 4.552695 6.203008 0.430101 88.814197 0.045105 0.064841 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
1 0.985600 7 306848.000000 18391.714286 22871.857143 5.993754 7.453807 3743.714286 0.000000 0.000000 0.000000 740.285714 3897.142857 0.000000 0.000000 0.000000 586.857143 0.341564 0.000000 0.000000 0.000000 0.341564 0.373271 0.000000 0.000000 0.000000 0.373271 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 86.912196 0.000000 0.000000 0.000000 13.087804 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
2 0.985600 7 306848.000000 11355.142857 8029.142857 3.700576 2.616652 3696.857143 0.000000 0.000000 0.000000 787.142857 3909.714286 0.000000 0.000000 0.000000 574.285714 0.396739 0.000000 0.000000 0.000000 0.396739 0.301886 0.000000 0.000000 0.000000 0.301886 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 87.192558 0.000000 0.000000 0.000000 12.807442 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
3 0.985600 7 306848.000000 8112.285714 5438.857143 2.643747 1.772492 3652.571429 0.000000 0.000000 0.000000 831.428571 3888.428571 0.000000 0.000000 0.000000 595.571429 0.255206 0.000000 0.000000 0.000000 0.255206 0.205149 0.000000 0.000000 0.000000 0.205149 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 86.717854 0.000000 0.000000 0.000000 13.282146 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
4 0.985600 7 306848.000000 6239.285714 4213.000000 2.033347 1.372992 3615.000000 0.000000 0.000000 0.000000 869.000000 3864.000000 0.000000 0.000000 0.000000 620.000000 0.273440 0.000000 0.000000 0.000000 0.273440 0.204397 0.000000 0.000000 0.000000 0.204397 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 86.173060 0.000000 0.000000 0.000000 13.826940 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
5 0.985600 7 306848.000000 5044.285714 3494.285714 1.643904 1.138768 3592.714286 0.000000 0.000000 0.000000 891.285714 3847.428571 0.000000 0.000000 0.000000 636.571429 0.262209 0.000000 0.000000 0.000000 0.262209 0.203118 0.000000 0.000000 0.000000 0.203118 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 85.803492 0.000000 0.000000 0.000000 14.196508 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
cutoff 0 Affix a 0.884193 b 0.790803: N(rules)= 2.421031*N(trainpairs)^0.790803
Suffix a -0.706998 b 0.933881: N(rules)= 0.493123*N(trainpairs)^0.933881
cutoff 1 Affix a 0.242492 b 0.779144: N(rules)= 1.274421*N(trainpairs)^0.779144
Suffix a -2.482111 b 0.967669: N(rules)= 0.083567*N(trainpairs)^0.967669
cutoff 2 Affix a -0.969721 b 0.794266: N(rules)= 0.379189*N(trainpairs)^0.794266
Suffix a -3.356905 b 0.996574: N(rules)= 0.034843*N(trainpairs)^0.996574
cutoff 3 Affix a -1.776403 b 0.828451: N(rules)= 0.169246*N(trainpairs)^0.828451
Suffix a -3.794027 b 1.003571: N(rules)= 0.022505*N(trainpairs)^1.003571
cutoff 4 Affix a -2.351608 b 0.854877: N(rules)= 0.095216*N(trainpairs)^0.854877
Suffix a -4.020226 b 1.000475: N(rules)= 0.017949*N(trainpairs)^1.000475
cutoff 5 Affix a -2.757390 b 0.871710: N(rules)= 0.063457*N(trainpairs)^0.871710
Suffix a -4.205819 b 0.997875: N(rules)= 0.014909*N(trainpairs)^0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 8029.142857 ( 11355.142857)
rules% 2.616652 ( 3.700576)
same%stdev 0.301886
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.301886
same% 87.192558 ( 82.445521)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 12.807442 ( 17.554479)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 6.633108 ( 6.633108)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 93.366892 ( 93.366892)
precision 0.000000 ( 0.000000)
recall 0.000000 ( 0.000000)
*/
#if 1 // 20090511
int A1 = a->W__R + 2*a->R__R + a->R__NA; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr + 2rr + rn - r = ww + rr - rw
int B1 = b->W__R + 2*b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
// wr + rr + rn - r = wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__W + a->R__NA;
// wr + ww + rn - w = -wn + rn
int B3 = b->W__R + b->W__W + b->R__NA;
#else
int A1 = a->W__R + a->R__R - a->R__W; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr + 2rr + rn - r = ww + rr - rw
int B1 = b->W__R + b->R__R - b->R__W;
int A2 = a->W__R - a->R__W;
// wr + rr + rn - r = wr - rw
int B2 = b->W__R - b->R__W;
int A3 = a->W__R + a->W__W - a->R__R - a->R__W;
// wr + ww + rn - w = -wn + rn
int B3 = b->W__R + b->W__W - b->R__R - b->R__W;
#endif
/* int A1 = a->W__R + a->R__R - a->R__W; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
int B1 = b->W__R + b->R__R - b->R__W;
int A2 = a->W__R - a->R__W;
int B2 = b->W__R - b->R__W;
int A3 = a->R__NA - a->W__NA;
int B3 = b->R__NA - b->W__NA;
*/
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
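#if 0 /* Illustration only, not part of the original code. */
// The "cutoff N Affix a ... b ...: N(rules)= ..." lines logged in the
// comments above are fits of log(N(rules)) = a + b*log(N(trainpairs)),
// i.e. the rule count grows as the power law N(rules) = e^a * N(trainpairs)^b
// (e.g. e^0.676984 = 1.967934, the coefficient of the first fit). A minimal
// sketch of evaluating such a fit (the function name is hypothetical):
#include <math.h>
static double expectedRuleCount(double a,double b,double trainpairs)
{
return exp(a) * pow(trainpairs,b);
}
#endif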
#if _NA
static int comp_affiksFEW2org(const vertex * a,const vertex * b)
{
// BEST Norwegian 87.494563 +/- 0.217147 at 0.9856 of dataset, 6 iterations, 101814.500000 = 21.570549% rules, cutoff = 0
// English 88.260606 +/- 0.826699 at 0.9856 of dataset, 15 iterations, 7362.466667 = 9.785960% rules, cutoff = 1
// Icelandic 70.651411 +/- 1.565857 at 0.9856 of dataset, 17 iterations, 23232.941176 = 40.517860% rules, cutoff = 0
// German 90.307358 +/- 0.355867 at 0.9856 of dataset, 7 iterations, 50595.857143 = 16.280019% rules, cutoff = 0
// Dutch 90.274675 +/- 0.462929 at 0.9856 of dataset, 7 iterations, 23452.142857 = 7.881563% rules, cutoff = 1
// Slovene 86.417162 +/- 0.540735 at 0.9856 of dataset, 9 iterations, 40847.666667 = 20.878142% rules, cutoff = 0
// Swedish 91.982663 +/- 0.250703 at 0.9856 of dataset, 6 iterations, 28998.000000 = 6.156204% rules, cutoff = 1
// Greek 90.258032 +/- 0.234665 at 0.9856 of dataset, 5 iterations, 43156.000000 = 7.975310% rules, cutoff = 1 (but exactly the same as cutoff = 0)
// Danish 92.425041 +/- 0.374415 at 0.9856 of dataset, 5 iterations, 73177.800000 = 13.414099% rules, cutoff = 0
//AMBI:
// French ok 84.761146 ambi1 2.015924 ambi2 1.665605 diff 11.557325 rules 7262.500000 2.703935% cutoff 2
int A1 = a->W__R + a->R__R - a->R__W; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
int B1 = b->W__R + b->R__R - b->R__W;
int A2 = a->W__R - a->R__W;
int B2 = b->W__R - b->R__W;
int A3 = a->R__NA - a->W__NA;
int B3 = b->R__NA - b->W__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_fixNA(const vertex * a,const vertex * b)
{
/*
Icelandic 47.982267 (at 0.8488 of dataset)
*/
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//_fixNA
//AMBI:
// French: stopped because of very bad results. (> 25% wrong results)
int A1 = a->W__R + a->W__NA + a->R__NA;
int B1 = b->W__R + b->W__NA + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->R__NA;
int B3 = b->W__R + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_fruit(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 71.344041 at 0.939 of dataset
//ALMOST BEST Icelandic 71.521831 +/- 1.988737 at 0.9856 of dataset, 17 iterations, 23539.352941 = 41.052237% rules
//Slovene 85.900276 +/- 0.456532 at 0.9856 of dataset, 9 iterations, 42167.333333 = 21.552652% rules
//English 87.626771 +/- 0.060148 at 0.4928 (!) of dataset, 3 iterations, 933.000000 = 2.480262% rules
//AMBI:
// French ok 85.382166 ambi1 1.359873 ambi2 1.089172 diff 12.168790 rules 7259.125000 2.899075% cutoff 2
int A1 = a->W__R + a->R__R;
int B1 = b->W__R + b->R__R;
int A2 = a->W__R + a->R__R + a->R__NA;
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__NA + a->R__NA;
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_ice(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 60.242322 at 0.939 of dataset
//AMBI:
// French ok 82.557325 ambi1 2.522293 ambi2 1.866242 diff 13.054140 rules 8556.625000 3.185757% cutoff 2
int A1 = a->W__R + a->R__R + a->R__NA;
int B1 = b->W__R + b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R;
int B2 = b->W__R + b->R__R;
int A3 = a->W__R + a->W__NA + a->R__NA;
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_pisang(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 71.287687 at 0.939 of dataset
//AMBI:
// French ok 85.414013 ambi1 1.359873 ambi2 1.085987 diff 12.140127 rules 7848.375000 2.922065% cutoff 2
int A1 = a->W__R + a->R__R;
int B1 = b->W__R + b->R__R;
int A2 = a->W__R + a->W__NA + a->R__NA;
int B2 = b->W__R + b->W__NA + b->R__NA;
int A3 = a->W__R + a->R__R + a->R__NA;
int B3 = b->W__R + b->R__R + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_kiwi(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 70.865032 at 0.939 of dataset
//AMBI:
// French ok 85.410828 ambi1 1.378981 ambi2 1.035032 diff 12.175159 rules 7676.875000 2.858213% cutoff 2
int A1 = a->W__R + a->R__R;
int B1 = b->W__R + b->R__R;
int A2 = a->W__R + a->W__NA + a->R__R + a->R__NA;
int B2 = b->W__R + b->W__NA + b->R__R + b->R__NA;
int A3 = a->W__R + a->R__NA;
int B3 = b->W__R + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_carrot(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 71.090448 at 0.939 of dataset
//AMBI:
// French ok 85.060510 ambi1 1.328025 ambi2 1.041401 diff 12.570064 rules 7241.625000 2.696163% cutoff 2
int A1 = 4*(a->W__R + a->R__R) + a->R__NA;
int B1 = 4*(b->W__R + b->R__R) + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__NA + a->R__NA;
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_peen(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
// Icelandic 71.344041 at 0.939 of dataset
// ALMOST BEST Icelandic 71.507792 +/- 1.645702 at 0.9856 of dataset, 17 iterations, 25240.882353 = 44.019676% rules
// Slovene 86.133458 +/- 0.549185 at 0.9856 of dataset, 9 iterations, 40898.777778 = 20.904266% rules
// English 87.803261 +/- 0.106156 at 0.4928 (!) of dataset, 3 iterations, 889.333333 = 2.364179% rules
// Dutch 89.837692 +/- 0.412795 at 0.9856 of dataset, 7 iterations, 56640.285714 = 19.035104% rules, cutoff = 0
// ALMOST BEST German 91.288892 +/- 0.670828 at 0.9856 of dataset, 7 iterations, 50584.857143 = 16.276480% rules, cutoff = 0
// Swedish 91.873698 +/- 0.367967 at 0.9856 of dataset, 6 iterations, 9066.166667 = 1.924725% rules, cutoff = 2
// ALMOST BEST Norwegian 87.535644 +/- 0.344659 at 0.9856 of dataset, 6 iterations, 48468 = 10.268492% rules, cutoff = 1
// ALMOST BEST Greek 90.414875+/- 0.385254 at 0.9856 of dataset, 5 iterations, 120691.4 = 22.303999% rules, cutoff = 0
// BEST Danish 92.796387 +/- 0.214267 at 0.9856 of dataset, 5 iterations, 67807 = 12.429587% rules, cutoff = 0
// ALMOST BEST Russian 80.484806 +/- 0.409391 at 0.9856 of dataset, 6 iterations, 54630 = 14.022614% rules, cutoff = 1
// BEST Polish 93.880103 +/- 0.077021 at 0.9856 of dataset, 2 iterations, 344944.5 = 10.165818% rules, cutoff = 0
//AMBI:
// French ok 84.993631 ambi1 1.388535 ambi2 1.085987 diff 12.531847 rules 7318.375000 2.724738% cutoff 2
/*
0 0.985600 2 3490123.000000 0.000000 415069.500000 0.000000 11.892690 0.000000 0.000000 0.000000 0.000000 0.000000 45811.000000 299.500000 168.000000 0.000000 4714.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.044374 0.026347 0.033280 0.000000 0.015253 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 89.837821 0.587336 0.329457 0.000000 9.245387 1.282529 0.000000 0.000000 0.000000 0.000000 0.000000 nan 1.013865 6.709744 0.268664 92.007727 0.116994 0.038499
1 0.985600 2 3490123.000000 0.000000 198203.500000 0.000000 5.678983 0.000000 0.000000 0.000000 0.000000 0.000000 46176.500000 283.000000 241.000000 0.000000 4292.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.092907 0.063787 0.016640 0.000000 0.012480 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 90.554586 0.554978 0.472614 0.000000 8.417822 1.257035 0.000000 0.000000 0.000000 0.000000 0.000000 nan 0.784421 6.505795 0.472614 92.237170 0.231508 0.067725
2 0.985600 2 3490123.000000 0.000000 57342.000000 0.000000 1.642979 0.000000 0.000000 0.000000 0.000000 0.000000 46504.500000 192.000000 245.000000 0.000000 4051.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.134507 0.030507 0.044374 0.000000 0.059627 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 91.197811 0.376522 0.480458 0.000000 7.945208 0.999157 0.000000 0.000000 0.000000 0.000000 0.000000 nan 0.543212 6.522464 0.455945 92.478379 0.295613 0.065337
3 0.985600 2 3490123.000000 0.000000 34167.000000 0.000000 0.978963 0.000000 0.000000 0.000000 0.000000 0.000000 46470.500000 178.500000 210.000000 0.000000 4134.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.051307 0.029120 0.030507 0.000000 0.008320 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 91.131136 0.350048 0.411821 0.000000 8.106995 0.890318 0.000000 0.000000 0.000000 0.000000 0.000000 nan 0.498108 6.586198 0.392211 92.523484 0.282486 0.056203
4 0.985600 2 3490123.000000 0.000000 24896.500000 0.000000 0.713342 0.000000 0.000000 0.000000 0.000000 0.000000 46392.500000 166.500000 180.000000 0.000000 4254.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.059627 0.029120 0.044374 0.000000 0.013867 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 90.978173 0.326515 0.352990 0.000000 8.342321 0.809915 0.000000 0.000000 0.000000 0.000000 0.000000 nan 0.455945 6.624439 0.353970 92.565646 0.279628 0.050724
5 0.985600 2 3490123.000000 0.000000 19778.500000 0.000000 0.566699 0.000000 0.000000 0.000000 0.000000 0.000000 46335.500000 151.500000 180.500000 0.000000 4325.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.065174 0.006933 0.009707 0.000000 0.048534 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 90.866393 0.297100 0.353970 0.000000 8.482537 0.759908 0.000000 0.000000 0.000000 0.000000 0.000000 nan 0.429471 6.647971 0.330438 92.592120 0.277824 0.047351
New algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 2
trainlines 3490123.000000
rules 57342.000000
rules% 1.642979
same%stdev 0.134507
ambi1%stdev 0.030507
ambi2%stdev 0.044374
ambi3%stdev 0.000000
diff%stdev 0.059627
same% 91.197811
ambi1% 0.376522
ambi2% 0.480458
ambi3% 0.000000
diff% 7.945208
amb.rules% 0.999157
false_amb% 0.543212
false_not_amb% 6.522464
true_amb% 0.455945
true_not_amb% 92.478379
precision 0.295613
recall 0.065337
Comment: If comparing by diff%, comp_peen is marginally worse than best_pl
(best_pl uses automatically computed parameters).
If compared by same%, comp_peen is 0.3% better. Reason: best_pl has many more
ambiguous rules and generates more false ambiguous results, but also more true
ambiguous results.
*/
int A1 = 3*(a->W__R + a->R__R) + a->R__NA;
// 3wr + 3rr + rn - r = 3wr + 2rr - rw
int B1 = 3*(b->W__R + b->R__R) + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
//wr + rr + rn - r = wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__NA + a->R__NA;
// wr + wn + rn - w = -ww + rn
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
static int comp_sugar(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
// Slovene 86.273367 +/- 0.410931 at 0.9856 of dataset, 9 iterations, 17254.777778 = 8.819297% rules (cutoff = 1)
// BEST English 89.060606 +/- 1.320829 at 0.9856 of dataset, 3 iterations, 1318.266667 = 1.752199% rules, cutoff=2
// Icelandic 70.925172 +/- 1.858255 at 0.9856 of dataset, 17 iterations, 27151.294118 = 47.351402% rules, cutoff = 0
// Dutch 90.172822 +/- 0.307911 at 0.9856 of dataset, 7 iterations, 57761.142857 = 19.411791% rules, cutoff = 0
// BEST Greek 90.422464 +/- 0.437009 at 0.9856 of dataset, 5 iterations, 132765.6 = 24.535334% rules, cutoff = 0
// BEST German 91.461918 +/- 0.167574 at 0.9856 of dataset, 7 iterations, 50986 = 16.405554% rules, cutoff = 0
// BEST Swedish 92.265969 +/- 0.277289 at 0.9856 of dataset, 6 iterations, 25935.333333 = 5.506008% rules, cutoff = 1
// Norwegian 86.665700 +/- 0.676264 at 0.9856 of dataset, 6 iterations, 46685.5 = 9.890849% rules, cutoff = 1
// Danish 92.585623 +/- 0.171327 at 0.9856 of dataset, 5 iterations, 30422.400000 = 5.576679% rules, cutoff = 1
// BEST Russian 80.815622 +/- 0.450500 at 0.9856 of dataset, 6 iterations, 47079.166667 = 12.084440% rules, cutoff = 1
//AMBI:
// French ok 75.472316 ambi1 4.615600 ambi2 3.493266 diff 16.418818 rules 4162.909091 3.129560% cutoff 2
#if 1
// next lines from affixFEW2
int A1 = a->W__R + 2*a->R__R + a->R__NA; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr - rw + rr
int B1 = b->W__R + 2*b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
// wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
// next lines from peen
int A3 = a->W__R + a->W__NA + a->R__NA;
// -ww + rn
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
#else
//equivalent with:
int AA1 = a->W__R - a->R__W + a->R__R;
int AA2 = - a->R__R;
int AA3 = - a->W__R - 2*a->W__W;//a->R__NA - a->W__W;
int BB1 = b->W__R - b->R__W + b->R__R;
int BB2 = - b->R__R;
int BB3 = - b->W__R - 2*b->W__W;//b->R__NA - b->W__W;
return (AA1>BB1)?-1:(AA1<BB1)?1:(AA2>BB2)?-1:(AA2<BB2)?1:(AA3>BB3)?-1:(AA3<BB3)?1:0;
#endif
}
#endif
static int comp_honey(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
// (OK) Dutch 90.179393 +/- 0.589662 at 0.9856 of dataset, 7 iterations, 73324.571429 = 24.642193% rules, cutoff = 0
// (OK) Norwegian 87.272244 +/- 0.267729 at 0.9856 of dataset, 6 iterations, 141038.666667 = 29.880630% rules, cutoff = 0
// (OK) English 88.315152 +/- 1.097312 at 0.9856 of dataset, 3 iterations, 5285.466667 = 7.025276% rules, cutoff=1
// (OK) Icelandic 70.742665 +/- 1.686147 at 0.9856 of dataset, 17 iterations, 29857.000000 = 52.070108% rules, cutoff = 0
// (?) Slovene 86.273367 +/- 0.410931 at 0.9856 of dataset, 9 iterations, 17254.777778 = 8.819297% rules (cutoff = 1)
// (?) BEST Greek 90.422464 +/- 0.437009 at 0.9856 of dataset, 5 iterations, 132765.6 = 24.535334% rules, cutoff = 0
// (?) BEST German 91.461918 +/- 0.167574 at 0.9856 of dataset, 7 iterations, 50986 = 16.405554% rules, cutoff = 0
// (?) BEST Swedish 92.265969 +/- 0.277289 at 0.9856 of dataset, 6 iterations, 25935.333333 = 5.506008% rules, cutoff = 1
// (?) Danish 92.585623 +/- 0.171327 at 0.9856 of dataset, 5 iterations, 30422.400000 = 5.576679% rules, cutoff = 1
// (?) BEST Russian 80.815622 +/- 0.450500 at 0.9856 of dataset, 6 iterations, 47079.166667 = 12.084440% rules, cutoff = 1
//AMBI:
// French ok 84.477707 ambi1 2.251592 ambi2 1.426752 diff 11.843949 rules 7413.875000 2.760295% cutoff 2
int A1 = a->W__R + 2*a->R__R;
int B1 = b->W__R + 2*b->R__R;
int A2 = a->W__R + a->R__R;
int B2 = b->W__R + b->R__R;
int A3 = a->W__R;
int B3 = b->W__R;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#if _NA
static int comp_beet(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//Icelandic 71.034094 at 0.939 of dataset
//AMBI:
// French ok 85.057325 ambi1 1.283439 ambi2 1.057325 diff 12.601911 rules 7260.375000 2.703144% cutoff 2
int A1 = 2*(a->W__R + a->R__R) + a->R__NA;
int B1 = 2*(b->W__R + b->R__R) + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__NA + a->R__NA;
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
static int comp_koud(const vertex * a,const vertex * b)
{
// German 91.260578 +/- 0.363285 at 0.9856 of dataset, 7 iterations, 30890.714286 = 9.939577% rules, cutoff = 0
//AMBI:
// French 86.356688 ambi1 0.996815 ambi2 0.796178 diff 11.850318 rules 3335.250000 1.241763% cutoff 3 (!)
// French 85.250493 ambi1 2.333057 ambi2 2.161181 diff 10.255268 rules 28520.250000 10.618597% cutoff 0 (!) paradigms+homographs clumped
// French 85.313973 ambi1 2.050694 ambi2 2.289517 diff 10.345816 rules 28509.250000 10.614432% cutoff 0 (!) homographs clumped
// Dutch.clumped.ph 85.789838 ambi1 1.086067 ambi2 1.256060 diff 11.868035 rules 37400.142857 12.190637% cutoff 0 paradigms+homographs clumped
// Dutch.clumped.h 85.818923 ambi1 1.095507 ambi2 1.060476 diff 12.025095 rules 37411.857143 12.192383% cutoff 0 homographs clumped
// (Dutch.clumped.ph suffix, old algo:
// 83.532708 ambi1 1.948624 ambi2 2.719889 ambi3 0.107033 diff 11.691746 rules 73024.571429 23.802477% cutoff 0 paradigms+homographs clumped
// (Dutch.clumped.h suffix, old algo:
// 83.624725 ambi1 1.859813 ambi2 2.611382 ambi3 0.162415 diff 11.741664 rules 72975.428571 23.782417% cutoff 0 paradigms+homographs clumped
// Russian clumped ph 74.983460 ambi1 0.517762 ambi2 0.558033 diff 23.940745 rules 95077.500000 24.184389% cutoff 0 paradigms+homographs clumped
// (old algo:)
// 79.485114 ambi1 0.218611 ambi2 0.342298 ambi3 0.005753 diff 19.948224 rules 94247.166667 23.973181% cutoff 0 paradigms+homographs clumped
// The A1 vs B1 condition is pretty close to what later was found as the
// best factors using automatic factor setting (comp_parms).
// These factors were found by manual optimizing.
int A1 = 6*a->W__R - 5*a->R__W + a->W__W;
int B1 = 6*b->W__R - 5*b->R__W + b->W__W;
int A2 = a->W__R - 6*a->R__W;
int B2 = b->W__R - 6*b->R__W;
int A3 = a->R__R - a->W__W;
int B3 = b->R__R - b->W__W;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
int (*comp)(const vertex * a,const vertex * b) = comp_parms;
// returns b > a ? 1 : b < a ? -1 : 0
// (Chosen like this to let qsort sort in descending order.)
// You can find a local optimum for the parameters by using comp_parms as the
// weight function and setting compute_parms = true. The parameters parms.Matrix[]
// can be seeded with non-zero values by hard coding. The file parms.txt
// will contain the currently best parameters.
// Optimal parameters are those that result in the smallest rule set.
// Hypothesis: small rule sets give (almost) the best lemmatization results.
// Optimizing for the size of the rule set is computationally MUCH cheaper
// than optimizing for accuracy.
// If you have found a good set of parameters (presumably with a small subset
// of the training data), you can hard-code them (as is done below) and run
// the program with the full training data. In that case, set
// compute_parms = false.
//bool compute_parms = false;
struct rotation
{
double Matrix[6];
int ROWPARMS;
// metadata:
const char * Language;
const char * Xparm;
int NumberOfRules;
int CutOffForFewestErrors;
double FractionErroneousPedictionOOV;
bool suffixOnly;
void init(optionStruct * options)
{
ROWPARMS = options->numberOfParms();
for(int i = 0;i < ROWPARMS;++i)
{
Matrix[i] = options->parm(i);
}
}
void better(optionStruct * options)
{
for(int i = 0;i < ROWPARMS;++i)
{
options->setParm(i,Matrix[i]);
}
}
} rotation;
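#if 0 /* Illustration only, not part of the original code. */
// A minimal sketch of the round trip that init() and better() support,
// assuming an optionStruct instance opts and a hypothetical evaluation step:
static void exampleRoundTrip(optionStruct * opts)
{
struct rotation r;
r.init(opts); // copy opts->parm(0 .. numberOfParms()-1) into r.Matrix
// ... perturb r.Matrix and evaluate the resulting rule set ...
r.better(opts); // write an improved row back into the options
}
#endif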
static struct rotation parms =
/* R_R W_R R_W W_W R_NA W_NA */
{{ 0.0, 3.0, -2.0, 1.0, 0.0, 0.0},6,"","",0,0,0.0}
;
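#if 0 /* Illustration only; comp_parms itself is defined elsewhere in this file. */
// A minimal sketch of the shape of a parameterised weight function: score a
// vertex as the dot product of the current parameter row parms.Matrix with
// its counts (column order R__R, W__R, R__W, W__W and, in _NA builds, also
// R__NA and W__NA) and compare the scores descendingly. Names with the
// _sketch suffix are hypothetical.
static double parmScore_sketch(const vertex * v)
{
double counts[4] = { (double)v->R__R, (double)v->W__R,
                     (double)v->R__W, (double)v->W__W };
double score = 0.0;
for(int i = 0;i < 4 && i < parms.ROWPARMS;++i)
    score += parms.Matrix[i] * counts[i];
return score;
}
static int comp_parms_sketch(const vertex * a,const vertex * b)
{
double A = parmScore_sketch(a);
double B = parmScore_sketch(b);
return (A > B) ? -1 : (A < B) ? 1 : 0; // descending, like the comp_* above
}
#endif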
// Scale ROW to unit length (Euclidean norm).
static void normalise(double * ROW)
{
double modulus = 0.0;
for(int i = 0;i < parms.ROWPARMS;++i)
modulus += ROW[i] * ROW[i];
modulus = sqrt(modulus);
for(int i = 0;i < parms.ROWPARMS;++i)
ROW[i] /= modulus;
}
// Inner (dot) product of two parameter rows.
static double inner(double * a, double * b)
{
double ret = 0;
for(int i = 0;i < parms.ROWPARMS;++i)
ret += a[i]*b[i];
return ret;
}
// Scale a parameter row by the factor f.
static void times(double * a, double f)
{
for(int i = 0;i < parms.ROWPARMS;++i)
a[i] *= f;
}
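#if 0 /* Illustration only, not part of the original code. */
// A minimal sketch of how inner(), times() and normalise() combine into a
// Gram-Schmidt step: turn a randomly filled row into a unit-length search
// direction orthogonal to the (already normalised) current parameter row.
static void orthogonalDirection(double * current, double * direction)
{
double overlap = inner(direction,current); // component along current
double projection[6];
for(int i = 0;i < parms.ROWPARMS;++i)
    projection[i] = current[i];
times(projection,overlap); // projection of direction onto current
for(int i = 0;i < parms.ROWPARMS;++i)
    direction[i] -= projection[i]; // remove that component
normalise(direction); // rescale to unit length
}
#endif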
struct bestParms
{
bool suffixonly;
const char * langbase;
int rowss;
struct rotation val;
// Each row:
// R__R W__R R__W W__W R__NA W__NA
// Generally, good that Wrongs change to Rights (W__R > 0) and that Rights don't change to Wrongs (R__W < 0)
// But what about rules that don't improve lemmatisation? (R__R > 0 or W__W > 0)
// Intuitively difficult to decide!
};
#if 1
static bestParms best_is_suffix =
{
true,
"is",
1,
//iteration:18.1
/*weight (not used): 1.41244386452166131e+05 suffix only: yes */
/* number of nodes: 152108, nodes/line: 1.05629895368709495e-01 weight (not used): 1.41244386452166131e+05 blobs 2809220 lines 2873370 * fraction 5.01187233627272799e-01 = 1440009 lines*/
{{
0.00000000000000000e+00, 6.94542434383270568e-01, -7.18112257666929654e-01, 4.38815704990783637e-02
}}
};
#elif 1
static bestParms best_is_suffix =
{
true,
"is",
1,
/*
0 0.985600 2 2831993.000000 471048.500000 306036.000000 16.633110 10.806383 36063.500000 712.500000 528.500000 83.000000 3989.500000 36948.000000 188.500000 140.500000 0.000000 4100.000000 0.176020 0.158931 0.029052 0.006836 0.052977 0.102536 0.039306 0.042723 0.000000 0.105954 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 89.295986 0.455567 0.339561 0.000000 9.908887 1.377577 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.424149 34.592890 0.953428 64.029533 0.529175 0.026822
1 0.985600 2 2831993.000000 128789.000000 168125.000000 4.547645 5.936632 36520.000000 0.000000 0.000000 0.000000 4857.000000 37099.500000 273.500000 263.000000 0.000000 3741.000000 0.051268 0.000000 0.000000 0.000000 0.051268 0.032470 0.001709 0.047850 0.000000 0.082029 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 89.662131 0.660995 0.635619 0.000000 9.041255 1.598714 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.493028 34.440631 1.105687 63.960654 0.528596 0.031106
2 0.985600 2 2831993.000000 84003.000000 52867.500000 2.966215 1.866795 36089.500000 0.000000 0.000000 0.000000 5287.500000 37338.500000 236.500000 207.000000 0.000000 3595.000000 0.063231 0.000000 0.000000 0.000000 0.063231 0.029052 0.049559 0.037597 0.000000 0.041014 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 90.239747 0.571574 0.500278 0.000000 8.688402 1.203567 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.420523 34.763274 0.783044 64.033159 0.482143 0.022029
3 0.985600 2 2831993.000000 63704.000000 38416.000000 2.249441 1.356501 35649.000000 0.000000 0.000000 0.000000 5728.000000 37204.000000 219.500000 209.500000 0.000000 3744.000000 0.017089 0.000000 0.000000 0.000000 0.017089 0.088865 0.022216 0.025634 0.000000 0.085447 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.914687 0.530488 0.506320 0.000000 9.048505 1.185441 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.422940 34.783817 0.762501 64.030742 0.474080 0.021451
4 0.985600 2 2831993.000000 51623.000000 31338.500000 1.822851 1.106588 35334.000000 0.000000 0.000000 0.000000 6043.000000 37065.500000 224.000000 188.000000 0.000000 3899.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.097409 0.006836 0.023925 0.000000 0.080320 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 89.579960 0.541364 0.454359 0.000000 9.424318 1.152814 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.435024 34.828528 0.717790 64.018658 0.452055 0.020193
5 0.985600 2 2831993.000000 43987.500000 27450.500000 1.553235 0.969300 35034.500000 0.000000 0.000000 0.000000 6342.500000 36936.000000 219.500000 179.000000 0.000000 4042.500000 0.093992 0.000000 0.000000 0.000000 0.093992 0.082029 0.015380 0.003418 0.000000 0.100827 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 89.266984 0.530488 0.432607 0.000000 9.769920 1.125021 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.408439 34.829736 0.716582 64.045243 0.467297 0.020159
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only yes
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 52867.500000 (84003.000000)
rules% 1.866795 (2.966215)
same%stdev 0.029052
ambi1%stdev 0.049559
ambi2%stdev 0.037597
ambi3%stdev 0.000000
diff%stdev 0.041014
same% 90.239747 (87.221162)
ambi1% 0.571574 (0.000000)
ambi2% 0.500278 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 8.688402 (12.778838)
amb.rules% 1.203567 (0.000000)
false_amb% 0.420523 (0.000000)
false_not_amb% 34.763274 (35.546318)
true_amb% 0.783044 (0.000000)
true_not_amb% 64.033159 (64.453682)
precision 0.482143 (0.000000)
recall 0.022029 (0.000000)
bests[11].suffixonly == [true]
bests[11].langbase == [is]
comp = comp_parms0_off
bests[11].rows == [1]
R->R W->R R->W W->W
0.008505 0.669878 -0.738373 0.077434
*/
//iteration:14.-1
/* number of nodes: 88858, nodes/line: 1.20004915909585078e-01 weight: 8.44399637102287234e+04 blobs 1 lines 5881633 * fraction 1.25892541179416839e-01 = 740453 lines*/
{ // # decisions
8.50547688621742723e-03, 6.69877760720549498e-01, -7.38373491250877478e-01, 7.74340362692699236e-02, //1177883
-7.82684292973299223e-01, 4.59948960180274258e-01, 3.92892970820137744e-01, -1.46585691805502294e-01, //0
-6.01147073676968957e-01, -5.82239786374368462e-01, -5.42665925463000409e-01, -7.16430060350067843e-02, //0
-1.61106021629986690e-01, -2.66002045826051053e-02, 7.71582415526826937e-02, 9.83556752135442247e-01 //0
} //(0 unresolved comparisons)
// Same as
//iteration:13.11
/* number of nodes: 74744, nodes/line: 1.42586526923832640e-01 weight: 7.11594670680982090e+04 blobs 1 lines 5881633 * fraction 8.91250938133746201e-02 = 524201 lines*/
};
#endif
static bestParms best_isC0 =
{
false,
"isC0",
1,
/* SINGLE SHOT
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 2 2831993.000000 471048.500000 60440.000000 16.633110 2.134186 36063.500000 712.500000 528.500000 83.000000 3989.500000 36357.500000 343.000000 254.000000 0.000000 4422.500000 0.176020 0.158931 0.029052 0.006836 0.052977 0.227289 0.092283 0.003418 0.000000 0.131588 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 87.868864 0.828963 0.613868 0.000000 10.688305 2.130411 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.775793 34.191701 1.354617 63.677889 0.466112 0.038109
1 0.985600 2 2831993.000000 128789.000000 32445.000000 4.547645 1.145660 36520.000000 0.000000 0.000000 0.000000 4857.000000 36779.500000 343.500000 318.000000 0.000000 3936.000000 0.051268 0.000000 0.000000 0.000000 0.051268 0.278557 0.083738 0.064940 0.000000 0.259758 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 88.888755 0.830171 0.768543 0.000000 9.512531 1.969693 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.679121 34.255746 1.290572 63.774561 0.487226 0.036307
2 0.985600 2 2831993.000000 84003.000000 14192.000000 2.966215 0.501131 36089.500000 0.000000 0.000000 0.000000 5287.500000 36974.500000 310.500000 289.500000 0.000000 3802.500000 0.063231 0.000000 0.000000 0.000000 0.063231 0.295646 0.090574 0.029052 0.000000 0.234124 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 89.360031 0.750417 0.699664 0.000000 9.189888 1.688136 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.607826 34.466008 1.080310 63.845856 0.470526 0.030392
3 0.985600 2 2831993.000000 63704.000000 10130.000000 2.249441 0.357699 35649.000000 0.000000 0.000000 0.000000 5728.000000 36863.000000 286.500000 274.000000 0.000000 3953.500000 0.017089 0.000000 0.000000 0.000000 0.017089 0.283684 0.083738 0.013671 0.000000 0.213617 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.090558 0.692414 0.662204 0.000000 9.554825 1.579380 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.566740 34.533678 1.012640 63.886942 0.471847 0.028488
4 0.985600 2 2831993.000000 51623.000000 8154.000000 1.822851 0.287924 35334.000000 0.000000 0.000000 0.000000 6043.000000 36739.500000 269.500000 257.500000 0.000000 4110.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.275139 0.029052 0.008545 0.000000 0.237542 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 88.792083 0.651328 0.622326 0.000000 9.934263 1.498417 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.547406 34.595307 0.951011 63.906276 0.464855 0.026754
5 0.985600 2 2831993.000000 43987.500000 6904.000000 1.553235 0.243786 35034.500000 0.000000 0.000000 0.000000 6342.500000 36631.500000 255.000000 239.500000 0.000000 4251.000000 0.093992 0.000000 0.000000 0.000000 0.093992 0.305900 0.017089 0.015380 0.000000 0.273430 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.531068 0.616284 0.578824 0.000000 10.273824 1.407787 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.526863 34.665394 0.880924 63.926819 0.455340 0.024782
cutoff 0 Affix a 11.009406 b 0.000000: N(rules)= 60440.000000*N(trainpairs)^0.000000
Suffix a -0.112324 b 0.878094: N(rules)= 0.893755*N(trainpairs)^0.878094
cutoff 1 Affix a 10.387302 b 0.000000: N(rules)= 32445.000000*N(trainpairs)^0.000000
Suffix a -0.733958 b 0.836893: N(rules)= 0.480005*N(trainpairs)^0.836893
cutoff 2 Affix a 9.560434 b 0.000000: N(rules)= 14192.000000*N(trainpairs)^0.000000
Suffix a -1.605792 b 0.868583: N(rules)= 0.200730*N(trainpairs)^0.868583
cutoff 3 Affix a 9.223257 b 0.000000: N(rules)= 10130.000000*N(trainpairs)^0.000000
Suffix a -2.263068 b 0.896039: N(rules)= 0.104031*N(trainpairs)^0.896039
cutoff 4 Affix a 9.006264 b 0.000000: N(rules)= 8154.000000*N(trainpairs)^0.000000
Suffix a -2.871889 b 0.924806: N(rules)= 0.056592*N(trainpairs)^0.924806
cutoff 5 Affix a 8.839856 b 0.000000: N(rules)= 6904.000000*N(trainpairs)^0.000000
Suffix a -3.315163 b 0.944783: N(rules)= 0.036328*N(trainpairs)^0.944783
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 14192.000000 ( 84003.000000)
rules% 0.501131 ( 2.966215)
same%stdev 0.295646
ambi1%stdev 0.090574
ambi2%stdev 0.029052
ambi3%stdev 0.000000
diff%stdev 0.234124
same% 89.360031 ( 87.221162)
ambi1% 0.750417 ( 0.000000)
ambi2% 0.699664 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.189888 ( 12.778838)
amb.rules% 1.688136 ( 0.000000)
false_amb% 0.607826 ( 0.000000)
false_not_amb% 34.466008 ( 35.546318)
true_amb% 1.080310 ( 0.000000)
true_not_amb% 63.845856 ( 64.453682)
precision 0.470526 ( 0.000000)
recall 0.030392 ( 0.000000)
bests[16].suffixonly == [false]
bests[16].langbase == [isC0]
comp = comp_parms0_off
bests[16].rows == [1]
R->R W->R R->W W->W R->NA W->NA
0.247669 0.636764 -0.729558 0.022230 -0.004214 0.020625
*/
/* REDO
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 2 2831993.000000 471048.500000 60440.000000 16.633110 2.134186 36063.500000 712.500000 528.500000 83.000000 3989.500000 38009.500000 563.500000 543.000000 0.000000 2261.000000 0.176020 0.158931 0.029052 0.006836 0.052977 5.419039 0.845924 0.991183 0.000000 7.256146 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 91.861421 1.361868 1.312323 0.000000 5.464388 3.111632 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.374604 32.809290 2.737028 64.079078 0.785095 0.076999
1 0.985600 2 2831993.000000 128789.000000 32445.000000 4.547645 1.145660 36520.000000 0.000000 0.000000 0.000000 4857.000000 37620.000000 466.500000 460.000000 0.000000 2830.500000 0.051268 0.000000 0.000000 0.000000 0.051268 2.594166 0.504136 0.420398 0.000000 3.518701 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 90.920076 1.127438 1.111729 0.000000 6.840757 2.505015 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.546197 33.587500 1.958818 63.907485 0.641980 0.055106
2 0.985600 2 2831993.000000 84003.000000 14192.000000 2.966215 0.501131 36089.500000 0.000000 0.000000 0.000000 5287.500000 37357.500000 328.500000 317.500000 0.000000 3373.500000 0.063231 0.000000 0.000000 0.000000 0.063231 1.013400 0.152095 0.066649 0.000000 1.232143 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 90.285666 0.793919 0.767335 0.000000 8.153080 1.789642 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.541364 34.298040 1.248278 63.912318 0.535511 0.035117
3 0.985600 2 2831993.000000 63704.000000 10130.000000 2.249441 0.357699 35649.000000 0.000000 0.000000 0.000000 5728.000000 37147.500000 295.000000 281.500000 0.000000 3653.000000 0.017089 0.000000 0.000000 0.000000 0.017089 0.688702 0.112790 0.011963 0.000000 0.813454 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.778138 0.712956 0.680330 0.000000 8.828576 1.604756 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.513570 34.455132 1.091186 63.940112 0.515117 0.030698
4 0.985600 2 2831993.000000 51623.000000 8154.000000 1.822851 0.287924 35334.000000 0.000000 0.000000 0.000000 6043.000000 36962.000000 285.000000 261.500000 0.000000 3868.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.485338 0.082029 0.022216 0.000000 0.589583 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 89.329821 0.688788 0.631994 0.000000 9.349397 1.540711 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.525654 34.531261 1.015057 63.928028 0.491228 0.028556
5 0.985600 2 2831993.000000 43987.500000 6904.000000 1.553235 0.243786 35034.500000 0.000000 0.000000 0.000000 6342.500000 36809.500000 273.000000 235.500000 0.000000 4059.000000 0.093992 0.000000 0.000000 0.000000 0.093992 0.302482 0.078611 0.001709 0.000000 0.382802 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.961259 0.659787 0.569157 0.000000 9.809798 1.437997 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.506320 34.614641 0.931677 63.947362 0.479180 0.026210
cutoff 0 Affix a 11.009406 b 0.000000: N(rules)= 60440.000000*N(trainpairs)^0.000000
Suffix a -0.112324 b 0.878094: N(rules)= 0.893755*N(trainpairs)^0.878094
cutoff 1 Affix a 10.387302 b 0.000000: N(rules)= 32445.000000*N(trainpairs)^0.000000
Suffix a -0.733958 b 0.836893: N(rules)= 0.480005*N(trainpairs)^0.836893
cutoff 2 Affix a 9.560434 b 0.000000: N(rules)= 14192.000000*N(trainpairs)^0.000000
Suffix a -1.605792 b 0.868583: N(rules)= 0.200730*N(trainpairs)^0.868583
cutoff 3 Affix a 9.223257 b 0.000000: N(rules)= 10130.000000*N(trainpairs)^0.000000
Suffix a -2.263068 b 0.896039: N(rules)= 0.104031*N(trainpairs)^0.896039
cutoff 4 Affix a 9.006264 b 0.000000: N(rules)= 8154.000000*N(trainpairs)^0.000000
Suffix a -2.871889 b 0.924806: N(rules)= 0.056592*N(trainpairs)^0.924806
cutoff 5 Affix a 8.839856 b 0.000000: N(rules)= 6904.000000*N(trainpairs)^0.000000
Suffix a -3.315163 b 0.944783: N(rules)= 0.036328*N(trainpairs)^0.944783
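(In these fits, a is the natural logarithm of the coefficient: N(rules) = exp(a)*N(trainpairs)^b, e.g. exp(-0.112324) = 0.893755 for suffix cutoff 0.)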
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 0
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 60440.000000 ( 471048.500000)
rules% 2.134186 ( 16.633110)
same%stdev 5.419039
ambi1%stdev 0.845924
ambi2%stdev 0.991183
ambi3%stdev 0.000000
diff%stdev 7.256146
same% 91.861421 ( 87.158325)
ambi1% 1.361868 ( 1.721971)
ambi2% 1.312323 ( 1.277280)
ambi3% 0.000000 ( 0.200595)
diff% 5.464388 ( 9.641830)
amb.rules% 3.111632 ( 3.905551)
false_amb% 0.374604 ( 1.841603)
false_not_amb% 32.809290 ( 33.482369)
true_amb% 2.737028 ( 2.063949)
true_not_amb% 64.079078 ( 62.612079)
precision 0.785095 ( 0.359125)
recall 0.076999 ( 0.058064)
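(The figures above satisfy recall = true_amb/(true_amb + false_not_amb) = 2.737028/35.546318 = 0.076999 and precision = true_amb/(true_amb + 2*false_amb) = 2.737028/3.486236 = 0.785095; each false ambiguity apparently counts twice in the precision.)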
bests[16].suffixonly == [false]
bests[16].langbase == [isC0]
comp = comp_parms0_off
bests[16].rows == [1]
R->R W->R R->W W->W
0.247669 0.636764 -0.729558 0.022230 -0.004214 0.020625
*/
//iteration:20.6
/*weight ( used): 1.05436295090904787e+04 suffix only: no */
/* number of nodes: 336797, nodes/line: inf weight ( used): 1.05436295090904787e+04 blobs 1 lines 0 * fraction 1.00000000000000000e+00 = 0 lines*/
{{
2.47669087481595079e-01, 6.36764047976876468e-01, -7.29557569755324042e-01, 2.22303428808458027e-02, -4.21447897318842114e-03, 2.06245665463890698e-02
}}
};
#if 1
static bestParms best_is =
{
false,
"is",
1,
//iteration:18.2
/*weight (not used): 1.34340843669173279e+05 suffix only: no */
/* number of nodes: 145852, nodes/line: 1.01285478076873131e-01 weight (not used): 1.34340843669173279e+05 blobs 2809220 lines 2873370 * fraction 5.01187233627272799e-01 = 1440009 lines*/
{{
0.00000000000000000e+00, 6.96451349087997107e-01, -7.13849249589145862e-01, 7.33128038921041919e-02
}}
};
#elif 1
static bestParms best_is =
{
false,
"is",
1,
//iteration:12.0
/*weight (not used): 3.09072013007138085e+04 suffix only: no */
/* number of nodes: 32528, nodes/line: 1.79481661728272457e-01 weight (not used): 3.09072013007138085e+04 blobs 2809220 lines 2873370 * fraction 6.30957344480193721e-02 = 181233 lines*/
{{
0.00000000000000000e+00, 7.04722964067779345e-01, -7.05629153519126029e-01, 7.38447128737378389e-02
}}
/*
0 0.985600 2 2831993.000000 471048.500000 279488.000000 16.633110 9.868951 36063.500000 712.500000 528.500000 83.000000 3989.500000 36331.500000 392.000000 307.500000 0.000000 4346.000000 0.176020 0.158931 0.029052 0.006836 0.052977 0.312735 0.047850 0.025634 0.000000 0.239251 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 87.806028 0.947386 0.743166 0.000000 10.503420 2.399884 3.385939 3.707374 0.519612 92.387075 0.071263 0.122927 1.883897 3.710999 0.515987 93.889117 0.120451 0.122070
cutoff 0 Affix a 1.518217 b 0.742218
cutoff 0 Suffix a -0.112324 b 0.878094
1 0.985600 2 2831993.000000 128789.000000 145060.000000 4.547645 5.122188 36520.000000 0.000000 0.000000 0.000000 4857.000000 36701.000000 424.000000 408.000000 0.000000 3844.000000 0.051268 0.000000 0.000000 0.000000 0.051268 0.218744 0.092283 0.068357 0.000000 0.194819 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 88.699036 1.024724 0.986055 0.000000 9.290185 2.352756 0.000000 4.226986 0.000000 95.773014 0.000000 0.000000 1.678469 3.552698 0.674288 94.094545 0.167266 0.159520
cutoff 1 Affix a 1.181955 b 0.721542
cutoff 1 Suffix a -0.733958 b 0.836893
2 0.985600 2 2831993.000000 84003.000000 55547.000000 2.966215 1.961410 36089.500000 0.000000 0.000000 0.000000 5287.500000 36932.500000 351.500000 319.500000 0.000000 3773.500000 0.063231 0.000000 0.000000 0.000000 0.063231 0.124752 0.008545 0.052977 0.000000 0.169185 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 89.258525 0.849506 0.772168 0.000000 9.119801 1.813810 0.000000 4.226986 0.000000 95.773014 0.000000 0.000000 1.340116 3.753293 0.473693 94.432898 0.150192 0.112064
cutoff 2 Affix a 0.232484 b 0.725173
cutoff 2 Suffix a -1.605792 b 0.868583
3 0.985600 2 2831993.000000 63704.000000 38924.500000 2.249441 1.374456 35649.000000 0.000000 0.000000 0.000000 5728.000000 36852.000000 316.500000 290.000000 0.000000 3918.500000 0.017089 0.000000 0.000000 0.000000 0.017089 0.143551 0.008545 0.068357 0.000000 0.203363 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.063973 0.764918 0.700872 0.000000 9.470237 1.657926 0.000000 4.226986 0.000000 95.773014 0.000000 0.000000 1.236194 3.805254 0.421732 94.536820 0.145720 0.099771
cutoff 3 Affix a -0.574206 b 0.758604
cutoff 3 Suffix a -2.263068 b 0.896039
4 0.985600 2 2831993.000000 51623.000000 31706.000000 1.822851 1.119565 35334.000000 0.000000 0.000000 0.000000 6043.000000 36747.500000 303.000000 251.000000 0.000000 4075.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.152095 0.017089 0.044432 0.000000 0.179438 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 88.811417 0.732291 0.606617 0.000000 9.849675 1.534669 0.000000 4.226986 0.000000 95.773014 0.000000 0.000000 1.155231 3.847548 0.379438 94.617783 0.141060 0.089766
cutoff 4 Affix a -1.201505 b 0.788680
cutoff 4 Suffix a -2.871889 b 0.924806
5 0.985600 2 2831993.000000 43987.500000 27360.000000 1.553235 0.966104 35034.500000 0.000000 0.000000 0.000000 6342.500000 36633.500000 293.500000 228.500000 0.000000 4221.500000 0.093992 0.000000 0.000000 0.000000 0.093992 0.073484 0.049559 0.052977 0.000000 0.176020 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.535902 0.709331 0.552239 0.000000 10.202528 1.451289 0.000000 4.226986 0.000000 95.773014 0.000000 0.000000 1.096020 3.871716 0.355270 94.676994 0.139469 0.084048
cutoff 5 Affix a -1.734091 b 0.815366
cutoff 5 Suffix a -3.315163 b 0.944783
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 55547.000000 (84003.000000)
rules% 1.961410 (2.966215)
same%stdev 0.124752
ambi1%stdev 0.008545
ambi2%stdev 0.052977
ambi3%stdev 0.000000
diff%stdev 0.169185
same% 89.258525 (87.221162)
ambi1% 0.849506 (0.000000)
ambi2% 0.772168 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 9.119801 (12.778838)
amb.rules% 1.813810 (0.000000)
false_amb% 1.340116 (0.000000)
false_not_amb% 3.753293 (4.226986)
true_amb% 0.473693 (0.000000)
true_not_amb% 94.432898 (95.773014)
precision 0.150192 (0.000000)
recall 0.112064 (0.000000)
bests[10].suffixonly == [false]
bests[10].langbase == [is]
comp = comp_parms0_off
bests[10].rows == [1]
R->R W->R R->W W->W
0.000000 0.704723 -0.705629 0.073845
*/
};
#elif 1
static bestParms best_is =
{
false,
"is",
1,
//iteration:15.-1
/* number of nodes: 111593, nodes/line: 1.06693832594907057e-01 weight: 1.05561353388409800e+05 blobs 1 lines 5881633 * fraction 1.77827941003892431e-01 = 1045918 lines*/
{ // # decisions
5.66585282075018903e-03, 5.64699254907257475e-01, -8.21269355775872678e-01, 8.12360442321313353e-02, //4842840
-3.53836789873742896e-01, 6.35502799370625149e-01, 3.77862909934449431e-01, -5.72848443674078389e-01, //0
2.85433068754089581e-01, -3.75433738261612693e-01, -3.36787965276463597e-01, -8.14954807263245429e-01, //0
-8.90671312833910434e-01, -3.69188910551492833e-01, -2.63268176968452838e-01, -3.30760913081912244e-02 //0
} //(0 unresolved comparisons)
/*
0 0.985600 2 2831993.000000 471048.500000 289516.500000 16.633110 10.223066 36063.500000 712.500000 528.500000 83.000000 3989.500000 36264.000000 362.000000 305.500000 0.000000 4445.500000 0.176020 0.158931 0.029052 0.006836 0.052977 0.276848 0.061522 0.029052 0.000000 0.244378 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 87.642893 0.874882 0.738333 0.000000 10.743892 2.341881 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.964304 34.168741 1.377577 63.489378 0.416667 0.038754
1 0.985600 2 2831993.000000 128789.000000 148030.500000 4.547645 5.227079 36520.000000 0.000000 0.000000 0.000000 4857.000000 36661.500000 382.500000 410.500000 0.000000 3922.500000 0.051268 0.000000 0.000000 0.000000 0.051268 0.220453 0.015380 0.073484 0.000000 0.278557 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 88.603572 0.924427 0.992097 0.000000 9.479904 2.309254 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.856756 34.093820 1.452498 63.596926 0.458779 0.040862
2 0.985600 2 2831993.000000 84003.000000 57418.500000 2.966215 2.027494 36089.500000 0.000000 0.000000 0.000000 5287.500000 36885.000000 307.500000 310.500000 0.000000 3874.000000 0.063231 0.000000 0.000000 0.000000 0.063231 0.140133 0.022216 0.059813 0.000000 0.177729 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 89.143727 0.743166 0.750417 0.000000 9.362689 1.737680 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.694830 34.503468 1.042850 63.758852 0.428713 0.029338
3 0.985600 2 2831993.000000 63704.000000 40149.000000 2.249441 1.417694 35649.000000 0.000000 0.000000 0.000000 5728.000000 36788.000000 286.000000 271.500000 0.000000 4031.500000 0.017089 0.000000 0.000000 0.000000 0.017089 0.256341 0.058104 0.008545 0.000000 0.206781 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 88.909297 0.691205 0.656162 0.000000 9.743336 1.590255 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.631994 34.588056 0.958262 63.821688 0.431213 0.026958
4 0.985600 2 2831993.000000 51623.000000 32297.000000 1.822851 1.140434 35334.000000 0.000000 0.000000 0.000000 6043.000000 36652.500000 283.000000 234.000000 0.000000 4207.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.251214 0.061522 0.006836 0.000000 0.182856 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 88.581821 0.683955 0.565532 0.000000 10.168693 1.494792 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.600575 34.652101 0.894217 63.853107 0.426759 0.025156
5 0.985600 2 2831993.000000 43987.500000 27785.500000 1.553235 0.981129 35034.500000 0.000000 0.000000 0.000000 6342.500000 36539.000000 272.500000 218.500000 0.000000 4347.000000 0.093992 0.000000 0.000000 0.000000 0.093992 0.215326 0.035888 0.005127 0.000000 0.174312 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.307514 0.658578 0.528071 0.000000 10.505837 1.436789 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.592116 34.701646 0.844672 63.861566 0.416319 0.023763
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 57418.500000 (84003.000000)
rules% 2.027494 (2.966215)
same%stdev 0.140133
ambi1%stdev 0.022216
ambi2%stdev 0.059813
ambi3%stdev 0.000000
diff%stdev 0.177729
same% 89.143727 (87.221162)
ambi1% 0.743166 (0.000000)
ambi2% 0.750417 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 9.362689 (12.778838)
amb.rules% 1.737680 (0.000000)
false_amb% 0.694830 (0.000000)
false_not_amb% 34.503468 (35.546318)
true_amb% 1.042850 (0.000000)
true_not_amb% 63.758852 (64.453682)
precision 0.428713 (0.000000)
recall 0.029338 (0.000000)
bests[10].suffixonly == [false]
bests[10].langbase == [is]
comp = comp_parms0_off
bests[10].rows == [1]
R->R W->R R->W W->W
0.005666 0.564699 -0.821269 0.081236
*/
};
#elif 1
static bestParms best_is =
{
false,
"is",
4,
//iteration:12.-1
/* number of nodes: 60151, nodes/line: 0.162086 weight: 57935.959065 blobs 1 lines 5881633 * fraction 0.063096 = 371105 lines*/
{ // # decisions
0.013188, 0.712282, -0.695044, 0.096930, //2283425
-0.235599, 0.552619, 0.471857, -0.645334, //43635
0.170415, -0.386280, -0.498239, -0.757301, //2353
-0.956702, -0.195077, -0.214532, 0.025362 //0
} //(0 unresolved comparisons)
// Same parameters since:
//iteration:1.41
/* number of nodes: 3149, nodes/line: 0.379032 weight: 3077.123342 blobs 1 lines 5881633 * fraction 0.001413 = 8308 lines*/
/*
{ // # decisions
0.013188, 0.712282, -0.695044, 0.096930, //100450
-0.235599, 0.552619, 0.471857, -0.645334, //2918
0.170415, -0.386280, -0.498239, -0.757301, //124
-0.956702, -0.195077, -0.214532, 0.025362 //0
} //(0 unresolved comparisons)
*/
};
#endif
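/* A minimal sketch, not part of the build, of how the parameter rows above
   are used. Each row holds one weight per outcome count of a candidate rule
   (four in the logged header R->R W->R R->W W->W; some entries carry six
   weights per row). Candidates are ranked by the weighted sum of their
   counts; rows after the first (cf. the "# decisions" counts above) only
   break ties left by the preceding rows. The function name and the exact
   loop are assumptions, not the original comp implementation. */
#if 0
static int compByParmRows(const double (*parms)[4],int rows,
                          const int countsA[4],const int countsB[4])
{
    int row;
    for(row = 0;row < rows;++row)
    {
        double a = 0.0,b = 0.0;
        int k;
        for(k = 0;k < 4;++k)
        {
            a += parms[row][k] * countsA[k];
            b += parms[row][k] * countsB[k];
        }
        if(a > b)
            return -1; /* A is the better rule */
        if(a < b)
            return 1;  /* B is the better rule */
        /* tie: try the next row of weights */
    }
    return 0; /* unresolved comparison */
}
#endif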
static bestParms best_nlC0 = // Dutch, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*rcount*exp(-2.0*rcount/1.0)
// Aiming at cutoff == 0, because the maximum penalty is for rules with 1 word/lemma pair (a sketch follows this entry).
{
false,
"nlC0",
1,
/* Figures for SLOW 0
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 51919.000000 23.833299 16.920104 3635.857143 78.000000 117.000000 4.714286 648.428571 3797.285714 47.285714 28.857143 0.000000 610.571429 0.368578 0.192276 0.216604 0.027956 0.556177 0.613962 0.144366 0.137131 0.000000 0.612630 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 84.685230 1.054543 0.643558 0.000000 13.616669 2.182363 4.610042 5.428826 0.372754 89.588378 0.038858 0.064250 1.815981 5.435198 0.366382 92.382439 0.091633 0.063152
1 0.985600 7 306848.000000 18391.714286 24506.428571 5.993754 7.986504 3743.714286 0.000000 0.000000 0.000000 740.285714 3857.285714 44.714286 38.571429 0.000000 543.428571 0.341564 0.000000 0.000000 0.000000 0.341564 0.260942 0.167244 0.162285 0.000000 0.313508 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 86.023321 0.997196 0.860201 0.000000 12.119281 2.121830 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.535619 5.215369 0.586211 92.662801 0.160279 0.101043
2 0.985600 7 306848.000000 11355.142857 8512.857143 3.700576 2.774291 3696.857143 0.000000 0.000000 0.000000 787.142857 3886.285714 36.571429 31.000000 0.000000 530.142857 0.396739 0.000000 0.000000 0.000000 0.396739 0.408533 0.152814 0.077255 0.000000 0.357852 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 86.670065 0.815598 0.691347 0.000000 11.822990 1.710845 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.268001 5.358736 0.442844 92.930419 0.148663 0.076332
3 0.985600 7 306848.000000 8112.285714 5634.571429 2.643747 1.836274 3652.571429 0.000000 0.000000 0.000000 831.428571 3872.285714 33.000000 27.000000 0.000000 551.714286 0.255206 0.000000 0.000000 0.000000 0.255206 0.329961 0.112985 0.082445 0.000000 0.356293 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 86.357844 0.735950 0.602141 0.000000 12.304065 1.532433 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.156493 5.425640 0.375940 93.041927 0.139810 0.064800
4 0.985600 7 306848.000000 6239.285714 4347.571429 2.033347 1.416849 3615.000000 0.000000 0.000000 0.000000 869.000000 3850.571429 35.714286 22.571429 0.000000 575.142857 0.273440 0.000000 0.000000 0.000000 0.273440 0.311652 0.095241 0.028377 0.000000 0.290613 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 85.873582 0.796483 0.503377 0.000000 12.826558 1.491016 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.156493 5.467057 0.334523 93.041927 0.126354 0.057661
5 0.985600 7 306848.000000 5044.285714 3567.571429 1.643904 1.162651 3592.714286 0.000000 0.000000 0.000000 891.285714 3839.428571 28.428571 20.857143 0.000000 595.285714 0.262209 0.000000 0.000000 0.000000 0.262209 0.328738 0.117908 0.037381 0.000000 0.347817 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 85.625080 0.634000 0.465146 0.000000 13.275774 1.325347 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.022684 5.498917 0.302663 93.175736 0.128901 0.052169
cutoff 0 Affix a 0.534867 b 0.817364
Suffix a -0.706998 b 0.933881
cutoff 1 Affix a 0.011389 b 0.800182
Suffix a -2.482111 b 0.967669
cutoff 2 Affix a -1.095794 b 0.809484
Suffix a -3.356905 b 0.996574
cutoff 3 Affix a -1.997422 b 0.849935
Suffix a -3.794027 b 1.003571
cutoff 4 Affix a -2.581689 b 0.876345
Suffix a -4.020226 b 1.000475
cutoff 5 Affix a -2.983694 b 0.892699
Suffix a -4.205819 b 0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 8512.857143 (11355.142857)
rules% 2.774291 (3.700576)
same%stdev 0.408533
ambi1%stdev 0.152814
ambi2%stdev 0.077255
ambi3%stdev 0.000000
diff%stdev 0.357852
same% 86.670065 (82.445521)
ambi1% 0.815598 (0.000000)
ambi2% 0.691347 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 11.822990 (17.554479)
amb.rules% 1.710845 (0.000000)
false_amb% 1.268001 (0.000000)
false_not_amb% 5.358736 (5.801580)
true_amb% 0.442844 (0.000000)
true_not_amb% 92.930419 (94.198420)
precision 0.148663 (0.000000)
recall 0.076332 (0.000000)
bests[11].suffixonly == [false]
bests[11].langbase == [nlC0]
comp = comp_parms0_off
bests[11].rows == [1]
R->R W->R R->W W->W
0.280881 0.707997 -0.646540 0.002056 0.042723 0.001589
*/
/* Figures for SLOW 1
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 45134.714286 23.833299 14.709144 3635.857143 78.000000 117.000000 4.714286 648.428571 3830.714286 108.142857 442.714286 0.000000 102.428571 0.368578 0.192276 0.216604 0.027956 0.556177 1.713095 1.518002 2.627062 0.000000 2.779675 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 85.430738 2.411750 9.873200 0.000000 2.284312 12.648146 4.610042 5.428826 0.372754 89.588378 0.038858 0.064250 8.780426 1.933860 3.867720 85.417994 0.180494 0.666667
1 0.985600 7 306848.000000 18391.714286 21055.714286 5.993754 6.861936 3743.714286 0.000000 0.000000 0.000000 740.285714 3788.857143 93.857143 342.571429 0.000000 258.714286 0.341564 0.000000 0.000000 0.000000 0.341564 0.875026 0.607954 1.583138 0.000000 1.711304 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 84.497260 2.093157 7.639862 0.000000 5.769721 10.198165 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 7.795973 3.399388 2.402192 86.402447 0.133499 0.414058
2 0.985600 7 306848.000000 11355.142857 8544.571429 3.700576 2.784627 3696.857143 0.000000 0.000000 0.000000 787.142857 3744.428571 67.571429 268.428571 0.000000 403.571429 0.396739 0.000000 0.000000 0.000000 0.396739 0.467952 0.257791 0.758127 0.000000 0.783612 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 83.506436 1.506945 5.986364 0.000000 9.000255 8.270677 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 7.114184 4.645087 1.156493 87.084236 0.075171 0.199341
3 0.985600 7 306848.000000 8112.285714 5744.714286 2.643747 1.872169 3652.571429 0.000000 0.000000 0.000000 831.428571 3696.571429 62.714286 263.000000 0.000000 461.714286 0.255206 0.000000 0.000000 0.000000 0.255206 0.324679 0.181961 0.539402 0.000000 0.600171 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 82.439149 1.398624 5.865299 0.000000 10.296929 8.066777 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 7.050465 4.785268 1.016312 87.147955 0.067229 0.175178
4 0.985600 7 306848.000000 6239.285714 4401.142857 2.033347 1.434307 3615.000000 0.000000 0.000000 0.000000 869.000000 3655.142857 64.428571 270.000000 0.000000 494.428571 0.273440 0.000000 0.000000 0.000000 0.273440 0.327619 0.260351 0.471155 0.000000 0.436422 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 81.515229 1.436855 6.021409 0.000000 11.026507 8.471390 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 7.423219 4.753409 1.048171 86.775201 0.065945 0.180670
5 0.985600 7 306848.000000 5044.285714 3640.000000 1.643904 1.186255 3592.714286 0.000000 0.000000 0.000000 891.285714 3620.428571 61.285714 278.714286 0.000000 523.571429 0.262209 0.000000 0.000000 0.000000 0.262209 0.257148 0.212241 0.442619 0.000000 0.381709 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 80.741048 1.366764 6.215751 0.000000 11.676437 8.640245 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 7.620747 4.782082 1.019498 86.577673 0.062696 0.175728
cutoff 0 Affix a 0.798721 b 0.789038: N(rules)= 2.222696*N(trainpairs)^0.789038
Suffix a -0.706998 b 0.933881: N(rules)= 0.493123*N(trainpairs)^0.933881
cutoff 1 Affix a 0.293562 b 0.770051: N(rules)= 1.341196*N(trainpairs)^0.770051
Suffix a -2.482111 b 0.967669: N(rules)= 0.083567*N(trainpairs)^0.967669
cutoff 2 Affix a -1.114487 b 0.811816: N(rules)= 0.328084*N(trainpairs)^0.811816
Suffix a -3.356905 b 0.996574: N(rules)= 0.034843*N(trainpairs)^0.996574
cutoff 3 Affix a -2.040671 b 0.854564: N(rules)= 0.129941*N(trainpairs)^0.854564
Suffix a -3.794027 b 1.003571: N(rules)= 0.022505*N(trainpairs)^1.003571
cutoff 4 Affix a -2.607640 b 0.879272: N(rules)= 0.073708*N(trainpairs)^0.879272
Suffix a -4.020226 b 1.000475: N(rules)= 0.017949*N(trainpairs)^1.000475
cutoff 5 Affix a -3.025073 b 0.897062: N(rules)= 0.048554*N(trainpairs)^0.897062
Suffix a -4.205819 b 0.997875: N(rules)= 0.014909*N(trainpairs)^0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 0
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 45134.714286 ( 73132.000000)
rules% 14.709144 ( 23.833299)
same%stdev 1.713095
ambi1%stdev 1.518002
ambi2%stdev 2.627062
ambi3%stdev 0.000000
diff%stdev 2.779675
same% 85.430738 ( 81.085128)
ambi1% 2.411750 ( 1.739518)
ambi2% 9.873200 ( 2.609277)
ambi3% 0.000000 ( 0.105136)
diff% 2.284312 ( 14.460940)
amb.rules% 12.648146 ( 4.982796)
false_amb% 8.780426 ( 4.610042)
false_not_amb% 1.933860 ( 5.428826)
true_amb% 3.867720 ( 0.372754)
true_not_amb% 85.417994 ( 89.588378)
precision 0.180494 ( 0.038858)
recall 0.666667 ( 0.064250)
bests[11].suffixonly == [false]
bests[11].langbase == [nlC0]
comp = comp_parms0_off
bests[11].rows == [1]
R->R W->R R->W W->W
0.280881 0.707997 -0.646540 0.002056 0.042723 0.001589
*/
//iteration:20.2
/*weight ( used): 1.44449673909528588e+03 suffix only: no */
/* number of nodes: 45281, nodes/line: 1.34238713139272547e-01 weight ( used): 1.44449673909528588e+03 blobs 1 lines 337317 * fraction 1.00000000000000000e+00 = 337317 lines*/
{{ // These were computed using dict_nl_without_doubles_UTF8, a file that still contains doublets when looking only at word and lemma
2.80881366831663093e-01, 7.07996695016342659e-01, -6.46540271422676893e-01, 2.05573578451381843e-03, 4.27231086744624400e-02, 1.58901761416129893e-03
}}
};
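/* A minimal sketch, not part of the build, of why the weight function above
   puts its maximum penalty on rules with one word/lemma pair: the derivative
   of w(r) = r*r*exp(-2.0*r/c) vanishes at r == c, so c == 1 peaks at one
   pair, and the nlC1 and nlC2 variants below (c == 2, c == 3) peak at two
   and three pairs. The helper name is hypothetical. */
#if 0
#include <math.h>

static double pairPenalty(double rcount,double c)
{
    /* rcount: number of word/lemma pairs a rule accounts for;
       c: denominator scale, which is also the location of the maximum. */
    return rcount * rcount * exp(-2.0 * rcount / c);
}
/* With c == 1: pairPenalty(1.0,1.0) == exp(-2.0) ~ 0.135 is the maximum;
   pairPenalty(0.5,1.0) ~ 0.092 and pairPenalty(2.0,1.0) ~ 0.073 are lower. */
#endif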
static bestParms best_nlC1 = // Dutch, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*rcount*exp(-2.0*rcount/2.0)
// Aiming at cutoff == 1, because the maximum penalty is for rules with 2 word/lemma pairs (cf. the sketch after best_nlC0).
{
false,
"nlC1",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 46062.571429 23.833299 15.011527 3635.857143 78.000000 117.000000 4.714286 648.428571 3755.857143 64.000000 40.142857 0.000000 624.000000 0.368578 0.192276 0.216604 0.027956 0.556177 0.444807 0.183452 0.135916 0.000000 0.461737 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 83.761310 1.427297 0.895247 0.000000 13.916146 2.905569 4.610042 5.428826 0.372754 89.588378 0.038858 0.064250 2.532815 5.428826 0.372754 91.665605 0.068541 0.064250
1 0.985600 7 306848.000000 18391.714286 22782.285714 5.993754 7.424616 3743.714286 0.000000 0.000000 0.000000 740.285714 3821.428571 69.285714 47.714286 0.000000 545.571429 0.341564 0.000000 0.000000 0.000000 0.341564 0.277609 0.142051 0.110548 0.000000 0.298889 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 85.223652 1.545177 1.064101 0.000000 12.167070 3.080795 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 2.312986 5.033771 0.767809 91.885434 0.142351 0.132345
2 0.985600 7 306848.000000 11355.142857 8143.714286 3.700576 2.653990 3696.857143 0.000000 0.000000 0.000000 787.142857 3857.285714 57.857143 34.285714 0.000000 534.571429 0.396739 0.000000 0.000000 0.000000 0.396739 0.298017 0.105729 0.157018 0.000000 0.306015 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 86.023321 1.290302 0.764623 0.000000 11.921754 2.363961 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.892443 5.330062 0.471518 92.305977 0.110778 0.081274
3 0.985600 7 306848.000000 8112.285714 5475.714286 2.643747 1.784504 3652.571429 0.000000 0.000000 0.000000 831.428571 3845.714286 50.571429 30.857143 0.000000 556.857143 0.255206 0.000000 0.000000 0.000000 0.255206 0.205091 0.174112 0.113299 0.000000 0.281926 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 85.765261 1.127820 0.688161 0.000000 12.418759 2.121830 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.701287 5.381037 0.420543 92.497133 0.110000 0.072488
4 0.985600 7 306848.000000 6239.285714 4244.428571 2.033347 1.383235 3615.000000 0.000000 0.000000 0.000000 869.000000 3826.142857 51.714286 28.285714 0.000000 577.857143 0.273440 0.000000 0.000000 0.000000 0.273440 0.224655 0.192584 0.151100 0.000000 0.308597 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 85.328788 1.153307 0.630814 0.000000 12.887091 2.096343 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.723589 5.428826 0.372754 92.474831 0.097581 0.064250
5 0.985600 7 306848.000000 5044.285714 3439.571429 1.643904 1.120937 3592.714286 0.000000 0.000000 0.000000 891.285714 3807.571429 49.571429 27.428571 0.000000 599.428571 0.262209 0.000000 0.000000 0.000000 0.262209 0.228471 0.106065 0.100445 0.000000 0.321600 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 84.914617 1.105518 0.611699 0.000000 13.368166 2.042182 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.720403 5.479801 0.321779 92.478017 0.085521 0.055464
cutoff 0 Affix a 0.384392 b 0.819628
Suffix a -0.706998 b 0.933881
cutoff 1 Affix a -0.001749 b 0.794660
Suffix a -2.482111 b 0.967669
cutoff 2 Affix a -1.188392 b 0.811560
Suffix a -3.356905 b 0.996574
cutoff 3 Affix a -2.070862 b 0.850851
Suffix a -3.794027 b 1.003571
cutoff 4 Affix a -2.610700 b 0.874822
Suffix a -4.020226 b 1.000475
cutoff 5 Affix a -2.980595 b 0.887978
Suffix a -4.205819 b 0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 8143.714286 (11355.142857)
rules% 2.653990 (3.700576)
same%stdev 0.298017
ambi1%stdev 0.105729
ambi2%stdev 0.157018
ambi3%stdev 0.000000
diff%stdev 0.306015
same% 86.023321 (82.445521)
ambi1% 1.290302 (0.000000)
ambi2% 0.764623 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 11.921754 (17.554479)
amb.rules% 2.363961 (0.000000)
false_amb% 1.892443 (0.000000)
false_not_amb% 5.330062 (5.801580)
true_amb% 0.471518 (0.000000)
true_not_amb% 92.305977 (94.198420)
precision 0.110778 (0.000000)
recall 0.081274 (0.000000)
bests[12].suffixonly == [false]
bests[12].langbase == [nlC1]
comp = comp_parms0_off
bests[12].rows == [1]
R->R W->R R->W W->W
0.069568 0.656948 -0.726409 0.186749 -0.004976 0.031780
*/
//iteration:20.3
/*weight ( used): 7.72334598212476067e+03 suffix only: no */
/* number of nodes: 40017, nodes/line: 1.28534811712255725e-01 weight ( used): 7.72334598212476067e+03 blobs 1 lines 311332 * fraction 1.00000000000000000e+00 = 311332 lines*/
{{
6.95681858201713382e-02, 6.56948240807808115e-01, -7.26408565910014570e-01, 1.86748935639343000e-01, -4.97642864640904150e-03, 3.17795844729270860e-02
}}
};
static bestParms best_nlC2 = // Dutch, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*rcount*exp(-2.0*rcount/3.0)
// Aiming at cutoff == 2, because the maximum penalty is for rules with 3 word/lemma pairs (cf. the sketch after best_nlC0).
{
false,
"nlC2",
1,
/* Figures for SLOW 0
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 43253.857143 23.833299 14.096183 3635.857143 78.000000 117.000000 4.714286 648.428571 3766.428571 54.714286 38.142857 0.000000 624.714286 0.368578 0.192276 0.216604 0.027956 0.556177 0.521089 0.122634 0.117606 0.000000 0.496464 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 83.997069 1.220212 0.850644 0.000000 13.932076 2.606091 4.610042 5.428826 0.372754 89.588378 0.038858 0.064250 2.233338 5.428826 0.372754 91.965082 0.077024 0.064250
1 0.985600 7 306848.000000 18391.714286 21760.000000 5.993754 7.091459 3743.714286 0.000000 0.000000 0.000000 740.285714 3831.857143 55.428571 47.714286 0.000000 549.000000 0.341564 0.000000 0.000000 0.000000 0.341564 0.357620 0.186079 0.099498 0.000000 0.409401 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 85.456225 1.236141 1.064101 0.000000 12.243533 2.669810 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.902001 5.033771 0.767809 92.296419 0.167944 0.132345
2 0.985600 7 306848.000000 11355.142857 8030.142857 3.700576 2.616977 3696.857143 0.000000 0.000000 0.000000 787.142857 3868.428571 43.142857 32.285714 0.000000 540.142857 0.396739 0.000000 0.000000 0.000000 0.396739 0.345939 0.117606 0.131758 0.000000 0.327113 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 86.271824 0.962151 0.720020 0.000000 12.046005 1.876513 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.465528 5.390595 0.410985 92.732892 0.122974 0.070840
3 0.985600 7 306848.000000 8112.285714 5347.000000 2.643747 1.742557 3652.571429 0.000000 0.000000 0.000000 831.428571 3857.571429 37.428571 28.571429 0.000000 560.428571 0.255206 0.000000 0.000000 0.000000 0.255206 0.209546 0.163810 0.088138 0.000000 0.257791 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 86.029693 0.834714 0.637186 0.000000 12.498407 1.663056 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.293488 5.432012 0.369568 92.904932 0.125000 0.063701
4 0.985600 7 306848.000000 6239.285714 4134.000000 2.033347 1.347247 3615.000000 0.000000 0.000000 0.000000 869.000000 3836.285714 36.142857 26.714286 0.000000 584.857143 0.273440 0.000000 0.000000 0.000000 0.273440 0.171646 0.125780 0.106733 0.000000 0.262435 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 85.554989 0.806041 0.595769 0.000000 13.043201 1.615267 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.277558 5.463872 0.337709 92.920861 0.116740 0.058210
5 0.985600 7 306848.000000 5044.285714 3407.285714 1.643904 1.110415 3592.714286 0.000000 0.000000 0.000000 891.285714 3818.142857 33.714286 24.428571 0.000000 607.714286 0.262209 0.000000 0.000000 0.000000 0.262209 0.201303 0.120589 0.088138 0.000000 0.283769 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 85.150376 0.751880 0.544794 0.000000 13.552950 1.516503 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 1.217026 5.502103 0.299478 92.981394 0.109557 0.051620
cutoff 0 Affix a 0.416960 b 0.814680
Suffix a -0.706998 b 0.933881
cutoff 1 Affix a 0.031775 b 0.790236
Suffix a -2.482111 b 0.967669
cutoff 2 Affix a -1.153585 b 0.808514
Suffix a -3.356905 b 0.996574
cutoff 3 Affix a -2.028061 b 0.846547
Suffix a -3.794027 b 1.003571
cutoff 4 Affix a -2.571438 b 0.870756
Suffix a -4.020226 b 1.000475
cutoff 5 Affix a -2.961484 b 0.885966
Suffix a -4.205819 b 0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 8030.142857 (11355.142857)
rules% 2.616977 (3.700576)
same%stdev 0.345939
ambi1%stdev 0.117606
ambi2%stdev 0.131758
ambi3%stdev 0.000000
diff%stdev 0.327113
same% 86.271824 (82.445521)
ambi1% 0.962151 (0.000000)
ambi2% 0.720020 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.046005 (17.554479)
amb.rules% 1.876513 (0.000000)
false_amb% 1.465528 (0.000000)
false_not_amb% 5.390595 (5.801580)
true_amb% 0.410985 (0.000000)
true_not_amb% 92.732892 (94.198420)
precision 0.122974 (0.000000)
recall 0.070840 (0.000000)
bests[13].suffixonly == [false]
bests[13].langbase == [nlC2]
comp = comp_parms0_off
bests[13].rows == [1]
R->R W->R R->W W->W
0.052945 0.603264 -0.766378 0.211174 0.017177 0.032318
*/
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 38529.714286 23.833299 12.556612 3635.857143 78.000000 117.000000 4.714286 648.428571 3281.428571 0.000000 0.000000 0.000000 1202.571429 0.368578 0.192276 0.216604 0.027956 0.556177 0.528513 0.000000 0.000000 0.000000 0.528513 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 73.180833 0.000000 0.000000 0.000000 26.819167 0.000000 4.610042 5.428826 0.372754 89.588378 0.038858 0.064250 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
1 0.985600 7 306848.000000 18391.714286 18986.142857 5.993754 6.187475 3743.714286 0.000000 0.000000 0.000000 740.285714 3283.428571 0.000000 0.000000 0.000000 1200.571429 0.341564 0.000000 0.000000 0.000000 0.341564 0.617750 0.000000 0.000000 0.000000 0.617750 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 73.225436 0.000000 0.000000 0.000000 26.774564 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
2 0.985600 7 306848.000000 11355.142857 7903.142857 3.700576 2.575589 3696.857143 0.000000 0.000000 0.000000 787.142857 3218.000000 0.000000 0.000000 0.000000 1266.000000 0.396739 0.000000 0.000000 0.000000 0.396739 0.605573 0.000000 0.000000 0.000000 0.605573 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 71.766280 0.000000 0.000000 0.000000 28.233720 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
3 0.985600 7 306848.000000 8112.285714 5368.714286 2.643747 1.749633 3652.571429 0.000000 0.000000 0.000000 831.428571 3149.428571 0.000000 0.000000 0.000000 1334.571429 0.255206 0.000000 0.000000 0.000000 0.255206 0.594222 0.000000 0.000000 0.000000 0.594222 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 70.237033 0.000000 0.000000 0.000000 29.762967 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
4 0.985600 7 306848.000000 6239.285714 4172.571429 2.033347 1.359817 3615.000000 0.000000 0.000000 0.000000 869.000000 3087.428571 0.000000 0.000000 0.000000 1396.571429 0.273440 0.000000 0.000000 0.000000 0.273440 0.489086 0.000000 0.000000 0.000000 0.489086 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 68.854339 0.000000 0.000000 0.000000 31.145661 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
5 0.985600 7 306848.000000 5044.285714 3460.571429 1.643904 1.127780 3592.714286 0.000000 0.000000 0.000000 891.285714 3048.142857 0.000000 0.000000 0.000000 1435.857143 0.262209 0.000000 0.000000 0.000000 0.262209 0.504696 0.000000 0.000000 0.000000 0.504696 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 67.978208 0.000000 0.000000 0.000000 32.021792 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000 0.000000 5.801580 0.000000 94.198420 0.000000 0.000000
cutoff 0 Affix a 0.661338 b 0.786985: N(rules)= 1.937383*N(trainpairs)^0.786985
Suffix a -0.706998 b 0.933881: N(rules)= 0.493123*N(trainpairs)^0.933881
cutoff 1 Affix a 0.321587 b 0.758378: N(rules)= 1.379315*N(trainpairs)^0.758378
Suffix a -2.482111 b 0.967669: N(rules)= 0.083567*N(trainpairs)^0.967669
cutoff 2 Affix a -1.091314 b 0.802674: N(rules)= 0.335775*N(trainpairs)^0.802674
Suffix a -3.356905 b 0.996574: N(rules)= 0.034843*N(trainpairs)^0.996574
cutoff 3 Affix a -2.002932 b 0.844252: N(rules)= 0.134939*N(trainpairs)^0.844252
Suffix a -3.794027 b 1.003571: N(rules)= 0.022505*N(trainpairs)^1.003571
cutoff 4 Affix a -2.546591 b 0.868571: N(rules)= 0.078348*N(trainpairs)^0.868571
Suffix a -4.020226 b 1.000475: N(rules)= 0.017949*N(trainpairs)^1.000475
cutoff 5 Affix a -2.964140 b 0.886552: N(rules)= 0.051605*N(trainpairs)^0.886552
Suffix a -4.205819 b 0.997875: N(rules)= 0.014909*N(trainpairs)^0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes (SLOW 1)
cutoff 1
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 18986.142857 ( 18391.714286)
rules% 6.187475 ( 5.993754)
same%stdev 0.617750
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.617750
same% 73.225436 ( 83.490506)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 26.774564 ( 16.509494)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 5.801580 ( 5.801580)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 94.198420 ( 94.198420)
precision 0.000000 ( 0.000000)
recall 0.000000 ( 0.000000)
bests[13].suffixonly == [false]
bests[13].langbase == [nlC2]
comp = comp_parms0_off
bests[13].rows == [1]
R->R W->R R->W W->W
0.052945 0.603264 -0.766378 0.211174 0.017177 0.032318
*/
//iteration:20.1
/*weight ( used): 1.74855637296820641e+04 suffix only: no */
/* number of nodes: 37996, nodes/line: 1.22043349222052344e-01 weight ( used): 1.74855637296820641e+04 blobs 1 lines 311332 * fraction 1.00000000000000000e+00 = 311332 lines*/
{{
5.29451965787332626e-02, 6.03263920561431743e-01, -7.66378180909070839e-01, 2.11173926307903825e-01, 1.71771467887680100e-02, 3.23179611128934577e-02
}}
};
static bestParms best_daC0 =
{
false,
"daC0",
1,
//iteration:20.3
/*weight ( used): 1.75942185857700269e+03 suffix only: no */
/* number of nodes: 59989, nodes/line: inf weight ( used): 1.75942185857700269e+03 blobs 1 lines 0 * fraction 1.00000000000000000e+00 = 0 lines*/
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 5 555065.000000 81861.400000 65106.200000 14.748075 11.729473 6963.600000 61.000000 134.400000 6.400000 944.400000 7061.200000 39.200000 28.400000 0.000000 981.000000 6.489145 0.130178 0.211105 0.014057 6.794025 6.658768 0.038417 0.018684 0.000000 6.656800 85.866483 0.752176 1.657254 0.078917 11.645170 2.924856 87.069965 0.483366 0.350194 0.000000 12.096476 1.141828 2.525340 7.714124 0.399517 89.361020 0.073303 0.049240 0.855755 7.827567 0.286074 91.030605 0.143210 0.035258
1 0.985600 5 555065.000000 25122.800000 32643.000000 4.526101 5.880933 7021.000000 0.000000 0.000000 0.000000 1088.800000 7092.400000 38.000000 39.800000 0.000000 939.600000 6.473833 0.000000 0.000000 0.000000 6.473833 6.727077 0.077506 0.052589 0.000000 6.732606 86.574268 0.000000 0.000000 0.000000 13.425732 0.000000 87.454684 0.468569 0.490764 0.000000 11.585982 1.181287 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.791635 7.723988 0.389652 91.094725 0.197500 0.048024
2 0.985600 5 555065.000000 16595.200000 13356.000000 2.989776 2.406205 6952.200000 0.000000 0.000000 0.000000 1157.600000 7105.800000 37.600000 36.400000 0.000000 930.000000 6.380172 0.000000 0.000000 0.000000 6.380172 6.751026 0.046619 0.109794 0.000000 6.819038 85.725912 0.000000 0.000000 0.000000 14.274088 0.000000 87.619917 0.463637 0.448840 0.000000 11.467607 1.053047 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.695455 7.756048 0.357592 91.190905 0.204513 0.044073
3 0.985600 5 555065.000000 12638.600000 9456.800000 2.276959 1.703728 6886.600000 0.000000 0.000000 0.000000 1223.200000 7076.000000 36.400000 30.400000 0.000000 967.000000 6.294505 0.000000 0.000000 0.000000 6.294505 6.669820 0.057565 0.117165 0.000000 6.738610 84.917014 0.000000 0.000000 0.000000 15.082986 0.000000 87.252460 0.448840 0.374855 0.000000 11.923845 0.956867 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.638733 7.795507 0.318134 91.247626 0.199382 0.039210
4 0.985600 5 555065.000000 10397.800000 7534.400000 1.873258 1.357391 6834.600000 0.000000 0.000000 0.000000 1275.200000 7049.200000 34.600000 26.800000 0.000000 999.200000 6.317295 0.000000 0.000000 0.000000 6.317295 6.733371 0.056237 0.063712 0.000000 6.736659 84.275814 0.000000 0.000000 0.000000 15.724186 0.000000 86.921996 0.426644 0.330464 0.000000 12.320896 0.892747 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.594343 7.815236 0.298404 91.292017 0.200663 0.036778
5 0.985600 5 555065.000000 8830.000000 6346.200000 1.590805 1.143326 6782.200000 0.000000 0.000000 0.000000 1327.600000 7010.000000 31.800000 26.200000 0.000000 1041.800000 6.337696 0.000000 0.000000 0.000000 6.337696 6.787495 0.081038 0.057433 0.000000 6.836868 83.629683 0.000000 0.000000 0.000000 16.370317 0.000000 86.438630 0.392118 0.323066 0.000000 12.846186 0.853289 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.586944 7.847296 0.266344 91.299416 0.184932 0.032827
cutoff 0 Affix a 1.073154 b 0.757703: N(rules)= 2.924588*N(trainpairs)^0.757703
Suffix a -0.095489 b 0.854699: N(rules)= 0.908928*N(trainpairs)^0.854699
cutoff 1 Affix a -0.254162 b 0.813340: N(rules)= 0.775566*N(trainpairs)^0.813340
Suffix a -1.274755 b 0.858995: N(rules)= 0.279499*N(trainpairs)^0.858995
cutoff 2 Affix a -1.775778 b 0.868243: N(rules)= 0.169352*N(trainpairs)^0.868243
Suffix a -2.211908 b 0.899654: N(rules)= 0.109492*N(trainpairs)^0.899654
cutoff 3 Affix a -2.803938 b 0.922054: N(rules)= 0.060571*N(trainpairs)^0.922054
Suffix a -2.901611 b 0.932627: N(rules)= 0.054935*N(trainpairs)^0.932627
cutoff 4 Affix a -3.366096 b 0.946465: N(rules)= 0.034524*N(trainpairs)^0.946465
Suffix a -3.423357 b 0.957609: N(rules)= 0.032603*N(trainpairs)^0.957609
cutoff 5 Affix a -3.846574 b 0.971006: N(rules)= 0.021353*N(trainpairs)^0.971006
Suffix a -3.811529 b 0.975132: N(rules)= 0.022114*N(trainpairs)^0.975132
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 2
fraction 9856.000000
iterations 5
trainlines 555065.000000
rules 13356.000000 ( 16595.200000)
rules% 2.406205 ( 2.989776)
same%stdev 6.751026
ambi1%stdev 0.046619
ambi2%stdev 0.109794
ambi3%stdev 0.000000
diff%stdev 6.819038
same% 87.619917 ( 85.725912)
ambi1% 0.463637 ( 0.000000)
ambi2% 0.448840 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 11.467607 ( 14.274088)
amb.rules% 1.053047 ( 0.000000)
false_amb% 0.695455 ( 0.000000)
false_not_amb% 7.756048 ( 8.113640)
true_amb% 0.357592 ( 0.000000)
true_not_amb% 91.190905 ( 91.886360)
precision 0.204513 ( 0.000000)
recall 0.044073 ( 0.000000)
bests[1].suffixonly == [false]
bests[1].langbase == [daC0]
comp = comp_parms0_off
bests[1].rows == [1]
R->R W->R R->W W->W
0.337801 0.729767 -0.592110 0.028323 0.043129 0.008601
*/
{{
3.37800842415481084e-01, 7.29766766436709680e-01, -5.92110476598071700e-01, 2.83227179535092063e-02, 4.31287955142629492e-02, 8.60067531762726684e-03
}}
};
static bestParms best_daC1 =
{
false,
"daC1",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 5 555065.000000 81861.400000 60375.000000 14.748075 10.877104 6963.600000 61.000000 134.400000 6.400000 944.400000 7025.800000 38.400000 24.400000 0.000000 1021.200000 6.489145 0.130178 0.211105 0.014057 6.794025 6.654557 0.059535 0.048222 0.000000 6.654642 85.866483 0.752176 1.657254 0.078917 11.645170 2.924856 86.633456 0.473501 0.300871 0.000000 12.592172 1.107302 2.525340 7.714124 0.399517 89.361020 0.073303 0.049240 0.816296 7.822634 0.291006 91.070063 0.151282 0.035866
1 0.985600 5 555065.000000 25122.800000 32443.200000 4.526101 5.844937 7021.000000 0.000000 0.000000 0.000000 1088.800000 7072.000000 33.000000 31.000000 0.000000 973.800000 6.473833 0.000000 0.000000 0.000000 6.473833 6.710209 0.048542 0.046133 0.000000 6.724254 86.574268 0.000000 0.000000 0.000000 13.425732 0.000000 87.203137 0.406915 0.382254 0.000000 12.007694 1.011122 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.685590 7.788108 0.325532 91.200769 0.191860 0.040122
2 0.985600 5 555065.000000 16595.200000 14189.000000 2.989776 2.556277 6952.200000 0.000000 0.000000 0.000000 1157.600000 7093.000000 33.200000 24.800000 0.000000 958.800000 6.380172 0.000000 0.000000 0.000000 6.380172 6.759828 0.066623 0.061275 0.000000 6.808558 85.725912 0.000000 0.000000 0.000000 14.274088 0.000000 87.462083 0.409381 0.305803 0.000000 11.822733 0.865619 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.601741 7.849762 0.263878 91.284619 0.179832 0.032523
3 0.985600 5 555065.000000 12638.600000 10146.400000 2.276959 1.827966 6886.600000 0.000000 0.000000 0.000000 1223.200000 7067.200000 27.200000 19.400000 0.000000 996.000000 6.294505 0.000000 0.000000 0.000000 6.294505 6.663662 0.048064 0.039564 0.000000 6.730579 84.917014 0.000000 0.000000 0.000000 15.082986 0.000000 87.143949 0.335397 0.239217 0.000000 12.281437 0.727515 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.532689 7.918814 0.194826 91.353671 0.154599 0.024012
4 0.985600 5 555065.000000 10397.800000 8153.400000 1.873258 1.468909 6834.600000 0.000000 0.000000 0.000000 1275.200000 7042.600000 23.000000 17.800000 0.000000 1026.400000 6.317295 0.000000 0.000000 0.000000 6.317295 6.694804 0.048550 0.048851 0.000000 6.721411 84.275814 0.000000 0.000000 0.000000 15.724186 0.000000 86.840613 0.283607 0.219488 0.000000 12.656292 0.675726 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.495697 7.933611 0.180029 91.390663 0.153684 0.022188
5 0.985600 5 555065.000000 8830.000000 6905.400000 1.590805 1.244071 6782.200000 0.000000 0.000000 0.000000 1327.600000 7003.200000 20.600000 15.400000 0.000000 1070.600000 6.337696 0.000000 0.000000 0.000000 6.337696 6.733081 0.043238 0.020628 0.000000 6.768454 83.629683 0.000000 0.000000 0.000000 16.370317 0.000000 86.354781 0.254014 0.189894 0.000000 13.201312 0.611606 0.000000 8.113640 0.000000 91.886360 0.000000 0.000000 0.463637 7.965671 0.147969 91.422723 0.137615 0.018237
cutoff 0 Affix a 1.036474 b 0.756051: N(rules)= 2.819260*N(trainpairs)^0.756051
Suffix a -0.095489 b 0.854699: N(rules)= 0.908928*N(trainpairs)^0.854699
cutoff 1 Affix a 0.279046 b 0.768751: N(rules)= 1.321868*N(trainpairs)^0.768751
Suffix a -1.274755 b 0.858995: N(rules)= 0.279499*N(trainpairs)^0.858995
cutoff 2 Affix a -1.140317 b 0.820294: N(rules)= 0.319718*N(trainpairs)^0.820294
Suffix a -2.211908 b 0.899654: N(rules)= 0.109492*N(trainpairs)^0.899654
cutoff 3 Affix a -2.173315 b 0.875709: N(rules)= 0.113800*N(trainpairs)^0.875709
Suffix a -2.901611 b 0.932627: N(rules)= 0.054935*N(trainpairs)^0.932627
cutoff 4 Affix a -2.860454 b 0.911966: N(rules)= 0.057243*N(trainpairs)^0.911966
Suffix a -3.423357 b 0.957609: N(rules)= 0.032603*N(trainpairs)^0.957609
cutoff 5 Affix a -3.347758 b 0.936701: N(rules)= 0.035163*N(trainpairs)^0.936701
Suffix a -3.811529 b 0.975132: N(rules)= 0.022114*N(trainpairs)^0.975132
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 2
fraction 9856.000000
iterations 5
trainlines 555065.000000
rules 14189.000000 ( 16595.200000)
rules% 2.556277 ( 2.989776)
same%stdev 6.759828
ambi1%stdev 0.066623
ambi2%stdev 0.061275
ambi3%stdev 0.000000
diff%stdev 6.808558
same% 87.462083 ( 85.725912)
ambi1% 0.409381 ( 0.000000)
ambi2% 0.305803 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 11.822733 ( 14.274088)
amb.rules% 0.865619 ( 0.000000)
false_amb% 0.601741 ( 0.000000)
false_not_amb% 7.849762 ( 8.113640)
true_amb% 0.263878 ( 0.000000)
true_not_amb% 91.284619 ( 91.886360)
precision 0.179832 ( 0.000000)
recall 0.032523 ( 0.000000)
bests[2].suffixonly == [false]
bests[2].langbase == [daC1]
comp = comp_parms0_off
bests[2].rows == [1]
R->R W->R R->W W->W
0.138401 0.770176 -0.594523 0.183745 -0.001895 0.021206
*/
//iteration:20.1
/*weight ( used): 1.10455190216752599e+04 suffix only: no */
/* number of nodes: 59917, nodes/line: 1.00941233155262197e-01 weight ( used): 1.10455190216752599e+04 blobs 1 lines 593583 * fraction 1.00000000000000000e+00 = 593583 lines*/
{{
1.38401137522408346e-01, 7.70176363116090945e-01, -5.94523323442289975e-01, 1.83744971823973091e-01, -1.89503087444323529e-03, 2.12062938841620120e-02
}}
};
static bestParms best_da3 =
{
false,
"da3",
1,
//iteration:20.5
/*weight (not used): 5.00470146882884801e+04 suffix only: no */
/* number of nodes: 54059, nodes/line: 9.10723521394649072e-02 weight (not used): 5.00470146882884801e+04 blobs 1 lines 593583 * fraction 1.00000000000000000e+00 = 593583 lines*/
{{ // LOWEST fraction 0.001
0.00000000000000000e+00, 7.35136672243838163e-01, -6.17008258102692664e-01, 2.80846724309429585e-01
}}
};
static bestParms best_en4 = // English, ambiguous training pairs in training set derived from CELEX
{
false,
"en4",
1,
//iteration:20.5
/*weight (not used): 1.02754092132184778e+04 suffix only: no */
/* number of nodes: 10801, nodes/line: 1.37949091280636538e-01 weight (not used): 1.02754092132184778e+04 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
1.06322242479528462e-03, 7.78077313282915517e-01, -5.98834680747542203e-01, 1.89714494033809300e-01
}}
};
static bestParms best_en4_suffix = // English, ambiguous training pairs in training set derived from CELEX
{
true,
"en4",
1,
//iteration:20.5
/*weight (not used): 1.23994408913745046e+04 suffix only: yes */
/* number of nodes: 13281, nodes/line: 1.69623357216751591e-01 weight (not used): 1.23994408913745046e+04 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
7.47138752010872206e-03, 7.20070629992721201e-01, -6.91457516803205441e-01, 5.76972152426369414e-02
}}
};
static bestParms best_en4W = // English, ambiguous training pairs in training set derived from CELEX
{
false,
"en4W",
1,
/*
0 0.985600 14 77169.000000 14451.285714 11095.214286 18.726802 14.377813 935.285714 5.428571 15.500000 0.000000 171.785714 934.928571 14.428571 7.500000 0.000000 171.142857 1.122684 0.142160 0.476458 0.000000 1.182352 1.080444 0.338409 0.207908 0.000000 0.959778 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 82.883739 1.279129 0.664894 0.000000 15.172239 2.121327 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 2.070669 4.939210 0.050659 92.939463 0.012085 0.010152
1 0.985600 14 77169.000000 2563.357143 5913.000000 3.321745 7.662403 975.428571 0.000000 0.000000 0.000000 152.571429 955.357143 14.357143 10.428571 0.000000 147.857143 1.082261 0.000000 0.000000 0.000000 1.082261 1.168686 0.270705 0.213544 0.000000 1.031431 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 84.694782 1.272796 0.924519 0.000000 13.107903 2.368288 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.652736 4.274316 0.715552 93.357396 0.177953 0.143401
2 0.985600 14 77169.000000 1454.357143 1465.785714 1.884639 1.899449 971.785714 0.000000 0.000000 0.000000 156.214286 979.500000 6.285714 3.785714 0.000000 138.428571 1.146528 0.000000 0.000000 0.000000 1.146528 0.938049 0.244469 0.130933 0.000000 0.918487 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.835106 0.557244 0.335613 0.000000 12.272036 1.006839 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.797872 4.780902 0.208967 94.212259 0.115789 0.041878
3 0.985600 14 77169.000000 1004.857143 766.642857 1.302151 0.993460 965.928571 0.000000 0.000000 0.000000 162.071429 980.000000 4.714286 3.357143 0.000000 139.928571 1.079324 0.000000 0.000000 0.000000 1.079324 0.901403 0.194938 0.195491 0.000000 0.937864 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 86.879433 0.417933 0.297619 0.000000 12.405015 0.861196 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.690223 4.818896 0.170973 94.319909 0.110204 0.034264
4 0.985600 14 77169.000000 782.785714 538.428571 1.014378 0.697727 962.642857 0.000000 0.000000 0.000000 165.357143 979.357143 3.428571 2.857143 0.000000 142.357143 0.977102 0.000000 0.000000 0.000000 0.977102 0.893052 0.146351 0.134030 0.000000 0.896431 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.822442 0.303951 0.253293 0.000000 12.620314 0.677558 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.557244 4.869554 0.120314 94.452888 0.097436 0.024112
5 0.985600 14 77169.000000 653.928571 398.500000 0.847398 0.516399 960.142857 0.000000 0.000000 0.000000 167.857143 978.285714 2.214286 2.285714 0.000000 145.214286 1.007113 0.000000 0.000000 0.000000 1.007113 0.798576 0.126232 0.073176 0.000000 0.758786 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.727457 0.196302 0.202634 0.000000 12.873607 0.481256 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.367275 4.875887 0.113982 94.642857 0.134328 0.022843
cutoff 0 Affix a -0.766864 b 0.896467
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.202355 b 0.878363
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -2.151304 b 0.835420
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -2.479762 b 0.808472
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -2.713938 b 0.798032
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -2.807226 b 0.783347
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1465.785714 (1454.357143)
rules% 1.899449 (1.884639)
same%stdev 0.938049
ambi1%stdev 0.244469
ambi2%stdev 0.130933
ambi3%stdev 0.000000
diff%stdev 0.918487
same% 86.835106 (86.151216)
ambi1% 0.557244 (0.000000)
ambi2% 0.335613 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.272036 (13.848784)
amb.rules% 1.006839 (0.000000)
false_amb% 0.797872 (0.000000)
false_not_amb% 4.780902 (4.989868)
true_amb% 0.208967 (0.000000)
true_not_amb% 94.212259 (95.010132)
precision 0.115789 (0.000000)
recall 0.041878 (0.000000)
bests[5].suffixonly == [false]
bests[5].langbase == [en4W]
comp = comp_parms0_off
bests[5].rows == [1]
R->R W->R R->W W->W
0.004029 0.680122 -0.724874 0.109431
*/
//iteration:20.9
/*weight ( used): 1.13023631391451454e+04 suffix only: no */
/* number of nodes: 9850, nodes/line: 1.25803032044650487e-01 weight ( used): 1.13023631391451454e+04 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
4.02884374087577289e-03, 6.80122085729639836e-01, -7.24874245486841984e-01, 1.09430549440079072e-01
}}
};
/*
For comparison: comp_sugar, trained on the ambiguous training set instead of the disambiguated set from the ACL paper:
0 0.985600 14 77169.000000 14451.285714 18381.357143 18.726802 23.819613 935.285714 5.428571 15.500000 0.000000 171.785714 943.428571 10.071429 4.285714 0.000000 170.214286 1.122684 0.142160 0.476458 0.000000 1.182352 0.951464 0.215657 0.106772 0.000000 0.918793 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.637285 0.892857 0.379939 0.000000 15.089919 1.443769 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.405775 4.951874 0.037994 93.604357 0.013333 0.007614
1 0.985600 14 77169.000000 2563.357143 7041.000000 3.321745 9.124130 975.428571 0.000000 0.000000 0.000000 152.571429 966.928571 7.500000 6.785714 0.000000 146.785714 1.082261 0.000000 0.000000 0.000000 1.082261 0.890922 0.284093 0.241358 0.000000 0.767500 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.720618 0.664894 0.601570 0.000000 13.012918 1.494428 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.038501 4.533941 0.455927 93.971631 0.180000 0.091371
2 0.985600 14 77169.000000 1454.357143 1365.714286 1.884639 1.769771 971.785714 0.000000 0.000000 0.000000 156.214286 980.357143 3.642857 4.428571 0.000000 139.571429 1.146528 0.000000 0.000000 0.000000 1.146528 0.871120 0.204558 0.165723 0.000000 0.792714 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.911094 0.322948 0.392604 0.000000 12.373354 0.861196 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.658561 4.787234 0.202634 94.351570 0.133333 0.040609
3 0.985600 14 77169.000000 1004.857143 716.642857 1.302151 0.928667 965.928571 0.000000 0.000000 0.000000 162.071429 979.714286 2.071429 3.285714 0.000000 142.928571 1.079324 0.000000 0.000000 0.000000 1.079324 0.778644 0.101165 0.122587 0.000000 0.766937 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 86.854103 0.183637 0.291287 0.000000 12.670973 0.639564 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.500253 4.850557 0.139311 94.509878 0.122222 0.027919
4 0.985600 14 77169.000000 782.785714 455.928571 1.014378 0.590818 962.642857 0.000000 0.000000 0.000000 165.357143 977.857143 2.142857 2.571429 0.000000 145.428571 0.977102 0.000000 0.000000 0.000000 0.977102 0.701383 0.119736 0.128772 0.000000 0.692585 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.689463 0.189970 0.227964 0.000000 12.892604 0.538247 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.436930 4.888551 0.101317 94.573202 0.103896 0.020305
5 0.985600 14 77169.000000 653.928571 342.071429 0.847398 0.443276 960.142857 0.000000 0.000000 0.000000 167.857143 976.500000 1.571429 2.357143 0.000000 147.571429 1.007113 0.000000 0.000000 0.000000 1.007113 0.704730 0.102648 0.118467 0.000000 0.708976 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.569149 0.139311 0.208967 0.000000 13.082573 0.455927 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.373607 4.907548 0.082320 94.636525 0.099237 0.016497
cutoff 0 Affix a -0.301365 b 0.902523
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.463083 b 0.912868
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -1.744098 b 0.787651
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -1.561877 b 0.715037
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -1.408205 b 0.667043
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -1.279623 b 0.632865
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1365.714286 (1454.357143)
rules% 1.769771 (1.884639)
same%stdev 0.871120
ambi1%stdev 0.204558
ambi2%stdev 0.165723
ambi3%stdev 0.000000
diff%stdev 0.792714
same% 86.911094 (86.151216)
ambi1% 0.322948 (0.000000)
ambi2% 0.392604 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.373354 (13.848784)
amb.rules% 0.861196 (0.000000)
false_amb% 0.658561 (0.000000)
false_not_amb% 4.787234 (4.989868)
true_amb% 0.202634 (0.000000)
true_not_amb% 94.351570 (95.010132)
precision 0.133333 (0.000000)
recall 0.040609 (0.000000)
*/
static bestParms best_en6W = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) 5.0*(rcount-1.0)*exp(-0.9*rcount)+1
{
false, /* suffixonly */
"en6W", /* langbase; better than comp_sugar 20140915 */
1, /* rows */
/*
0 0.985600 14 77169.000000 14451.285714 11093.357143 18.726802 14.375406 935.285714 5.428571 15.500000 0.000000 171.785714 941.428571 12.785714 6.714286 0.000000 167.071429 1.122684 0.142160 0.476458 0.000000 1.182352 0.930909 0.311360 0.212733 0.000000 0.831971 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.459980 1.133485 0.595238 0.000000 14.811297 1.918693 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.880699 4.951874 0.037994 93.129433 0.010000 0.007614
1 0.985600 14 77169.000000 2563.357143 5831.642857 3.321745 7.556976 975.428571 0.000000 0.000000 0.000000 152.571429 959.500000 12.071429 9.642857 0.000000 146.785714 1.082261 0.000000 0.000000 0.000000 1.082261 0.909914 0.298038 0.201581 0.000000 0.781550 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.062057 1.070162 0.854863 0.000000 13.012918 2.165653 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.481763 4.305978 0.683891 93.528369 0.187500 0.137056
2 0.985600 14 77169.000000 1454.357143 1516.928571 1.884639 1.965723 971.785714 0.000000 0.000000 0.000000 156.214286 981.000000 5.928571 3.428571 0.000000 137.642857 1.146528 0.000000 0.000000 0.000000 1.146528 0.861625 0.242072 0.096579 0.000000 0.842186 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.968085 0.525583 0.303951 0.000000 12.202381 1.006839 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.721884 4.704914 0.284954 94.288247 0.164835 0.057107
3 0.985600 14 77169.000000 1004.857143 781.857143 1.302151 1.013175 965.928571 0.000000 0.000000 0.000000 162.071429 981.428571 4.000000 2.285714 0.000000 140.285714 1.079324 0.000000 0.000000 0.000000 1.079324 0.778865 0.180683 0.100951 0.000000 0.826112 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 87.006079 0.354610 0.202634 0.000000 12.436677 0.728217 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.563576 4.825228 0.164640 94.446555 0.127451 0.032995
4 0.985600 14 77169.000000 782.785714 548.285714 1.014378 0.710500 962.642857 0.000000 0.000000 0.000000 165.357143 980.071429 2.714286 2.285714 0.000000 142.928571 0.977102 0.000000 0.000000 0.000000 0.977102 0.842801 0.145166 0.117552 0.000000 0.829060 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.885765 0.240628 0.202634 0.000000 12.670973 0.595238 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.493921 4.888551 0.101317 94.516211 0.093023 0.020305
5 0.985600 14 77169.000000 653.928571 400.928571 0.847398 0.519546 960.142857 0.000000 0.000000 0.000000 167.857143 979.500000 2.357143 1.785714 0.000000 144.357143 1.007113 0.000000 0.000000 0.000000 1.007113 0.759241 0.128268 0.079130 0.000000 0.772547 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.835106 0.208967 0.158308 0.000000 12.797619 0.455927 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.341945 4.875887 0.113982 94.668186 0.142857 0.022843
cutoff 0 Affix a -0.751238 b 0.894845
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.168673 b 0.873893
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -2.186093 b 0.839892
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -2.451129 b 0.806491
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -2.687102 b 0.796650
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -2.783724 b 0.782550
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1516.928571 (1454.357143)
rules% 1.965723 (1.884639)
same%stdev 0.861625
ambi1%stdev 0.242072
ambi2%stdev 0.096579
ambi3%stdev 0.000000
diff%stdev 0.842186
same% 86.968085 (86.151216)
ambi1% 0.525583 (0.000000)
ambi2% 0.303951 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.202381 (13.848784)
amb.rules% 1.006839 (0.000000)
false_amb% 0.721884 (0.000000)
false_not_amb% 4.704914 (4.989868)
true_amb% 0.284954 (0.000000)
true_not_amb% 94.288247 (95.010132)
precision 0.164835 (0.000000)
recall 0.057107 (0.000000)
bests[5].suffixonly == [false]
bests[5].langbase == [en6W]
comp = comp_parms0_off
bests[5].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.010390 0.693128 -0.691156 0.204301 0.005382 -0.000404
*/
//iteration:20.3
/*weight ( used): 1.12271613326426923e+04 suffix only: no */
/* number of nodes: 9840, nodes/line: 1.25675313230391966e-01 weight ( used): 1.12271613326426923e+04 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
1.03898721887023399e-02, 6.93128471853588057e-01, -6.91156252727169851e-01, 2.04300945832150471e-01, 5.38207657061103278e-03, -4.03932947156769432e-04
}}
};
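/* Editor's sketch (an assumption, not the actual comp_parms0_off): a trained
   parameter row like the {{...}} initializer above can rank two candidate
   rules by the weighted sum of their impact counts, using the comparator
   convention of this file (negative return sorts a before b): */
static int sketch_comp_weighted(const vertex * a, const vertex * b, const double * w)
{
    /* w[0..3] correspond to the R->R, W->R, R->W, W->W columns above */
    double A = w[0] * a->R__R + w[1] * a->W__R + w[2] * a->R__W + w[3] * a->W__W;
    double B = w[0] * b->R__R + w[1] * b->W__R + w[2] * b->R__W + w[3] * b->W__W;
    return (A > B) ? -1 : (A < B) ? 1 : 0;
}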
static bestParms best_enC0 = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*rcount*exp(-2.0*rcount/1.0)
// Aiming at cutoff == 0, because the penalty is maximal for rules that match exactly 1 word/lemma pair
// (see the sketch after this struct). Not good compared with nlC0.
{
false,
"enC0",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 14 77169.000000 14451.285714 14440.214286 18.726802 18.712455 935.285714 5.428571 15.500000 0.000000 171.785714 915.857143 0.000000 0.000000 0.000000 212.142857 1.122684 0.142160 0.476458 0.000000 1.182352 1.195915 0.000000 0.000000 0.000000 1.195915 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 81.193009 0.000000 0.000000 0.000000 18.806991 0.000000 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
1 0.985600 14 77169.000000 2563.357143 6563.214286 3.321745 8.504988 975.428571 0.000000 0.000000 0.000000 152.571429 930.928571 0.000000 0.000000 0.000000 197.071429 1.082261 0.000000 0.000000 0.000000 1.082261 0.944929 0.000000 0.000000 0.000000 0.944929 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 82.529129 0.000000 0.000000 0.000000 17.470871 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
2 0.985600 14 77169.000000 1454.357143 1599.142857 1.884639 2.072261 971.785714 0.000000 0.000000 0.000000 156.214286 931.500000 0.000000 0.000000 0.000000 196.500000 1.146528 0.000000 0.000000 0.000000 1.146528 0.895852 0.000000 0.000000 0.000000 0.895852 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 82.579787 0.000000 0.000000 0.000000 17.420213 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
3 0.985600 14 77169.000000 1004.857143 845.428571 1.302151 1.095555 965.928571 0.000000 0.000000 0.000000 162.071429 928.500000 0.000000 0.000000 0.000000 199.500000 1.079324 0.000000 0.000000 0.000000 1.079324 0.951487 0.000000 0.000000 0.000000 0.951487 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 82.313830 0.000000 0.000000 0.000000 17.686170 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
4 0.985600 14 77169.000000 782.785714 602.357143 1.014378 0.780569 962.642857 0.000000 0.000000 0.000000 165.357143 918.357143 0.000000 0.000000 0.000000 209.642857 0.977102 0.000000 0.000000 0.000000 0.977102 0.972761 0.000000 0.000000 0.000000 0.972761 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 81.414640 0.000000 0.000000 0.000000 18.585360 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
5 0.985600 14 77169.000000 653.928571 465.142857 0.847398 0.602759 960.142857 0.000000 0.000000 0.000000 167.857143 909.357143 0.000000 0.000000 0.000000 218.642857 1.007113 0.000000 0.000000 0.000000 1.007113 0.908489 0.000000 0.000000 0.000000 0.908489 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 80.616768 0.000000 0.000000 0.000000 19.383232 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000
cutoff 0 Affix a -0.366782 b 0.887585: N(rules)= 0.692961*N(trainpairs)^0.887585
Suffix a -1.386611 b 0.965747: N(rules)= 0.249921*N(trainpairs)^0.965747
cutoff 1 Affix a -0.918458 b 0.862552: N(rules)= 0.399134*N(trainpairs)^0.862552
Suffix a -2.548348 b 0.910826: N(rules)= 0.078211*N(trainpairs)^0.910826
cutoff 2 Affix a -2.127536 b 0.843785: N(rules)= 0.119131*N(trainpairs)^0.843785
Suffix a -2.774658 b 0.877415: N(rules)= 0.062371*N(trainpairs)^0.877415
cutoff 3 Affix a -2.457774 b 0.817145: N(rules)= 0.085625*N(trainpairs)^0.817145
Suffix a -2.982081 b 0.866045: N(rules)= 0.050687*N(trainpairs)^0.866045
cutoff 4 Affix a -2.524994 b 0.789327: N(rules)= 0.080059*N(trainpairs)^0.789327
Suffix a -3.164283 b 0.861559: N(rules)= 0.042244*N(trainpairs)^0.861559
cutoff 5 Affix a -2.538105 b 0.765681: N(rules)= 0.079016*N(trainpairs)^0.765681
Suffix a -3.194672 b 0.847348: N(rules)= 0.040980*N(trainpairs)^0.847348
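(Editor's note: the leading factor in each expansion is exp(a); for cutoff 0, exp(-0.366782) = 0.692961, so a and b come straight from the linear fit ln N(rules) = a + b*ln N(trainpairs).)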
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1599.142857 ( 1454.357143)
rules% 2.072261 ( 1.884639)
same%stdev 0.895852
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.895852
same% 82.579787 ( 86.151216)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 17.420213 ( 13.848784)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 4.989868 ( 4.989868)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 95.010132 ( 95.010132)
precision 0.000000 ( 0.000000)
recall 0.000000 ( 0.000000)
bests[11].suffixonly == [false]
bests[11].langbase == [enC0]
comp = comp_parms0_off
bests[11].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.107837 0.622363 -0.720085 0.215238 -0.009468 0.189990
*/
//iteration:20.6
/*weight ( used): 5.65781320119657494e+02 suffix only: no */
/* number of nodes: 14310, nodes/line: inf weight ( used): 5.65781320119657494e+02 blobs 1 lines 0 * fraction 1.00000000000000000e+00 = 0 lines*/
{{
1.07837025520926860e-01, 6.22362687879378429e-01, -7.20085301339376205e-01, 2.15237712457007291e-01, -9.46806103021151715e-03, 1.89989742917709065e-01
}}
};
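/* Editor's sketch: the weight functions quoted in the comments around here
   share the shape w(r) = r^p * exp(-c*r) (en6W adds an affine offset), which
   peaks at r = p/c. Hence rcount*rcount*exp(-2.0*rcount) above penalises
   rules matched by 1 word/lemma pair most (p/c = 2/2), and
   rcount*exp(-rcount/2.0) below penalises rules matched by 2 pairs most
   (p/c = 1/(1/2)). Illustrative only; the real definitions are in graph.h. */
#include <math.h> /* pow(), exp() for the sketch below */
static double sketch_weight(double rcount, double p, double c)
{
    /* d/dr[r^p * exp(-c*r)] = r^(p-1)*exp(-c*r)*(p - c*r), zero at r = p/c */
    return pow(rcount, p) * exp(-c * rcount);
}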
static bestParms best_en6WS1 = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*exp(-rcount/2.0)
// Aiming at cutoff == 2, because the penalty is maximal for rules that match exactly 2 word/lemma pairs.
{
false,
"en6WS1",
1,
/*
0 0.985600 14 77169.000000 14451.285714 12738.785714 18.726802 16.507646 935.285714 5.428571 15.500000 0.000000 171.785714 940.571429 10.357143 5.000000 0.000000 172.071429 1.122684 0.142160 0.476458 0.000000 1.182352 0.773413 0.327317 0.278179 0.000000 0.742912 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.383992 0.918186 0.443262 0.000000 15.254559 1.519757 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.481763 4.951874 0.037994 93.528369 0.012658 0.007614
1 0.985600 14 77169.000000 2563.357143 6752.357143 3.321745 8.750090 975.428571 0.000000 0.000000 0.000000 152.571429 960.857143 10.428571 8.000000 0.000000 148.714286 1.082261 0.000000 0.000000 0.000000 1.082261 0.678602 0.284017 0.235838 0.000000 0.706169 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.182371 0.924519 0.709220 0.000000 13.183891 1.855370 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.348784 4.483283 0.506586 93.661348 0.158103 0.101523
2 0.985600 14 77169.000000 1454.357143 1770.214286 1.884639 2.293945 971.785714 0.000000 0.000000 0.000000 156.214286 975.357143 5.285714 3.071429 0.000000 144.285714 1.146528 0.000000 0.000000 0.000000 1.146528 0.881469 0.221095 0.207076 0.000000 0.907704 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.467832 0.468592 0.272290 0.000000 12.791287 0.930851 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.778875 4.837893 0.151976 94.231256 0.088889 0.030457
3 0.985600 14 77169.000000 1004.857143 954.571429 1.302151 1.236988 965.928571 0.000000 0.000000 0.000000 162.071429 975.214286 4.642857 2.142857 0.000000 146.000000 1.079324 0.000000 0.000000 0.000000 1.079324 0.926655 0.157849 0.166244 0.000000 0.867220 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 86.455167 0.411601 0.189970 0.000000 12.943262 0.804205 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.690223 4.875887 0.113982 94.319909 0.076271 0.022843
4 0.985600 14 77169.000000 782.785714 675.214286 1.014378 0.874981 962.642857 0.000000 0.000000 0.000000 165.357143 973.785714 3.428571 1.857143 0.000000 148.928571 0.977102 0.000000 0.000000 0.000000 0.977102 0.993406 0.154392 0.119736 0.000000 0.988089 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.328521 0.303951 0.164640 0.000000 13.202888 0.658561 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.557244 4.888551 0.101317 94.452888 0.083333 0.020305
5 0.985600 14 77169.000000 653.928571 510.214286 0.847398 0.661165 960.142857 0.000000 0.000000 0.000000 167.857143 971.714286 3.357143 2.000000 0.000000 150.928571 1.007113 0.000000 0.000000 0.000000 1.007113 0.941701 0.192374 0.115327 0.000000 0.916345 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.144883 0.297619 0.177305 0.000000 13.380193 0.633232 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.550912 4.907548 0.082320 94.459220 0.069519 0.016497
cutoff 0 Affix a -0.611390 b 0.895684
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.154468 b 0.886261
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -2.266144 b 0.864527
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -2.617687 b 0.842355
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -2.846212 b 0.831893
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -2.862054 b 0.808661
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1770.214286 (1454.357143)
rules% 2.293945 (1.884639)
same%stdev 0.881469
ambi1%stdev 0.221095
ambi2%stdev 0.207076
ambi3%stdev 0.000000
diff%stdev 0.907704
same% 86.467832 (86.151216)
ambi1% 0.468592 (0.000000)
ambi2% 0.272290 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.791287 (13.848784)
amb.rules% 0.930851 (0.000000)
false_amb% 0.778875 (0.000000)
false_not_amb% 4.837893 (4.989868)
true_amb% 0.151976 (0.000000)
true_not_amb% 94.231256 (95.010132)
precision 0.088889 (0.000000)
recall 0.030457 (0.000000)
bests[8].suffixonly == [false]
bests[8].langbase == [en6WS1]
comp = comp_parms0_off
bests[8].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.099984 0.698758 -0.658037 0.246512 0.028495 0.084549
*/
//iteration:20.7
/*weight ( used): 4.15520713829023862e+03 suffix only: no */
/* number of nodes: 11740, nodes/line: 1.49941887939512380e-01 weight ( used): 4.15520713829023862e+03 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
9.99843486891121014e-02, 6.98757557409581676e-01, -6.58036595438773131e-01, 2.46512251235911778e-01, 2.84952417970883061e-02, 8.45492532733789354e-02
}}
};
static bestParms best_en6WS2 = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*exp(-rcount/3.0)
// Aiming at cutoff == 2, because the penalty is maximal for rules that match exactly 3 word/lemma pairs.
{
false,
"en6WS2",
1,
/*
0 0.985600 14 77169.000000 14451.285714 11399.714286 18.726802 14.772401 935.285714 5.428571 15.500000 0.000000 171.785714 944.285714 11.000000 5.214286 0.000000 167.500000 1.122684 0.142160 0.476458 0.000000 1.182352 0.773971 0.243407 0.200292 0.000000 0.705588 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.713273 0.975177 0.462259 0.000000 14.849291 1.621074 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.583080 4.951874 0.037994 93.427052 0.011858 0.007614
1 0.985600 14 77169.000000 2563.357143 5901.071429 3.321745 7.646946 975.428571 0.000000 0.000000 0.000000 152.571429 963.714286 10.000000 7.500000 0.000000 146.785714 1.082261 0.000000 0.000000 0.000000 1.082261 0.701875 0.307101 0.154532 0.000000 0.566244 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.435664 0.886525 0.664894 0.000000 13.012918 1.804711 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.329787 4.514944 0.474924 93.680344 0.151515 0.095178
2 0.985600 14 77169.000000 1454.357143 1501.357143 1.884639 1.945544 971.785714 0.000000 0.000000 0.000000 156.214286 980.785714 5.071429 2.571429 0.000000 139.571429 1.146528 0.000000 0.000000 0.000000 1.146528 0.858236 0.207076 0.137842 0.000000 0.818972 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.949088 0.449595 0.227964 0.000000 12.373354 0.848531 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.645897 4.787234 0.202634 94.364235 0.135593 0.040609
3 0.985600 14 77169.000000 1004.857143 741.285714 1.302151 0.960600 965.928571 0.000000 0.000000 0.000000 162.071429 981.428571 2.714286 2.000000 0.000000 141.857143 1.079324 0.000000 0.000000 0.000000 1.079324 0.794999 0.112291 0.098351 0.000000 0.790532 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 87.006079 0.240628 0.177305 0.000000 12.575988 0.563576 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.436930 4.863222 0.126646 94.573202 0.126582 0.025381
4 0.985600 14 77169.000000 782.785714 524.000000 1.014378 0.679029 962.642857 0.000000 0.000000 0.000000 165.357143 980.928571 2.142857 1.785714 0.000000 143.142857 0.977102 0.000000 0.000000 0.000000 0.977102 0.852784 0.114576 0.110937 0.000000 0.842365 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.961753 0.189970 0.158308 0.000000 12.689970 0.481256 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.367275 4.875887 0.113982 94.642857 0.134328 0.022843
5 0.985600 14 77169.000000 653.928571 401.428571 0.847398 0.520194 960.142857 0.000000 0.000000 0.000000 167.857143 979.857143 2.071429 1.214286 0.000000 144.857143 1.007113 0.000000 0.000000 0.000000 1.007113 0.740788 0.122763 0.079130 0.000000 0.725116 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.866768 0.183637 0.107649 0.000000 12.841945 0.360942 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.246960 4.875887 0.113982 94.763171 0.187500 0.022843
cutoff 0 Affix a -0.743918 b 0.896681
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.197041 b 0.877837
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -2.161264 b 0.837071
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -2.305112 b 0.789647
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -2.562766 b 0.782704
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -2.797124 b 0.784175
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1501.357143 (1454.357143)
rules% 1.945544 (1.884639)
same%stdev 0.858236
ambi1%stdev 0.207076
ambi2%stdev 0.137842
ambi3%stdev 0.000000
diff%stdev 0.818972
same% 86.949088 (86.151216)
ambi1% 0.449595 (0.000000)
ambi2% 0.227964 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.373354 (13.848784)
amb.rules% 0.848531 (0.000000)
false_amb% 0.645897 (0.000000)
false_not_amb% 4.787234 (4.989868)
true_amb% 0.202634 (0.000000)
true_not_amb% 94.364235 (95.010132)
precision 0.135593 (0.000000)
recall 0.040609 (0.000000)
bests[7].suffixonly == [false]
bests[7].langbase == [en6WS2]
comp = comp_parms0_off
bests[7].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.040853 0.673407 -0.708980 0.204693 0.017152 -0.002812
*/
//iteration:20.3
/*weight ( used): 5.80760738773845060e+03 suffix only: no */
/* number of nodes: 10161, nodes/line: 1.29775087168090719e-01 weight ( used): 5.80760738773845060e+03 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
4.08527421595744900e-02, 6.73406780072449362e-01, -7.08980349688113631e-01, 2.04692793670780471e-01, 1.71516852929795953e-02, -2.81170366166549109e-03
}}
};
static bestParms best_en6WS3 = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*exp(-rcount/4.0)
// Aiming at cutoff == 3, because the penalty is maximal for rules that match exactly 4 word/lemma pairs.
{
false,
"en6WS3",
1,
/*
0 0.985600 14 77169.000000 14451.285714 11318.285714 18.726802 14.666881 935.285714 5.428571 15.500000 0.000000 171.785714 942.142857 11.285714 6.071429 0.000000 168.500000 1.122684 0.142160 0.476458 0.000000 1.182352 1.015482 0.279109 0.175469 0.000000 0.819736 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.523303 1.000507 0.538247 0.000000 14.937943 1.728723 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.690729 4.951874 0.037994 93.319402 0.011111 0.007614
1 0.985600 14 77169.000000 2563.357143 5873.857143 3.321745 7.611680 975.428571 0.000000 0.000000 0.000000 152.571429 962.071429 11.571429 8.642857 0.000000 145.714286 1.082261 0.000000 0.000000 0.000000 1.082261 0.862652 0.279727 0.216057 0.000000 0.687956 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.290020 1.025836 0.766211 0.000000 12.917933 2.045339 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.481763 4.426292 0.563576 93.528369 0.159785 0.112944
2 0.985600 14 77169.000000 1454.357143 1506.500000 1.884639 1.952209 971.785714 0.000000 0.000000 0.000000 156.214286 980.714286 5.285714 3.357143 0.000000 138.642857 1.146528 0.000000 0.000000 0.000000 1.146528 0.840620 0.229151 0.132898 0.000000 0.881469 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.942756 0.468592 0.297619 0.000000 12.291033 0.930851 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.728217 4.787234 0.202634 94.281915 0.122137 0.040609
3 0.985600 14 77169.000000 1004.857143 772.357143 1.302151 1.000865 965.928571 0.000000 0.000000 0.000000 162.071429 980.642857 3.428571 2.571429 0.000000 141.357143 1.079324 0.000000 0.000000 0.000000 1.079324 0.806994 0.165723 0.108378 0.000000 0.874583 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 86.936424 0.303951 0.227964 0.000000 12.531662 0.677558 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.550912 4.863222 0.126646 94.459220 0.103093 0.025381
4 0.985600 14 77169.000000 782.785714 543.642857 1.014378 0.704483 962.642857 0.000000 0.000000 0.000000 165.357143 979.571429 2.642857 2.214286 0.000000 143.571429 0.977102 0.000000 0.000000 0.000000 0.977102 0.841545 0.149994 0.110937 0.000000 0.876729 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.841439 0.234296 0.196302 0.000000 12.727964 0.563576 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.462259 4.888551 0.101317 94.547872 0.098765 0.020305
5 0.985600 14 77169.000000 653.928571 400.714286 0.847398 0.519268 960.142857 0.000000 0.000000 0.000000 167.857143 978.928571 2.071429 1.857143 0.000000 145.142857 1.007113 0.000000 0.000000 0.000000 1.007113 0.723954 0.122763 0.109172 0.000000 0.743232 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.784448 0.183637 0.164640 0.000000 12.867275 0.417933 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.316616 4.888551 0.101317 94.693516 0.137931 0.020305
cutoff 0 Affix a -0.755377 b 0.897222
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.217108 b 0.879540
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -2.116345 b 0.832979
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -2.356162 b 0.796254
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -2.473309 b 0.774972
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -2.713573 b 0.775252
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1506.500000 (1454.357143)
rules% 1.952209 (1.884639)
same%stdev 0.840620
ambi1%stdev 0.229151
ambi2%stdev 0.132898
ambi3%stdev 0.000000
diff%stdev 0.881469
same% 86.942756 (86.151216)
ambi1% 0.468592 (0.000000)
ambi2% 0.297619 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.291033 (13.848784)
amb.rules% 0.930851 (0.000000)
false_amb% 0.728217 (0.000000)
false_not_amb% 4.787234 (4.989868)
true_amb% 0.202634 (0.000000)
true_not_amb% 94.281915 (95.010132)
precision 0.122137 (0.000000)
recall 0.040609 (0.000000)
bests[9].suffixonly == [false]
bests[9].langbase == [en6WS3]
comp = comp_parms0_off
bests[9].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.020806 0.683019 -0.702896 0.197456 -0.000308 0.001105
*/
//iteration:20.9
/*weight ( used): 7.45427103780143261e+03 suffix only: no */
/* number of nodes: 10102, nodes/line: 1.29021546163965412e-01 weight ( used): 7.45427103780143261e+03 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
2.08057417098401302e-02, 6.83018660111897935e-01, -7.02895839518911436e-01, 1.97455702463298050e-01, -3.08022357388210524e-04, 1.10477364594036725e-03
}}
};
static bestParms best_enS2 = // English, ambiguous training pairs in training set derived from CELEX
// weight function (graph.h) rcount*rcount*exp(-2.0*rcount/3.0)
// Aiming at cutoff == 2, because the penalty is maximal for rules that match exactly 3 word/lemma pairs.
{
false,
"enS2",
1,
/*
0 0.985600 14 77169.000000 14451.285714 11784.142857 18.726802 15.270566 935.285714 5.428571 15.500000 0.000000 171.785714 945.071429 9.500000 4.928571 0.000000 168.500000 1.122684 0.142160 0.476458 0.000000 1.182352 1.057826 0.308574 0.188748 0.000000 0.849436 82.915400 0.481256 1.374113 0.000000 15.229230 1.994681 83.782928 0.842199 0.436930 0.000000 14.937943 1.437437 1.956687 4.951874 0.037994 93.053445 0.009615 0.007614 1.399443 4.951874 0.037994 93.610689 0.013393 0.007614
1 0.985600 14 77169.000000 2563.357143 6187.785714 3.321745 8.018486 975.428571 0.000000 0.000000 0.000000 152.571429 965.642857 9.428571 6.642857 0.000000 146.285714 1.082261 0.000000 0.000000 0.000000 1.082261 1.018306 0.294467 0.198559 0.000000 0.846354 86.474164 0.000000 0.000000 0.000000 13.525836 0.000000 85.606636 0.835866 0.588906 0.000000 12.968592 1.621074 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 1.127153 4.495947 0.493921 93.882979 0.179724 0.098985
2 0.985600 14 77169.000000 1454.357143 1461.000000 1.884639 1.893247 971.785714 0.000000 0.000000 0.000000 156.214286 980.928571 4.500000 2.928571 0.000000 139.642857 1.146528 0.000000 0.000000 0.000000 1.146528 0.981951 0.179844 0.149417 0.000000 1.001545 86.151216 0.000000 0.000000 0.000000 13.848784 0.000000 86.961753 0.398936 0.259625 0.000000 12.379686 0.816869 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.595238 4.768237 0.221631 94.414894 0.156951 0.044416
3 0.985600 14 77169.000000 1004.857143 740.500000 1.302151 0.959582 965.928571 0.000000 0.000000 0.000000 162.071429 981.214286 3.000000 1.785714 0.000000 142.000000 1.079324 0.000000 0.000000 0.000000 1.079324 0.956827 0.155507 0.061994 0.000000 0.977345 85.631966 0.000000 0.000000 0.000000 14.368034 0.000000 86.987082 0.265957 0.158308 0.000000 12.588652 0.607903 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.455927 4.837893 0.151976 94.554205 0.142857 0.030457
4 0.985600 14 77169.000000 782.785714 542.357143 1.014378 0.702817 962.642857 0.000000 0.000000 0.000000 165.357143 980.142857 2.357143 1.357143 0.000000 144.142857 0.977102 0.000000 0.000000 0.000000 0.977102 0.955992 0.145908 0.102015 0.000000 0.992602 85.340679 0.000000 0.000000 0.000000 14.659321 0.000000 86.892097 0.208967 0.120314 0.000000 12.778622 0.487589 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.360942 4.863222 0.126646 94.649189 0.149254 0.025381
5 0.985600 14 77169.000000 653.928571 418.785714 0.847398 0.542686 960.142857 0.000000 0.000000 0.000000 167.857143 978.928571 2.642857 1.285714 0.000000 145.142857 1.007113 0.000000 0.000000 0.000000 1.007113 0.893632 0.149994 0.117552 0.000000 0.880954 85.119048 0.000000 0.000000 0.000000 14.880952 0.000000 86.784448 0.234296 0.113982 0.000000 12.867275 0.468592 0.000000 4.989868 0.000000 95.010132 0.000000 0.000000 0.354610 4.875887 0.113982 94.655522 0.138462 0.022843
cutoff 0 Affix a -0.666235 b 0.892776
Suffix a -1.386611 b 0.965747
cutoff 1 Affix a -1.147043 b 0.876909
Suffix a -2.548348 b 0.910826
cutoff 2 Affix a -1.806319 b 0.801931
Suffix a -2.774658 b 0.877415
cutoff 3 Affix a -1.747592 b 0.737090
Suffix a -2.982081 b 0.866045
cutoff 4 Affix a -1.922767 b 0.724786
Suffix a -3.164283 b 0.861559
cutoff 5 Affix a -1.957773 b 0.707745
Suffix a -3.194672 b 0.847348
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 14
trainlines 77169.000000
rules 1461.000000 (1454.357143)
rules% 1.893247 (1.884639)
same%stdev 0.981951
ambi1%stdev 0.179844
ambi2%stdev 0.149417
ambi3%stdev 0.000000
diff%stdev 1.001545
same% 86.961753 (86.151216)
ambi1% 0.398936 (0.000000)
ambi2% 0.259625 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 12.379686 (13.848784)
amb.rules% 0.816869 (0.000000)
false_amb% 0.595238 (0.000000)
false_not_amb% 4.768237 (4.989868)
true_amb% 0.221631 (0.000000)
true_not_amb% 94.414894 (95.010132)
precision 0.156951 (0.000000)
recall 0.044416 (0.000000)
bests[10].suffixonly == [false]
bests[10].langbase == [enS2]
comp = comp_parms0_off
bests[10].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.179015 0.692631 -0.665544 0.161160 0.137396 -0.020423
*/
//iteration:20.4
/*weight ( used): 4.66981464254577713e+03 suffix only: no */
/* number of nodes: 10378, nodes/line: 1.32546585437500808e-01 weight ( used): 4.66981464254577713e+03 blobs 1 lines 78297 * fraction 1.00000000000000000e+00 = 78297 lines*/
{{
1.79014987751130839e-01, 6.92630686252189487e-01, -6.65544221444446804e-01, 1.61160030170824808e-01, 1.37395843876723128e-01, -2.04226023055548052e-02
}}
};
static bestParms best_deC0 = // German
{
false,
"deC0",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 313549.000000 42991.285714 34219.714286 13.711186 10.913674 4070.857143 15.000000 17.142857 0.428571 478.571429 4116.142857 19.142857 16.714286 0.000000 430.000000 0.487918 0.087298 0.075000 0.017171 0.477629 0.326014 0.159099 0.106706 0.000000 0.267592 88.844547 0.327368 0.374135 0.009353 10.444597 0.841803 89.832886 0.417784 0.364781 0.000000 9.384548 0.957162 0.779448 1.686724 0.062356 97.471472 0.038462 0.035651 0.894806 1.686724 0.062356 97.356114 0.033670 0.035651
1 0.985600 7 313549.000000 12232.142857 18745.285714 3.901190 5.978423 4059.571429 0.000000 0.000000 0.000000 522.428571 4123.428571 15.285714 19.000000 0.000000 424.285714 0.422039 0.000000 0.000000 0.000000 0.422039 0.387932 0.083310 0.110568 0.000000 0.367732 88.598242 0.000000 0.000000 0.000000 11.401758 0.000000 89.991894 0.333604 0.414666 0.000000 9.259837 0.882335 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.760741 1.627486 0.121594 97.490179 0.074004 0.069519
2 0.985600 7 313549.000000 7700.142857 8184.428571 2.455802 2.610255 4015.285714 0.000000 0.000000 0.000000 566.714286 4115.142857 11.714286 12.857143 0.000000 442.285714 0.433335 0.000000 0.000000 0.000000 0.433335 0.510655 0.053033 0.079121 0.000000 0.478270 87.631727 0.000000 0.000000 0.000000 12.368273 0.000000 89.811062 0.255659 0.280601 0.000000 9.652678 0.654736 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.579909 1.674253 0.074827 97.671011 0.060606 0.042781
3 0.985600 7 313549.000000 5708.857143 5542.571429 1.820722 1.767689 3974.571429 0.000000 0.000000 0.000000 607.428571 4092.428571 8.857143 9.857143 0.000000 470.857143 0.478957 0.000000 0.000000 0.000000 0.478957 0.484232 0.042597 0.107236 0.000000 0.460635 86.743156 0.000000 0.000000 0.000000 13.256844 0.000000 89.315333 0.193303 0.215128 0.000000 10.276236 0.514435 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.458315 1.692960 0.056120 97.792605 0.057692 0.032086
4 0.985600 7 313549.000000 4622.571429 4432.714286 1.474274 1.413723 3931.571429 0.000000 0.000000 0.000000 650.428571 4069.000000 9.571429 7.857143 0.000000 495.571429 0.479454 0.000000 0.000000 0.000000 0.479454 0.539468 0.050176 0.092102 0.000000 0.502100 85.804702 0.000000 0.000000 0.000000 14.195298 0.000000 88.804016 0.208892 0.171478 0.000000 10.815614 0.473904 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.420902 1.696078 0.053002 97.830018 0.059233 0.030303
5 0.985600 7 313549.000000 3814.857143 3738.571429 1.216670 1.192340 3900.714286 0.000000 0.000000 0.000000 681.285714 4050.428571 10.000000 8.142857 0.000000 513.428571 0.522987 0.000000 0.000000 0.000000 0.522987 0.520727 0.073472 0.105745 0.000000 0.453064 85.131259 0.000000 0.000000 0.000000 14.868741 0.000000 88.398703 0.218245 0.177714 0.000000 11.205338 0.498846 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.445844 1.696078 0.053002 97.805076 0.056106 0.030303
cutoff 0 Affix a 0.696894 b 0.771507: N(rules)= 2.007508*N(trainpairs)^0.771507
Suffix a 0.255148 b 0.819252: N(rules)= 1.290653*N(trainpairs)^0.819252
cutoff 1 Affix a 0.424828 b 0.745235: N(rules)= 1.529327*N(trainpairs)^0.745235
Suffix a -1.476990 b 0.863095: N(rules)= 0.228324*N(trainpairs)^0.863095
cutoff 2 Affix a -0.660663 b 0.770330: N(rules)= 0.516509*N(trainpairs)^0.770330
Suffix a -2.565046 b 0.914071: N(rules)= 0.076916*N(trainpairs)^0.914071
cutoff 3 Affix a -1.199171 b 0.781329: N(rules)= 0.301444*N(trainpairs)^0.781329
Suffix a -3.132748 b 0.934644: N(rules)= 0.043598*N(trainpairs)^0.934644
cutoff 4 Affix a -1.435560 b 0.778725: N(rules)= 0.237982*N(trainpairs)^0.778725
Suffix a -3.498437 b 0.945496: N(rules)= 0.030245*N(trainpairs)^0.945496
cutoff 5 Affix a -1.957380 b 0.805932: N(rules)= 0.141228*N(trainpairs)^0.805932
Suffix a -3.614366 b 0.938124: N(rules)= 0.026934*N(trainpairs)^0.938124
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 1
fraction 9856.000000
iterations 7
trainlines 313549.000000
rules 18745.285714 ( 12232.142857)
rules% 5.978423 ( 3.901190)
same%stdev 0.387932
ambi1%stdev 0.083310
ambi2%stdev 0.110568
ambi3%stdev 0.000000
diff%stdev 0.367732
same% 89.991894 ( 88.598242)
ambi1% 0.333604 ( 0.000000)
ambi2% 0.414666 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.259837 ( 11.401758)
amb.rules% 0.882335 ( 0.000000)
false_amb% 0.760741 ( 0.000000)
false_not_amb% 1.627486 ( 1.749080)
true_amb% 0.121594 ( 0.000000)
true_not_amb% 97.490179 ( 98.250920)
precision 0.074004 ( 0.000000)
recall 0.069519 ( 0.000000)
bests[2].suffixonly == [false]
bests[2].langbase == [deC0]
comp = comp_parms0_off
bests[2].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.070270 0.738908 -0.655311 0.134438 0.039578 0.002103
*/
//iteration:20.9
/*weight ( used): 1.57191628625387852e+03 suffix only: no */
/* number of nodes: 33777, nodes/line: inf weight ( used): 1.57191628625387852e+03 blobs 1 lines 0 * fraction 1.00000000000000000e+00 = 0 lines*/
{{
7.02703009236997217e-02, 7.38908457216426395e-01, -6.55310616504478860e-01, 1.34437786206644400e-01, 3.95781745400746385e-02, 2.10292619579706538e-03
}}
};
static bestParms best_deC1 = // German
{
false,
"deC1",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 313549.000000 42991.285714 41378.428571 13.711186 13.196798 4070.857143 15.000000 17.142857 0.428571 478.571429 4113.285714 24.285714 18.857143 0.000000 425.571429 0.487918 0.087298 0.075000 0.017171 0.477629 0.351168 0.068686 0.095488 0.000000 0.394426 88.844547 0.327368 0.374135 0.009353 10.444597 0.841803 89.770531 0.530024 0.411548 0.000000 9.287897 1.147347 0.779448 1.686724 0.062356 97.471472 0.038462 0.035651 1.078755 1.680489 0.068591 97.172164 0.030812 0.039216
1 0.985600 7 313549.000000 12232.142857 21711.000000 3.901190 6.924277 4059.571429 0.000000 0.000000 0.000000 522.428571 4113.428571 22.857143 21.857143 0.000000 423.857143 0.422039 0.000000 0.000000 0.000000 0.422039 0.329750 0.079121 0.089479 0.000000 0.321107 88.598242 0.000000 0.000000 0.000000 11.401758 0.000000 89.773648 0.498846 0.477022 0.000000 9.250483 1.150465 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.991457 1.590073 0.159007 97.259462 0.074236 0.090909
2 0.985600 7 313549.000000 7700.142857 9295.285714 2.455802 2.964540 4015.285714 0.000000 0.000000 0.000000 566.714286 4109.714286 16.571429 17.285714 0.000000 438.428571 0.433335 0.000000 0.000000 0.000000 0.433335 0.426131 0.117432 0.060053 0.000000 0.397034 87.631727 0.000000 0.000000 0.000000 12.368273 0.000000 89.692586 0.361664 0.377253 0.000000 9.568498 0.876099 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.773212 1.646193 0.102887 97.477708 0.062382 0.058824
3 0.985600 7 313549.000000 5708.857143 6627.142857 1.820722 2.113591 3974.571429 0.000000 0.000000 0.000000 607.428571 4081.857143 16.714286 14.571429 0.000000 468.857143 0.478957 0.000000 0.000000 0.000000 0.478957 0.367454 0.099786 0.041519 0.000000 0.406434 86.743156 0.000000 0.000000 0.000000 13.256844 0.000000 89.084617 0.364781 0.318015 0.000000 10.232587 0.826214 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.723327 1.646193 0.102887 97.527592 0.066398 0.058824
4 0.985600 7 313549.000000 4622.571429 5333.857143 1.474274 1.701124 3931.571429 0.000000 0.000000 0.000000 650.428571 4052.000000 16.857143 14.428571 0.000000 498.714286 0.479454 0.000000 0.000000 0.000000 0.479454 0.434120 0.084928 0.046905 0.000000 0.429101 85.804702 0.000000 0.000000 0.000000 14.195298 0.000000 88.432999 0.367899 0.314897 0.000000 10.884205 0.835568 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.713974 1.627486 0.121594 97.536946 0.078471 0.069519
5 0.985600 7 313549.000000 3814.857143 4496.857143 1.216670 1.434180 3900.714286 0.000000 0.000000 0.000000 681.285714 4033.714286 15.857143 14.000000 0.000000 518.428571 0.522987 0.000000 0.000000 0.000000 0.522987 0.378162 0.070639 0.035639 0.000000 0.350295 85.131259 0.000000 0.000000 0.000000 14.868741 0.000000 88.033922 0.346075 0.305543 0.000000 11.314460 0.810625 0.000000 1.749080 0.000000 98.250920 0.000000 0.000000 0.682796 1.621251 0.127829 97.568124 0.085595 0.073084
cutoff 0 Affix a 0.919873 b 0.765671: N(rules)= 2.508971*N(trainpairs)^0.765671
Suffix a 0.255148 b 0.819252: N(rules)= 1.290653*N(trainpairs)^0.819252
cutoff 1 Affix a 0.525258 b 0.746897: N(rules)= 1.690895*N(trainpairs)^0.746897
Suffix a -1.476990 b 0.863095: N(rules)= 0.228324*N(trainpairs)^0.863095
cutoff 2 Affix a -1.113654 b 0.815502: N(rules)= 0.328357*N(trainpairs)^0.815502
Suffix a -2.565046 b 0.914071: N(rules)= 0.076916*N(trainpairs)^0.914071
cutoff 3 Affix a -1.594497 b 0.824227: N(rules)= 0.203011*N(trainpairs)^0.824227
Suffix a -3.132748 b 0.934644: N(rules)= 0.043598*N(trainpairs)^0.934644
cutoff 4 Affix a -2.182396 b 0.851581: N(rules)= 0.112771*N(trainpairs)^0.851581
Suffix a -3.498437 b 0.945496: N(rules)= 0.030245*N(trainpairs)^0.945496
cutoff 5 Affix a -2.728401 b 0.880592: N(rules)= 0.065324*N(trainpairs)^0.880592
Suffix a -3.614366 b 0.938124: N(rules)= 0.026934*N(trainpairs)^0.938124
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 1
fraction 9856.000000
iterations 7
trainlines 313549.000000
rules 21711.000000 ( 12232.142857)
rules% 6.924277 ( 3.901190)
same%stdev 0.329750
ambi1%stdev 0.079121
ambi2%stdev 0.089479
ambi3%stdev 0.000000
diff%stdev 0.321107
same% 89.773648 ( 88.598242)
ambi1% 0.498846 ( 0.000000)
ambi2% 0.477022 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.250483 ( 11.401758)
amb.rules% 1.150465 ( 0.000000)
false_amb% 0.991457 ( 0.000000)
false_not_amb% 1.590073 ( 1.749080)
true_amb% 0.159007 ( 0.000000)
true_not_amb% 97.259462 ( 98.250920)
precision 0.074236 ( 0.000000)
recall 0.090909 ( 0.000000)
bests[4].suffixonly == [false]
bests[4].langbase == [deC1]
comp = comp_parms0_off
bests[4].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.155770 0.629146 -0.737028 0.165459 0.079044 0.055455
*/
//iteration:20.8
/*weight ( used): 6.78787945680193934e+03 suffix only: no */
/* number of nodes: 36815, nodes/line: 1.15722768293564607e-01 weight ( used): 6.78787945680193934e+03 blobs 1 lines 318131 * fraction 1.00000000000000000e+00 = 318131 lines*/
{{
1.55770018417734107e-01, 6.29146340759315748e-01, -7.37028188136959139e-01, 1.65459089224974654e-01, 7.90444951020151526e-02, 5.54553040724012833e-02
}}
};
static bestParms best_elC0 = // Greek
{
false,
"elC0",
1,
/* First time with ambiguity > 2 (2015.02.16)
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 5 556568.000000 91660.600000 77694.400000 16.468895 13.959552 6965.200000 17.600000 11.000000 0.200000 1138.000000 6947.200000 51.200000 25.400000 0.000000 1108.200000 0.210385 0.033225 0.035852 0.005499 0.228739 0.347749 0.047148 0.053319 0.000000 0.309804 85.651746 0.216429 0.135268 0.002459 13.994097 0.450074 85.430398 0.629611 0.312346 0.000000 13.627644 1.160846 0.356616 5.627152 0.093458 93.922774 0.115854 0.016337 1.032956 5.592720 0.127890 93.246434 0.058296 0.022356
1 0.985600 5 556568.000000 20985.400000 40104.800000 3.770501 7.205732 7066.800000 0.000000 0.000000 0.000000 1065.200000 6970.600000 96.600000 111.000000 0.000000 953.800000 0.276261 0.000000 0.000000 0.000000 0.276261 0.407162 0.078548 0.066222 0.000000 0.431136 86.901131 0.000000 0.000000 0.000000 13.098869 0.000000 85.718151 1.187900 1.364978 0.000000 11.728972 2.843089 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.976390 3.853910 1.866699 93.303000 0.488731 0.326311
2 0.985600 5 556568.000000 12589.600000 13686.600000 2.262006 2.459107 6981.000000 0.000000 0.000000 0.000000 1151.000000 6986.200000 134.000000 193.800000 0.200000 817.800000 0.350629 0.000000 0.000000 0.000000 0.350629 0.325745 0.188912 0.167575 0.005499 0.256093 85.846040 0.000000 0.000000 0.000000 14.153960 0.000000 85.909985 1.647811 2.383178 0.002459 10.056567 4.323660 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.782095 2.179046 3.541564 93.497295 0.693642 0.619089
3 0.985600 5 556568.000000 8924.600000 8586.000000 1.603506 1.542669 6909.800000 0.000000 0.000000 0.000000 1222.200000 6959.800000 148.200000 187.200000 0.000000 836.800000 0.399930 0.000000 0.000000 0.000000 0.399930 0.325397 0.194123 0.148942 0.000000 0.269472 84.970487 0.000000 0.000000 0.000000 15.029513 0.000000 85.585342 1.822430 2.302017 0.000000 10.290212 4.402361 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.686178 2.004427 3.716183 93.593212 0.730304 0.649613
4 0.985600 5 556568.000000 6978.800000 6381.800000 1.253899 1.146634 6859.600000 0.000000 0.000000 0.000000 1272.400000 6931.800000 155.600000 181.000000 0.200000 863.400000 0.294951 0.000000 0.000000 0.000000 0.294951 0.305255 0.158818 0.170172 0.005499 0.233580 84.353173 0.000000 0.000000 0.000000 15.646827 0.000000 85.241023 1.913428 2.225775 0.002459 10.617314 4.404820 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.673881 1.989670 3.730939 93.605509 0.734625 0.652193
5 0.985600 5 556568.000000 5790.000000 5232.800000 1.040304 0.940191 6802.000000 0.000000 0.000000 0.000000 1330.000000 6908.400000 161.800000 172.000000 0.000000 889.800000 0.253063 0.000000 0.000000 0.000000 0.253063 0.295079 0.147923 0.123584 0.000000 0.229300 83.644860 0.000000 0.000000 0.000000 16.355140 0.000000 84.953271 1.989670 2.115101 0.000000 10.941958 4.387605 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.659124 1.992130 3.728480 93.620266 0.738791 0.651763
cutoff 0 Affix a 0.596157 b 0.808684: N(rules)= 1.815130*N(trainpairs)^0.808684
Suffix a -0.054097 b 0.862983: N(rules)= 0.947341*N(trainpairs)^0.862983
cutoff 1 Affix a 0.030996 b 0.801290: N(rules)= 1.031482*N(trainpairs)^0.801290
Suffix a -1.391083 b 0.856475: N(rules)= 0.248806*N(trainpairs)^0.856475
cutoff 2 Affix a -0.559577 b 0.760960: N(rules)= 0.571451*N(trainpairs)^0.760960
Suffix a -1.801956 b 0.843674: N(rules)= 0.164976*N(trainpairs)^0.843674
cutoff 3 Affix a -0.787383 b 0.742457: N(rules)= 0.455034*N(trainpairs)^0.742457
Suffix a -1.870778 b 0.819764: N(rules)= 0.154004*N(trainpairs)^0.819764
cutoff 4 Affix a -0.907361 b 0.730001: N(rules)= 0.403588*N(trainpairs)^0.730001
Suffix a -1.821582 b 0.794623: N(rules)= 0.161770*N(trainpairs)^0.794623
cutoff 5 Affix a -0.992606 b 0.721078: N(rules)= 0.370610*N(trainpairs)^0.721078
Suffix a -1.762567 b 0.774137: N(rules)= 0.171604*N(trainpairs)^0.774137
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 2
fraction 9856.000000
iterations 5
trainlines 556568.000000
rules 13686.600000 ( 12589.600000)
rules% 2.459107 ( 2.262006)
same%stdev 0.325745
ambi1%stdev 0.188912
ambi2%stdev 0.167575
ambi3%stdev 0.005499
diff%stdev 0.256093
same% 85.909985 ( 85.846040)
ambi1% 1.647811 ( 0.000000)
ambi2% 2.383178 ( 0.000000)
ambi3% 0.002459 ( 0.000000)
diff% 10.056567 ( 14.153960)
amb.rules% 4.323660 ( 0.000000)
false_amb% 0.782095 ( 0.000000)
false_not_amb% 2.179046 ( 5.720610)
true_amb% 3.541564 ( 0.000000)
true_not_amb% 93.497295 ( 94.279390)
precision 0.693642 ( 0.000000)
recall 0.619089 ( 0.000000)
bests[5].suffixonly == [false]
bests[5].langbase == [elC0]
comp = comp_parms0_off
bests[5].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.162988 0.723313 -0.629186 0.197114 0.003088 0.124558
*/
//iteration:20.5
/*weight ( used): 2.88408936339958746e+03 suffix only: no */
/* number of nodes: 71647, nodes/line: 1.26876217460598545e-01 weight ( used): 2.88408936339958746e+03 blobs 1 lines 564700 * fraction 1.00000000000000000e+00 = 564700 lines*/
{{
1.62988138483171396e-01, 7.23313127184479043e-01, -6.29185831644276772e-01, 1.97113838764327726e-01, 3.08765150599594205e-03, 1.24558327663259708e-01
}}
};
static bestParms best_elC1 = // Greek
{
false,
"elC1",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 5 556568.000000 91660.600000 78373.000000 16.468895 14.081478 6965.200000 17.600000 11.000000 0.200000 1138.000000 6959.600000 46.400000 24.400000 0.000000 1101.600000 0.210385 0.033225 0.035852 0.005499 0.228739 0.406233 0.093894 0.079027 0.000000 0.330721 85.651746 0.216429 0.135268 0.002459 13.994097 0.450074 85.582882 0.570585 0.300049 0.000000 13.546483 1.082145 0.356616 5.627152 0.093458 93.922774 0.115854 0.016337 0.954255 5.592720 0.127890 93.325135 0.062802 0.022356
1 0.985600 5 556568.000000 20985.400000 41554.600000 3.770501 7.466222 7066.800000 0.000000 0.000000 0.000000 1065.200000 6973.400000 95.600000 108.600000 0.000000 954.400000 0.276261 0.000000 0.000000 0.000000 0.276261 0.371832 0.084573 0.066564 0.000000 0.403150 86.901131 0.000000 0.000000 0.000000 13.098869 0.000000 85.752582 1.175603 1.335465 0.000000 11.736350 2.796360 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.951795 3.876045 1.844565 93.327595 0.492126 0.322442
2 0.985600 5 556568.000000 12589.600000 14185.800000 2.262006 2.548799 6981.000000 0.000000 0.000000 0.000000 1151.000000 6986.600000 140.800000 184.000000 0.000000 820.600000 0.350629 0.000000 0.000000 0.000000 0.350629 0.291340 0.108047 0.102147 0.000000 0.274916 85.846040 0.000000 0.000000 0.000000 14.153960 0.000000 85.914904 1.731431 2.262666 0.000000 10.090999 4.272012 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.762420 2.211018 3.509592 93.516970 0.697118 0.613500
3 0.985600 5 556568.000000 8924.600000 8917.600000 1.603506 1.602248 6909.800000 0.000000 0.000000 0.000000 1222.200000 6957.800000 151.000000 175.400000 0.000000 847.800000 0.399930 0.000000 0.000000 0.000000 0.399930 0.284752 0.147821 0.074089 0.000000 0.243066 84.970487 0.000000 0.000000 0.000000 15.029513 0.000000 85.560748 1.856862 2.156911 0.000000 10.425480 4.294147 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.688637 2.115101 3.605509 93.590753 0.723593 0.630267
4 0.985600 5 556568.000000 6978.800000 6777.000000 1.253899 1.217641 6859.600000 0.000000 0.000000 0.000000 1272.400000 6928.000000 157.800000 172.400000 0.000000 873.800000 0.294951 0.000000 0.000000 0.000000 0.294951 0.305948 0.129090 0.107062 0.000000 0.244462 84.353173 0.000000 0.000000 0.000000 15.646827 0.000000 85.194294 1.940482 2.120020 0.000000 10.745204 4.340876 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.696016 2.075750 3.644860 93.583374 0.723633 0.637145
5 0.985600 5 556568.000000 5790.000000 5582.600000 1.040304 1.003040 6802.000000 0.000000 0.000000 0.000000 1330.000000 6904.800000 160.800000 163.200000 0.000000 903.200000 0.253063 0.000000 0.000000 0.000000 0.253063 0.281413 0.155401 0.114831 0.000000 0.207854 83.644860 0.000000 0.000000 0.000000 16.355140 0.000000 84.909001 1.977373 2.006886 0.000000 11.106739 4.294147 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 0.664043 2.090507 3.630103 93.615347 0.732143 0.634566
cutoff 0 Affix a 0.581590 b 0.810821: N(rules)= 1.788881*N(trainpairs)^0.810821
Suffix a -0.054097 b 0.862983: N(rules)= 0.947341*N(trainpairs)^0.862983
cutoff 1 Affix a -0.024716 b 0.808521: N(rules)= 0.975587*N(trainpairs)^0.808521
Suffix a -1.391083 b 0.856475: N(rules)= 0.248806*N(trainpairs)^0.856475
cutoff 2 Affix a -0.755380 b 0.779266: N(rules)= 0.469832*N(trainpairs)^0.779266
Suffix a -1.801956 b 0.843674: N(rules)= 0.164976*N(trainpairs)^0.843674
cutoff 3 Affix a -0.965364 b 0.760000: N(rules)= 0.380844*N(trainpairs)^0.760000
Suffix a -1.870778 b 0.819764: N(rules)= 0.154004*N(trainpairs)^0.819764
cutoff 4 Affix a -1.117902 b 0.750784: N(rules)= 0.326965*N(trainpairs)^0.750784
Suffix a -1.821582 b 0.794623: N(rules)= 0.161770*N(trainpairs)^0.794623
cutoff 5 Affix a -1.252385 b 0.745853: N(rules)= 0.285822*N(trainpairs)^0.745853
Suffix a -1.762567 b 0.774137: N(rules)= 0.171604*N(trainpairs)^0.774137
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 2
fraction 9856.000000
iterations 5
trainlines 556568.000000
rules 14185.800000 ( 12589.600000)
rules% 2.548799 ( 2.262006)
same%stdev 0.291340
ambi1%stdev 0.108047
ambi2%stdev 0.102147
ambi3%stdev 0.000000
diff%stdev 0.274916
same% 85.914904 ( 85.846040)
ambi1% 1.731431 ( 0.000000)
ambi2% 2.262666 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 10.090999 ( 14.153960)
amb.rules% 4.272012 ( 0.000000)
false_amb% 0.762420 ( 0.000000)
false_not_amb% 2.211018 ( 5.720610)
true_amb% 3.509592 ( 0.000000)
true_not_amb% 93.516970 ( 94.279390)
precision 0.697118 ( 0.000000)
recall 0.613500 ( 0.000000)
bests[5].suffixonly == [false]
bests[5].langbase == [elC1]
comp = comp_parms0_off
bests[5].rows == [1]
R->R W->R R->W W->W R->N/A W->N/A
0.208419 0.719204 -0.602475 0.220196 0.054780 0.157617
*/
//iteration:20.1
/*weight ( used): 1.40401105178530815e+04 suffix only: no */
/* number of nodes: 72056, nodes/line: 1.27600495838498307e-01 weight ( used): 1.40401105178530815e+04 blobs 1 lines 564700 * fraction 1.00000000000000000e+00 = 564700 lines*/
{{
2.08418641675330557e-01, 7.19204329987296109e-01, -6.02475173735412373e-01, 2.20196492945528233e-01, 5.47799882952482026e-02, 1.57617016654553105e-01
}}
};
static bestParms best_elC2 = // Greek
{
false,
"elC2",
1,
/*
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 5 556568.000000 91660.600000 76515.400000 16.468895 13.747718 6965.200000 17.600000 11.000000 0.200000 1138.000000 6840.800000 104.200000 61.600000 0.000000 1125.400000 0.210385 0.033225 0.035852 0.005499 0.228739 0.411576 0.061115 0.108814 0.000000 0.398054 85.651746 0.216429 0.135268 0.002459 13.994097 0.450074 84.121987 1.281358 0.757501 0.000000 13.839154 2.528283 0.356616 5.627152 0.093458 93.922774 0.115854 0.016337 2.400394 5.592720 0.127890 91.878997 0.025948 0.022356
1 0.985600 5 556568.000000 20985.400000 43830.800000 3.770501 7.875192 7066.800000 0.000000 0.000000 0.000000 1065.200000 6864.600000 158.400000 145.000000 0.000000 964.000000 0.276261 0.000000 0.000000 0.000000 0.276261 0.562490 0.123768 0.139939 0.000000 0.617860 86.901131 0.000000 0.000000 0.000000 13.098869 0.000000 84.414658 1.947860 1.783079 0.000000 11.854402 4.289228 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 2.365962 3.797344 1.923266 91.913428 0.288987 0.336199
2 0.985600 5 556568.000000 12589.600000 16235.200000 2.262006 2.917020 6981.000000 0.000000 0.000000 0.000000 1151.000000 6899.600000 180.800000 198.800000 0.000000 852.800000 0.350629 0.000000 0.000000 0.000000 0.350629 0.420769 0.143776 0.088419 0.000000 0.344582 85.846040 0.000000 0.000000 0.000000 14.153960 0.000000 84.845057 2.223315 2.444663 0.000000 10.486965 5.241023 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 2.090507 2.570093 3.150516 92.188883 0.429722 0.550731
3 0.985600 5 556568.000000 8924.600000 10638.200000 1.603506 1.911393 6909.800000 0.000000 0.000000 0.000000 1222.200000 6866.600000 197.400000 194.800000 0.000000 873.200000 0.399930 0.000000 0.000000 0.000000 0.399930 0.430627 0.105998 0.103398 0.000000 0.411117 84.970487 0.000000 0.000000 0.000000 15.029513 0.000000 84.439252 2.427447 2.395475 0.000000 10.737826 5.373832 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 2.090507 2.437285 3.283325 92.188883 0.439868 0.573947
4 0.985600 5 556568.000000 6978.800000 8064.000000 1.253899 1.448880 6859.600000 0.000000 0.000000 0.000000 1272.400000 6839.000000 200.400000 188.800000 0.000000 903.800000 0.294951 0.000000 0.000000 0.000000 0.294951 0.379720 0.161416 0.069236 0.000000 0.288051 84.353173 0.000000 0.000000 0.000000 15.646827 0.000000 84.099852 2.464338 2.321692 0.000000 11.114117 5.371372 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 2.058534 2.407772 3.312838 92.220856 0.445879 0.579106
5 0.985600 5 556568.000000 5790.000000 6704.400000 1.040304 1.204597 6802.000000 0.000000 0.000000 0.000000 1330.000000 6807.000000 202.600000 184.600000 0.000000 937.800000 0.253063 0.000000 0.000000 0.000000 0.253063 0.315079 0.154229 0.063064 0.000000 0.202324 83.644860 0.000000 0.000000 0.000000 16.355140 0.000000 83.706345 2.491392 2.270044 0.000000 11.532218 5.388588 0.000000 5.720610 0.000000 94.279390 0.000000 0.000000 2.031481 2.363502 3.357108 92.247909 0.452436 0.586844
cutoff 0 Affix a 0.721174 b 0.797865: N(rules)= 2.056846*N(trainpairs)^0.797865
Suffix a -0.054097 b 0.862983: N(rules)= 0.947341*N(trainpairs)^0.862983
cutoff 1 Affix a 0.108771 b 0.802291: N(rules)= 1.114907*N(trainpairs)^0.802291
Suffix a -1.391083 b 0.856475: N(rules)= 0.248806*N(trainpairs)^0.856475
cutoff 2 Affix a -0.518561 b 0.771959: N(rules)= 0.595377*N(trainpairs)^0.771959
Suffix a -1.801956 b 0.843674: N(rules)= 0.164976*N(trainpairs)^0.843674
cutoff 3 Affix a -0.784815 b 0.760477: N(rules)= 0.456204*N(trainpairs)^0.760477
Suffix a -1.870778 b 0.819764: N(rules)= 0.154004*N(trainpairs)^0.819764
cutoff 4 Affix a -0.999690 b 0.756766: N(rules)= 0.367993*N(trainpairs)^0.756766
Suffix a -1.821582 b 0.794623: N(rules)= 0.161770*N(trainpairs)^0.794623
cutoff 5 Affix a -1.143049 b 0.753603: N(rules)= 0.318845*N(trainpairs)^0.753603
Suffix a -1.762567 b 0.774137: N(rules)= 0.171604*N(trainpairs)^0.774137
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 2
fraction 9856.000000
iterations 5
trainlines 556568.000000
rules 16235.200000 ( 12589.600000)
rules% 2.917020 ( 2.262006)
same%stdev 0.420769
ambi1%stdev 0.143776
ambi2%stdev 0.088419
ambi3%stdev 0.000000
diff%stdev 0.344582
same% 84.845057 ( 85.846040)
ambi1% 2.223315 ( 0.000000)
ambi2% 2.444663 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 10.486965 ( 14.153960)
amb.rules% 5.241023 ( 0.000000)
false_amb% 2.090507 ( 0.000000)
false_not_amb% 2.570093 ( 5.720610)
true_amb% 3.150516 ( 0.000000)
true_not_amb% 92.188883 ( 94.279390)
precision 0.429722 ( 0.000000)
recall 0.550731 ( 0.000000)
bests[6].suffixonly == [false]
bests[6].langbase == [elC2]
comp = comp_parms0_off
bests[6].rows == [1]
R->R W->R R->W W->W R->NA W->NA
0.081085 0.737793 -0.622803 0.225382 0.019282 0.100173
*/
//iteration:20.4
/*weight ( used): 3.47608497189166446e+04 suffix only: no */
/* number of nodes: 77922, nodes/line: 1.37988312378253930e-01 weight ( used): 3.47608497189166446e+04 blobs 1 lines 564700 * fraction 1.00000000000000000e+00 = 564700 lines*/
{{
8.10850759088081185e-02, 7.37792532174295523e-01, -6.22803219705343025e-01, 2.25381946520170823e-01, 1.92821878529992295e-02, 1.00173423979330933e-01
}}
};
static struct bestParms bests[] =
{
best_da3
,best_daC0
,best_daC1
,best_deC0
,best_deC1
,best_elC0
,best_elC1
,best_elC2
,best_en4
,best_en4_suffix
,best_en4W
,best_en6W
,best_en6WS1
,best_en6WS2
,best_en6WS3
,best_enS2
,best_enC0
,best_isC0
,best_nlC0
,best_nlC1
,best_nlC2
,best_is
,best_is_suffix
};
static struct rotation best =
/* R_R W_R R_W W_W R_NA W_NA */
{{ 0.0, 3.0, -3.0, 1.0, 0.0, 0.0 }};
//iteration:19.63
/* 387 381.726058 */
/*
0.000000, 10.000000, -10.000000, 2.000000,
-13.581532, -5.623026, -4.283725, 6.696503,
-2.889162, 1.349916, 0.464059, -4.429286,
-0.305579, 0.715326, 0.815889, 0.502817
*/
//iteration:19.63
/* 391 387.069724 */
/*
0.000000, 6.000000, -6.000000, 2.000000,
-6.337474, 0.399136, -1.608900, -6.024108,
-3.947893, 0.883151, 2.100111, 3.650880,
0.463499, 1.732265, 1.476537, -0.767184
*/
// Orthogonalised:
//iteration:19.39
/* number of nodes: 386, nodes/line: 0.387550 weight: 381.768147 blobs 1 lines 1254 * fraction 0.794328 = 996 lines*/
/*
{ // # decisions
0.000000, 4.100000, -6.000000, 1.000000, //9886
-4.167700, 0.043205, -0.140436, -1.019756, //784
0.040775, 0.183468, 0.096672, -0.172187, //17
0.001182, -0.003050, -0.002846, -0.004568 //0
} //(0 unresolved comparisons)
*/
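// Element-wise vector addition over the first 'cols' components: dest += term.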
static void plus(double * dest, double * term,int cols)
{
for(int col = 0;col < cols;++col)
dest[col] += term[col];
}
static int improvements = 0;
static double previous[6] = {0,0,0,0,0,0};
void betterfound(int Nnodes,double weight,int swath,int iterations,int blobs,int lines,double fraction,int fraclines,bool improvement,optionStruct * options)
{
if(improvement)
{
++improvements;
FILE * f = fopen(options->currentParms(),"a");
++openfiles;
assert(f);
fprintf(f,"//-> IMPROVEMENT #%d\n",improvements);
--openfiles;
fclose(f);
}
else
{/*
times(previous,-1.0);
plus(parms.Matrix,previous,parms.ROWPARMS);
normalise(parms.Matrix);*/
}
best = parms;
parms.better(options);
options->setSwath(swath);
options->setSwathIteration(iterations);
options->setNumberOfNodes(Nnodes);
options->setTrainingPairsLines(lines);
options->setWeight(weight);
options->printArgFile();
printf("%d.%d %d \tparms ",swath,iterations,Nnodes);
int i = 0;
for(;i < NPARMS;++i)
{
printf("%7.6f",parms.Matrix[i]);
if(((i+1) % parms.ROWPARMS) == 0)
{
if(i == NPARMS - 1)
printf("\n");
else
printf(";\n");
}
else
printf(";");
}
FILE * f = fopen(options->bestParms(),"a");
++openfiles;
if(f)
{
fprintf(f,"//iteration:%d.%d\n",swath,iterations);
fprintf(f
,"/*weight (%s used): %.*e suffix only: %s */\n"
, (options->getWeightFunction() == esupport || options->getWeightFunction() == eentropy) ? "" : "not "
,DBL_DIG+2
,weight
,options->suffixOnly() ? "yes" : "no"
);
fprintf(f
,"/* number of nodes: %d, nodes/line: %.*e weight (%s used): %.*e blobs %d lines %d * fraction %.*e = %d lines*/\n"
,Nnodes
,DBL_DIG+2,(double)Nnodes/(double)fraclines
, (options->getWeightFunction() == esupport || options->getWeightFunction() == eentropy) ? "" : "not "
,DBL_DIG+2,weight
,blobs
,lines
,DBL_DIG+2,fraction
,fraclines
);
fprintf(f," {{\n ");
int i = 0;
for(;i < NPARMS;++i)
{
fprintf(f,"%.*e", DBL_DIG+2,parms.Matrix[i]);
if(((i+1) % parms.ROWPARMS) == 0)
{
if(i == NPARMS - 1)
fprintf(f,"\n ");
else
fprintf(f,",\n ");
}
else
fprintf(f,",\t");
}
fprintf(f,"}}\n\n");
--openfiles;
fclose(f);
}
}
void worsefound()
{
parms = best;
}
void copybest()
{
parms = best; // go on with best result so far.
}
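/*
integral() below evaluates I(x,n) = the integral of sin^n(t) dt from 0 to x
with the standard reduction formula
    I(x,n) = -(1/n)*sin^(n-1)(x)*cos(x) + ((n-1)/n)*I(x,n-2)
bottoming out at I(x,1) = 1 - cos(x) and I(x,0) = x. For n = dimensions - 2
it is proportional to the surface area of a spherical cap with polar angle x
on the unit sphere in 'dimensions'-dimensional space, which is how
expectationValueA() uses it.
*/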
static double integral(double x,int n)
{
if(n > 1)
return (-1.0/(double)n) * pow(sin(x),n-1) * cos(x) + ((n-1)/(double)n) * integral(x,n-2);
else if(n == 1)
return 1 - cos(x);
else
return x;
}
static double maxintegral;
static double minintegral;
#define pi 3.14159265358979323846
void setMinMaxIntegral(int dimensions)
{
assert(dimensions >= 2); // dimension of embedding space. So dimensions == 2 is for a circle.
maxintegral = integral(pi,dimensions - 2);
minintegral = integral(0.0,dimensions - 2);
}
static double expectationValueA(double angle,int dimensions)
{
double ret;
if(dimensions == 3)
ret = cos(angle);
else if(dimensions == 2)
{
ret = 1.0 - (angle / (pi*0.5));
}
else
{
double inp = integral(angle,dimensions - 2);
ret = 1.0 - 2.0*(inp - minintegral)/(maxintegral - minintegral);
}
return ret;
}
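/*
angle() inverts expectationValueA() by bisection. The inversion is well
defined because the expectation value falls monotonically from +1 at angle 0
to -1 at angle pi (the integral of the non-negative sin^n grows with the
angle). A hypothetical call, for three dimensions:
    setMinMaxIntegral(3);
    double a = angle(0.5, 3); // a is close to pi/3, since cos(pi/3) = 0.5
*/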
static double angle(double expectationvalue,int dim)
{
double mina = 0.0;
double maxa = pi;
double mine = expectationValueA(mina,dim);
double maxe = expectationValueA(maxa,dim);
if(expectationvalue == mine)
return mina;
else if(expectationvalue == maxe)
return maxa;
else
{
double h = 0.0;
int i;
for(i = 0;i < 100 && maxa - mina > 0.0000000001;++i)
{
h = (mina+maxa)/2.0;
double e = expectationValueA(h,dim);
if(e > expectationvalue)
{
mine = e;
mina = h;
}
else if(e < expectationvalue)
{
maxe = e;
maxa = h;
}
else
break;
}
return h;
}
}
void testAngle()
{
for(int d = 2;d <= 4;++d)
{
setMinMaxIntegral(d);
for(double z = 1.0;z >= -1.0;z -= 0.25)
{
double ang = angle(z,d);
printf("%d %2.3f -> %2.3f: %2.2f\n",d,z,cos(ang),180.0/pi*ang);
}
}
getchar();
}
static struct rotation goodParms[] =
{
{{ 0.0238310217,-0.6213812731, 0.7764218951, 0.0162447104, 0.0240052761, 0.0982155421},6,"bg","S", 9419,1,0.19208886,false},
{{ 0.1021732216,-0.5298420485, 0.8355539585, 0.0025004658, 0.1024159319, 0.0135034591},6,"bg","D", 9556,1,0.19515940,false},
{{ 0.013192 ,-0.681737 , 0.722663 ,-0.108477 , 0.015373 , 0.028548 },6,"bg","C", 9407,1,0.19651404,false},
{{-0.029092 ,-0.729701 , 0.679400 ,-0.062773 ,-0.005901 , 0.033617 },6,"bg","E", 9614,1,0.19904272,false},
{{ 0.1930245535,-0.4821981014, 0.8318583033, 0.0216216036, 0.1930529303, 0.0223908087},6,"cs","D", 11112,1,0.17103465,false},
{{ 0.0251096524,-0.6115198550, 0.7900264600,-0.0240712845, 0.0258880462,-0.0046418833},6,"cs","S", 10956,1,0.17227674,false},
{{ 0.014974 ,-0.603475 , 0.774494 , 0.070238 , 0.018401 , 0.174589 },6,"cs","C", 10825,1,0.17314619,false},
{{ 0.119333 ,-0.433123 , 0.815527 , 0.168871 , 0.139466 , 0.291738 },6,"cs","E", 11024,1,0.17314619,false},
{{-0.002807 ,-0.748938 , 0.628884 ,-0.204355 ,-0.000636 , 0.042753 },6,"da","C", 53832,1,0.07773620,false},
{{-0.0010481855,-0.7067872619, 0.6965400975,-0.1214246708, 0.0000826859, 0.0232078163},6,"da","S", 53558,1,0.07834370,false},
{{-0.0004270133,-0.7221318990, 0.6817640089,-0.1065134775,-0.0004766746, 0.0487629587},6,"da","D", 53689,1,0.07890260,false},
{{-0.010943 ,-0.756421 , 0.639771 ,-0.106260 ,-0.004888 , 0.084177 },6,"da","E", 54781,1,0.07921851,false},
{{ 1.98752607997805752e-01, -4.70117267694809882e-01, 8.36381371790766326e-01, 1.49014833998354877e-02, 1.98759017297805707e-01, 1.50384568998339751e-02},6,"dagammel","C", 54781,1,0.07921851,false}, // last numbers are copied from line above, ignore them.
{{ 0.072141 ,-0.562967 , 0.815173 ,-0.043071 , 0.073552 , 0.078053 },6,"de","C", 33033,0,0.08559601,false},
{{ 0.0172003300,-0.6998264583, 0.7114994769,-0.0376781049, 0.0169872277, 0.0448036365},6,"de","S", 32607,0,0.08580908,false},
{{-0.006618 ,-0.584949 , 0.803968 ,-0.026080 , 0.000991 , 0.103662 },6,"de","E", 33638,0,0.08666139,false},
{{ 0.0223945186,-0.5571704335, 0.8262356319, 0.0296707768, 0.0226460927, 0.0707182696},6,"de","D", 33279,0,0.08711798,false},
//{{ 0.005600 ,-0.438006 , 0.891624 , 0.072352 , 0.006273 , 0.088605 },6}, // Greek (-XC)
{{ 0.0446935086,-0.3023603714, 0.8712477459, 0.2557936479, 0.0447718118, 0.2829720055},6,"el","S", 61986,1,0.09432719,false},
{{-0.0029958277,-0.4289888183, 0.8632170809, 0.1740615451,-0.0002318057, 0.2012918646},6,"el","E", 62501,1,0.09432719,false},
{{ 0.039035 ,-0.441248 , 0.889026 , 0.066674 , 0.039309 , 0.086126 },6,"el","C", 59706,1,0.09850769,false},
{{-0.0227055905,-0.6269595867, 0.7208545657, 0.0364387780,-0.0049863206, 0.2922707307},6,"en","E", 11283,1,0.12178388,false},
{{ 0.0132135437,-0.6752102320, 0.7316886509,-0.0246358080, 0.0132845242, 0.0881184515},6,"en","S", 11163,2,0.12191093,false},
{{ 0.1186315828,-0.5514749040, 0.7814237869, 0.1016008409, 0.1187735190, 0.2162150858},6,"en","C", 11149,2,0.12267327,false},
{{ 0.0295695581,-0.6858376245, 0.7258637975, 0.0089433217, 0.0297721711, 0.0301293284},6,"en","D", 11201,2,0.12318150,false},
{{-0.0005687963,-0.6946223877, 0.7166924688,-0.0549098036,-0.0003540120, 0.0289155389},6,"es","C", 18456,1,0.09748033,false},
{{ 0.0021852525,-0.6800784215, 0.7302859021, 0.0373439567, 0.0021819120, 0.0526472654},6,"et","C",326847,1,0.07609883,false}, // big dataset
{{-0.0146161627,-0.7565549227, 0.6398755896,-0.1317946093,-0.0146880782, 0.0196134623},6,"et","C", 21610,0,0.16883198,true},
{{ 0.012820 ,-0.706711 , 0.698674 ,-0.034104 , 0.011938 , 0.104611 },6,"et","C", 20219,0,0.18455896,false},
{{ 0.0043017280,-0.6706417662, 0.7359937667, 0.0171824972, 0.0049466980, 0.0906343418},6,"et","S", 20317,0,0.18636166,false},
{{-0.033463 ,-0.750747 , 0.652380 ,-0.072970 ,-0.006969 , 0.065474 },6,"et","E", 20568,0,0.18648598,false},
{{ 0.2078542471,-0.4288771669, 0.8345478643, 0.1270351602, 0.2079485197, 0.1304236601},6,"et","D", 20790,0,0.19145894,false},
{{ 0.0476291369,-0.6595158821, 0.7455334457, 0.0309211596, 0.0478341347, 0.0608763121},6,"fa","S", 1963,3,0.15233817,false},
{{ 0.0023118581,-0.6790274640, 0.7275921544, 0.0149554752, 0.0028828714, 0.0964055916},6,"fa","C", 1971,3,0.15376761,false},
{{-0.001161 ,-0.739382 , 0.661059 ,-0.127638 ,-0.000914 ,-0.004777 },6,"fr","C", 32620,0,0.09453368,false},
{{ 0.051583 ,-0.564670 , 0.769960 , 0.131552 , 0.059553 , 0.254550 },6,"fr","E", 33159,0,0.09512586,false},
{{ 0.0661175662,-0.5948519753, 0.7849882938, 0.0502248828, 0.0659135351, 0.1367692967},6,"fr","S", 33085,1,0.09567646,false},
{{ 0.2322447589,-0.4074394521, 0.8454759395, 0.0747928330, 0.2322360204, 0.0754875058},6,"fr","D", 34221,1,0.09875563,false},
{{-0.032724 ,-0.547237 , 0.728254 , 0.212133 ,-0.015476 , 0.351947 },6,"hu","E", 9096,0,0.11809593,false},
{{ 0.0334409657,-0.6849089134, 0.7240905990,-0.0096012049, 0.0336626711, 0.0651838026},6,"hu","S", 8757,0,0.11904070,false},
{{ 0.042048 ,-0.682636 , 0.724419 ,-0.001560 , 0.043343 , 0.074674 },6,"hu","C", 8794,0,0.12013081,false},
{{ 0.175030 ,-0.437740 , 0.851017 , 0.105433 , 0.175194 , 0.108212 },6,"hu","D", 9004,1,0.12165698,false},
{{ 0.0013657790,-0.6741470922, 0.7357042565,-0.0119229210, 0.0013559218, 0.0641800992},6,"is","C",287331,1,0.08705487,false},
{{ 0.0006500449,-0.7124017300, 0.6974840179,-0.0751288207, 0.0006206730, 0.0188327467},6,"it","C", 42595,1,0.08523434,false},
{{ 0.0418611016,-0.6396477087, 0.7639149581,-0.0502833369, 0.0418180244, 0.0354292608},6,"it","S", 38648,1,0.09202083,false},
{{-0.0100664067,-0.7168609538, 0.6834200900,-0.1366354183,-0.0159555050, 0.0047122527},6,"la","C", 8870,1,0.43405365,true},
{{ 0.0342099708,-0.6801916827, 0.7312369223,-0.0131917562, 0.0348503862, 0.0085407472},6,"la","C",234931,1,0.16238077,false}, // big dataset, v = u
{{-0.0197892011,-0.7029359285, 0.7067127008,-0.0682898054,-0.0127179756, 0.0349482767},6,"la","E", 7774,1,0.42846498,false}, // cutoff == 3 is probably to be preferred
{{-0.0012861551,-0.6999562655, 0.7108840224,-0.0674097817,-0.0007561510, 0.0126027136},6,"la","C", 7750,1,0.43107303,false}, // cutoff == 3 is probably to be preferred
{{ 0.1987526080,-0.4701172677, 0.8363813718, 0.0149014834, 0.1987590173, 0.0150384569},6,"la","D", 7949,1,0.43181818,false}, // cutoff == 3 is probably to be preferred
{{ 0.2280389808,-0.4079373954, 0.8323171192, 0.1346465385, 0.2281451027, 0.1365783272},6,"la","S", 7938,2,0.43268753,false},
{{ 0.0057774324,-0.6732268078, 0.7362387698, 0.0273332853, 0.0059725934, 0.0624658307},6,"mk","S",107080,1,0.07884323,false},
{{ 0.0537713922,-0.6225543318, 0.7783935741,-0.0252297864, 0.0537369935, 0.0106760683},6,"mk","C",107643,1,0.07963091,false},
// {{ 0.139452 ,-0.511179 , 0.830374 , 0.027272 , 0.139112 , 0.098139 },6}, // Dutch (-XC) (better)
{{ 0.024488 ,-0.495293 , 0.826486 , 0.149246 , 0.024359 , 0.219404 },6,"nl","D", 40924,1,0.10994085,false},
{{ 0.1069193225,-0.6193738021, 0.7649522686,-0.0790680200, 0.1081691280, 0.0428959952},6,"nl","S", 40674,1,0.11003574,false},
{{ 0.016243 ,-0.614317 , 0.760298 , 0.018110 , 0.021227 , 0.208614 },6,"nl","C", 41089,1,0.11060505,false},
{{-0.011390 ,-0.594109 , 0.774997 , 0.030348 ,-0.001950 , 0.212978 },6,"nl","E", 41185,1,0.11088971,false},
{{ 0.0003446314,-0.7182364582, 0.6783115447,-0.1550148018, 0.0003337550, 0.0001398656},6,"no","C", 74713,1,0.14553599,false},
{{ 0.0016262062,-0.7090379332, 0.7016664117,-0.0701639737, 0.0016556865, 0.0010425635},6,"pl","C",302734,0,0.07316330,false},
{{ 0.1326332335,-0.6279433806, 0.7548159800,-0.0161485532, 0.1326242114, 0.0223252152},6,"pt","S", 70558,1,0.06575459,false},
{{-0.0009255873,-0.6707122696, 0.7325781059,-0.0819672647,-0.0005595239, 0.0821861781},6,"pt","C", 70642,1,0.06617960,false},
{{-0.021046 ,-0.659149 , 0.741112 ,-0.011371 ,-0.005483 , 0.125192 },6,"ro","E", 50221,1,0.10066098,false},
{{ 0.2212514567,-0.4178187162, 0.8504829526, 0.0449211169, 0.2213110412, 0.0464497255},6,"ro","D", 47878,1,0.11131702,false},
{{ 0.129777 ,-0.588471 , 0.779661 , 0.038950 , 0.129759 , 0.103123 },6,"ro","C", 46469,1,0.11261428,false},
{{-0.0016509028,-0.7015134891, 0.7114569708,-0.0355891928,-0.0009517376, 0.0209184461},6,"ro","S", 46385,1,0.11366444,false},
{{ 0.0081560789,-0.6936789856, 0.7188250225,-0.0388209278, 0.0083157831, 0.0213848919},6,"ru","C",179299,1,0.10137872,false},
{{-0.0006731287,-0.7490133747, 0.6458430451,-0.1476802380, 0.0000640001, 0.0074709968},6,"se","C", 40582,2,0.07582600,false},
{{ 0.021767 ,-0.480966 , 0.778614 , 0.266345 , 0.021757 , 0.300908 },6,"sk","C", 70835,0,0.06273769,false},
{{ 0.004002 ,-0.540399 , 0.809001 , 0.131287 , 0.005075 , 0.190287 },6,"sk","E", 74039,1,0.06525689,false},
{{ 0.0179669122,-0.6965438108, 0.7157928227,-0.0093251017, 0.0180222526, 0.0416267408},6,"sk","S", 72207,1,0.06571712,false},
{{ 0.016221 ,-0.408998 , 0.765411 , 0.330531 , 0.016203 , 0.370256 },6,"sl","C", 30333,0,0.13036071,false},
{{ 0.0603373290,-0.6840087393, 0.7239138311,-0.0277666783, 0.0602158002,-0.0065791674},6,"sl","D", 30326,1,0.13124954,false},
{{ 0.0242105152,-0.6728127440, 0.7368843437, 0.0312092389, 0.0243645571, 0.0465905375},6,"sl","S", 30425,0,0.13132361,false},
{{-0.005696 ,-0.563334 , 0.773168 , 0.190243 , 0.002000 , 0.220540 },6,"sl","E", 31467,1,0.13487890,false},
{{ 0.0603758783,-0.6567697052, 0.7464491599, 0.0072245730, 0.0603574155, 0.0642397641},6,"sr","C", 29824,1,0.14598299,false},
{{ 0.0146026555,-0.4844863176, 0.8364239989, 0.1629106060, 0.0150191535, 0.1966957652},6,"sr","S", 28775,1,0.15225828,false},
{{ 0.008000 ,-0.238492 , 0.876932 , 0.278726 , 0.008168 , 0.310311 },6,"uk","C", 32925,1,0.14237366,false},
{{ 0.0245488840,-0.0909029197, 0.9162818920, 0.2706077141, 0.0246089256, 0.2787962907},6,"uk","S", 33120,1,0.14241939,false},
{{ 0.038750 ,-0.441348 , 0.888120 , 0.065188 , 0.043326 , 0.093945 },6,"uk","E", 33696,1,0.14296821,false},
};
static size_t goodParmsIndex = 0;
static double InitialDelta = 0.5;
void brown()
{
static int it = 0;
if(it++ < 2)
{
normalise(parms.Matrix);
return;
}
if(goodParmsIndex < sizeof(goodParms)/sizeof(goodParms[0]))
{
parms = goodParms[goodParmsIndex++];
normalise(parms.Matrix);
return;
}
double vector[6];
size_t i;
double tangens = InitialDelta*pow(0.995,it);
/*
Compute a randomly directed vector in 6 dimensional space. Independent
Gaussian components yield a direction that is uniformly distributed over
the unit sphere once the vector is normalised.
*/
#if ZIGGURAT
for(i = 0;i < sizeof(vector)/sizeof(vector[0]);++i)
{
vector[i] = RNOR; // Use Ziggurat algorithm (rnorrexp.c)
}
#else
double radius2 = 0.0;
do
{
int i;
radius2 = 0.0;
for(i = 0;i < parms.ROWPARMS;++i)
{
vector[i] = rand() - (RAND_MAX/2);
radius2 += vector[i]*vector[i];
}
}
while(radius2 > ((double)RAND_MAX/2.0)*((double)RAND_MAX/2.0));
#endif
normalise(vector);
double inproduct = inner(parms.Matrix,vector); // Only first row. Ignore the rest.
struct rotation diff = parms;
times(diff.Matrix,-inproduct);
plus(vector,diff.Matrix,parms.ROWPARMS);
normalise(vector);
/*
We require that the delta is not only perpendicular to the currently best
vector, but also to the previous delta. This could be extended to even
earlier deltas, up to three. (Though there are six values to define,
there are only five degrees of freedom, because we look at a vector
sweeping the unit sphere in six-dimensional space. So we can require the
delta to be perpendicular to the current vector and to three previous
deltas and still have one degree of freedom.)
*/
inproduct = inner(previous,vector);
times(previous,-inproduct);
plus(vector,previous,parms.ROWPARMS);
normalise(vector);
times(vector,tangens);
for(i = 0;i < sizeof(vector)/sizeof(vector[0]);++i)
{
previous[i] = vector[i];
}
plus(parms.Matrix,vector,parms.ROWPARMS);
normalise(parms.Matrix);
}
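/*
A minimal sketch of the projection step that brown() applies twice above:
subtracting from v its component along a unit vector u (v -= (v.u)u) leaves
the part of v that is perpendicular to u. The helper name below is
illustrative, not part of this file; it assumes plain 6-element arrays.

    static void makePerpendicular(double v[6], const double u[6])
    {
        double dot = 0.0;
        for (int i = 0; i < 6; ++i)   // dot = v.u
            dot += v[i] * u[i];
        for (int i = 0; i < 6; ++i)   // remove the component parallel to u
            v[i] -= dot * u[i];
    }

Afterwards inner(v,u) is zero up to rounding, so adding a multiple of v to
parms.Matrix and renormalising moves the parameter vector along the unit
sphere instead of merely rescaling it.
*/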
bool init(optionStruct * options)
{
#if ZIGGURAT
zigset(86947731);
#endif
parms.init(options);
double MinMaxInnerProduct = 1.0;
for(size_t i = 0;i < sizeof(goodParms)/sizeof(goodParms[0]);++i)
{
double MaxInnerProduct = -1.0;
for(size_t j = 0;j < sizeof(goodParms)/sizeof(goodParms[0]);++j)
{
if(i != j)
{
double InnerProduct = inner(goodParms[i].Matrix,goodParms[j].Matrix);
if(InnerProduct > MaxInnerProduct) // Find closest neighbour of i
{
MaxInnerProduct = InnerProduct;
}
}
}
if(MaxInnerProduct < MinMaxInnerProduct)
{
MinMaxInnerProduct = MaxInnerProduct;
}
}
/* Take the distance between the most outlying vector and its closest
neighbour as the initial headroom for changing a vector. For unit vectors
with inner product p, sqrt(1 - p*p) is the sine of the angle between them,
which approximates their distance when the angle is small. */
InitialDelta = sqrt(1.0 - MinMaxInnerProduct * MinMaxInnerProduct);
/*
printf("InitialDelta %f outlier %d closest to %d\n",InitialDelta,outlier,neighbour);
printf("Furthest distance %f between %d and %d\n",sqrt(1.0 - MinInnerProduct * MinInnerProduct),furthest1,furthest2);
*/
return true;
}
void printparms(int Nnodes,double weight,optionStruct * options)
{
int i;
FILE * f = fopen(options->currentParms(),"a");
++openfiles;
assert(f);
fprintf(f
,"/*#nodes in tree: %d weight (%s used): %.*e , Suffix only: %s */\n"
,Nnodes
, ( options->getWeightFunction() == esupport
|| options->getWeightFunction() == eentropy
) ? "more support is better"
: options->getWeightFunction() == edepth ? "fewer non-wildcard characters is better"
: options->getWeightFunction() == esize ? "fewer characters is better"
: "not used"
,DBL_DIG+2
,weight
,options->suffixOnly() ? "yes" : "no"
);
fprintf(f," {\n ");
for(i = 0;i < NPARMS;++i)
{
fprintf(f,"%.*e", DBL_DIG+2,parms.Matrix[i]);
if(((i+1) % parms.ROWPARMS) == 0)
{
if(i == NPARMS - 1)
fprintf(f,"\n ");
else
fprintf(f,",\n ");
}
else
fprintf(f,",");
}
fprintf(f,"}\n");
--openfiles;
fclose(f);
}
#define RR 0
#define WR 1
#define RW 2
#define WW 3
#define RN 4
#define WN 5
void computeWeight(vertex * a)
{
double A = parms.Matrix[RR] * a->R__R + parms.Matrix[WR] * a->W__R + parms.Matrix[RW] * a->R__W + parms.Matrix[WW] * a->W__W;
#if _NA
if (parms.ROWPARMS == 6)
{
A += parms.Matrix[RN] * a->R__NA + parms.Matrix[WN] * a->W__NA;
}
#endif
a->wght = A;
}
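/*
Worked example: with the default rotation 'best' defined above,
{{ 0.0, 3.0, -3.0, 1.0, 0.0, 0.0 }}, the weight reduces to
    wght = 3*W__R - 3*R__W + W__W
i.e. a rule is credited for training pairs it newly lemmatises correctly,
debited just as strongly for pairs it breaks, and earns only a small credit
for pairs that remain wrongly lemmatised; R__R and the _NA counts do not
contribute.
*/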
// Notice that we want to sort from low penalty to high penalty, i.e. from
// high weight to low weight: the vertex with the higher weight compares as
// smaller and therefore comes first.
int comp_parms(const vertex * a,const vertex * b)
{
if (a->wght != b->wght)
return a->wght > b->wght ? -1 : 1;
return 0;
}
static int nparms = 0;
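// comp_parms0_off ranks two vertices by the same linear combination of
// R__R, W__R, R__W and W__W (plus the _NA counts when ROWPARMS == 6) that
// computeWeight() assigns, recomputing the score on the fly rather than
// reading the cached wght field; the higher score sorts first.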
static int comp_parms0_off(const vertex * a,const vertex * b)
{
double A = parms.Matrix[RR]*a->R__R + parms.Matrix[WR]*a->W__R + parms.Matrix[RW]*a->R__W + parms.Matrix[WW]*a->W__W;
double B = parms.Matrix[RR]*b->R__R + parms.Matrix[WR]*b->W__R + parms.Matrix[RW]*b->R__W + parms.Matrix[WW]*b->W__W;
#if _NA
if(parms.ROWPARMS == 6)
{
A += parms.Matrix[RN]*a->R__NA + parms.Matrix[WN]*a->W__NA;
B += parms.Matrix[RN]*b->R__NA + parms.Matrix[WN]*b->W__NA;
}
#endif
if(A != B)
{
return A > B ? -1 : 1;
}
return 0;
}
struct funcstruct
{
const char * number;
const char * name;
int (*comp)(const vertex * a,const vertex * b);
};
static struct funcstruct funcstructs[] =
{
#if _NA
{"1","fairly_good",comp_fairly_good},
{"2","even_better",comp_even_better},
{"3","affiksFEW3",comp_affiksFEW3},
{"4","affiksFEW",comp_affiksFEW},
{"5","affiksFEW2",comp_affiksFEW2},
{"6","fixNA",comp_fixNA},
{"7","fruit",comp_fruit},
{"8","ice",comp_ice},
{"9","pisang",comp_pisang},
{"10","kiwi",comp_kiwi},
{"11","carrot",comp_carrot},
{"12","peen",comp_peen},
{"13","beet",comp_beet},
{"14","sugar",comp_sugar},
{"15","affiksFEW2org",comp_affiksFEW2org},
#endif
{"16","honey",comp_honey},
{"17","koud",comp_koud},
{"18","parms0",comp_parms0_off},
{"19","parmsoff",comp_parms0_off},
{0,0,0}
};
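/*
setCompetitionFunction() resolves the competition-function option to one of
the comparators above, matching options->compfunc() against either the
numeric code or the name in funcstructs[]. When the chosen comparator is
comp_parms0_off, the language-specific parameter matrix is loaded from
bests[], keyed on the language base name (options->extra() up to the first
underscore) and on whether training is suffix-only.
*/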
void setCompetitionFunction(optionStruct * options)
{
size_t langlength = strlen(options->extra());
const char * underscore = strchr(options->extra(),'_');
if(underscore != NULL)
langlength = (size_t)(underscore - options->extra());
if(options->compfunc())
{
for(int i = 0;funcstructs[i].number;++i)
if(!strcmp(options->compfunc(),funcstructs[i].number) || !strcmp(options->compfunc(),funcstructs[i].name))
{
comp = funcstructs[i].comp;
if(comp == comp_parms0_off)
{
for(unsigned int j = 0;j < sizeof(bests)/sizeof(bests[0]);++j)
{
if( bests[j].suffixonly == options->suffixOnly()
&& (langlength == strlen(bests[j].langbase)) // 20130125
&& !strncmp(bests[j].langbase,options->extra(),strlen(bests[j].langbase))
)
{
printf("bests[%d].suffixonly == [%s] bests[%d].langbase == [%s]\n",j,bests[j].suffixonly ? "true" : "false",j,bests[j].langbase);
printf("comp = comp_parms0_off\n");
comp = comp_parms0_off;
printf("bests[%d].rows == [%d]\n",j,bests[j].rowss);
nparms = bests[j].rowss * parms.ROWPARMS;
if(nparms > NPARMS)
{
fprintf(stderr,"Too many rows of parameters in bestParms struct for %s (%d, max allowed %d)\n",options->extra(),nparms,NPARMS);
exit(-1);
}
parms = bests[j].val;
if(options->currentParms())
{
FILE * f = fopen(options->currentParms(),"w");
++openfiles;
if(f)
{
fprintf(f,"bests[%d].suffixonly == [%s]\nbests[%d].langbase == [%s]\n",j,bests[j].suffixonly ? "true" : "false",j,bests[j].langbase);
fprintf(f,"comp = comp_parms0_off\n");
fprintf(f,"bests[%d].rows == [%d]\n",j,bests[j].rowss);
fprintf(f," R->R W->R R->W W->W\n");
for(int k = 0;k < nparms;++k)
{
if(k % parms.ROWPARMS == 0)
fprintf(f,"\n");
fprintf(f,"%6f ",parms.Matrix[k]);
}
fprintf(f,"\n");
--openfiles;
fclose(f);
}
}
break;
}
}
if(nparms == 0)
{
fprintf(stderr,"No parameters defined for \"%s\"\nChoose one of:\n",options->extra());
for(unsigned int j = 0;j < sizeof(bests)/sizeof(bests[0]);++j)
{
fprintf(stderr,"\t%s %s\n",bests[j].langbase,bests[j].suffixonly ? "suffix":"affix");
}
fprintf(stderr,"Or find optimal parameters for %s and put these in comp.cpp.\n",options->extra());
getchar();
exit(-1);
}
if(options->verbose())
{
printf("comp_parms0_off\n");
}
}
return;
}
}
if(options->numberOfParms() == 4 || options->numberOfParms() == 6)
{
comp = comp_parms0_off;
return;
}
fprintf(stderr,"Error: Unknown competition function %s\n",options->compfunc());
getchar();
exit(2);
}