-
Notifications
You must be signed in to change notification settings - Fork 0
/
comp.cpp
executable file
·3233 lines (2992 loc) · 257 KB
/
comp.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
AFFIXTRAIN - supervised learning of affix rules for CSTLEMMA
Copyright (C) 2012 Center for Sprogteknologi, University of Copenhagen
This file is part of AFFIXTRAIN.
AFFIXTRAIN is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
AFFIXTRAIN is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with AFFIXTRAIN; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "comp.h"
#include "affixtrain.h"
#include "graph.h"
#include "optionaff.h"
#include <float.h>
#define ZIGGURAT 1
#if ZIGGURAT
#include "rnorrexp.c"
#endif
#define NPARMS parms.ROWPARMS
/*
ACL 2009 paper:
Icelandic 71.3 1.5 even_better (71,30 1,51 iflg. D:\dokumenter\tvärsök\even_better\icelandic.xls) peen 71,51 1,65 sugar 70,93 1,86 affiksFEW3 71,02 2,16 no pruning
Danish 92.8 0.2 peen sugar: 92,72 0,19 no pruning
Norwegian 87.6 0.3 affiksFEW2 sugar: 86,67 0,68
Greek 90.4 0.4 sugar no pruning
Slovene 86.7 0.3 affiksFEW3 affiksFEW2: 86,23 0,58 sugar: 86,27 0,41 peen:86,13 0,55 0,4
Swedish 92.3 0.3 sugar pruning 1
German 91.46 0.17 sugar no pruning
English 89.0 1.3 sugar pruning 2
Dutch 90.4 0.5 affiksFEW2 sugar: 90,17 0,31 0,3 no pruning
Polish 93.88 0.08 peen sugar: 93,88 0,08 (?) no pruning
*/
#if _NA
// IMPORTANT (20090511) R__NA and W__NA are not updated as sibling rules are
// added and eat up the training pairs that earlier siblings did not handle.
// This error was detected after having used the weight functions for
// the ACL-paper.
/*
Comparator "fairly_good": ranks candidate rule vertices best-first
(descending) on a lexicographic triple of weight sums.
Returns -1 when a must precede b, 1 when b must precede a, 0 on a tie.
Field naming X__Y: count of training pairs that were X before the rule
(W = wrong, R = right) and Y after it (W, R, or NA = rule not applicable).
Benchmark notes from the original experiments:
  fairly good, Icelandic 71.270883
  AMBI:
  French ok 85.767516 ambi1 1.156051 ambi2 0.955414 diff 12.121019 rules 7337.500000 2.731849% cutoff 2
*/
static int comp_fairly_good(const vertex * a,const vertex * b)
{
    /* Primary key: correctly lemmatised pairs. */
    int keyA = a->W__R + a->R__R;
    int keyB = b->W__R + b->R__R;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Secondary key: wrong->right, wrong->wrong and right->not-applicable. */
    keyA = a->W__R + a->W__W + a->R__NA;
    keyB = b->W__R + b->W__W + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Tertiary key: wrong->right, right->right and right->not-applicable. */
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
/*
Comparator "even_better": ranks candidate rule vertices best-first
(descending) on three lexicographic keys. Returns -1 / 0 / 1 in
qsort-comparator convention, negated so better vertices sort first.
Benchmark notes from the original experiments:
  even better, Icelandic 71.300716
  BEST Icelandic 71.535870 +/- 1.919590 at 0.9856 of dataset, 17 iterations, 23209.882353 = 40.477646% rules, cutoff = 0
  Icelandic 71.283167 +/- 1.714260 at 0.9856 of dataset, 17 iterations, 22719.470588 = 39.622376% rules, cutoff = 0, RECURSE == 4
  AMBI:
  French ok 85.487261 ambi1 1.283439 ambi2 1.050955 diff 12.178344 rules 7360.125000 2.740283% cutoff 2
*/
static int comp_even_better(const vertex * a,const vertex * b)
{
    /* Lexicographic keys, most significant first. */
    int ka[3];
    int kb[3];
    ka[0] = a->W__R + a->R__R;            // wr + rr
    kb[0] = b->W__R + b->R__R;
    ka[1] = a->W__R + a->R__R + a->R__NA; // wr + rr + rn - r = wr - rw
    kb[1] = b->W__R + b->R__R + b->R__NA;
    ka[2] = a->W__R + a->W__W + a->R__NA; // wr + ww + rn - w = -wn + rn
    kb[2] = b->W__R + b->W__W + b->R__NA;
    for(int i = 0; i < 3; ++i)
        {
        if(ka[i] != kb[i])
            return ka[i] > kb[i] ? -1 : 1;
        }
    return 0;
}
#endif
#if _NA
/*
Comparator "affiksFEW3": ranks candidate rule vertices best-first
(descending) on three lexicographic keys; returns -1 / 0 / 1 in
qsort-comparator convention (negated so better vertices sort first).
Field naming X__Y: count of training pairs that were X before the rule
(W = wrong, R = right) and Y after it (W, R, or NA = not applicable).
*/
static int comp_affiksFEW3(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
// Icelandic 65.781623, cutoff 1 (old lemmatizer 73.329356, cutoff 0)
// Icelandic 66.544995 +/- 1.943469 at 0.9856 of dataset, 17 iterations, 11134.176471 = 19.417817% rules, cutoff = 1
// English 87.863636, cutoff 2 (old 87.954545, cutoff 1)
// English 87.806061 +/- 1.009323 at 0.9856 of dataset, 15 iterations, 1619.133333 = 2.152101% rules (cutoff = 2)
// BEST Slovene 86.669776 +/- 0.331106 at 0.9856 of dataset, 9 iterations, 5650.777778 = 2.888237% rules (cutoff = 2)
// Slovene-ambi (4.23%) 83.165661, cutoff 3 (3550 rules!) (old 82.017103, cutoff 1, 9377 rules) Better than _affiksFEW2, 82.780013, 6656 rules.
// Danish 90.942165 +/- 0.589437 at 0.9856 of dataset, 5 iterations, 32327.400000 = 5.925881% rules, cutoff = 1
// German 90.266461 +/- 0.509202 at 0.9856 of dataset, 7 iterations, 21539.428571 = 6.930653% rules, cutoff = 1
// Greek 89.640779 +/- 0.402079 at 0.9856 of dataset, 5 iterations, 13377.200000 = 2.472132% rules, cutoff = 2
// Dutch 87.817059 +/- 0.366236 at 0.9856 of dataset, 7 iterations, 23493.571429 = 7.895486% rules, cutoff = 1
// Norwegian 85.788507 +/- 0.484921 at 0.9856 of dataset, 6 iterations, 14904.000000 = 3.157580% rules, cutoff = 2
// Polish 93.203365 +/- 0.175436 at 0.9856 of dataset, 2 iterations, 50597.500000 = 1.491153% rules, cutoff = 2
// Swedish 91.709042 +/- 0.170094 at 0.9856 of dataset, 6 iterations, 4407.666667 = 0.935737% rules, cutoff = 3
//AMBI:
// French ok 82.754777 ambi1 2.353503 ambi2 1.805732 diff 13.085987 rules 7360.125000 2.740283% cutoff 2
/* Interesting because it generates far less rules than the above
variables, only 20 % more than the old lemmatizer.
Also interesting is that there are not many leaves with only one
supporting training pair.
Yet, the leaves with only one supporter are detrimentous to the overall
result (cutoff has to be 1 or even 2).
*/
/* The #if 1 branch is the active key set; the #else branch is an
   equivalent formulation kept for reference. */
#if 1
int A1 = a->W__R + a->R__R + a->R__NA; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R + b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->W__R + a->W__W + a->R__NA; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->W__R + b->W__W + b->R__NA; // Bad: previously wrong words that didn't match. They must be handled by siblings.
#else
int A1 = a->W__R - a->R__W; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R - b->R__W;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->W__R + a->W__W - a->R__R - a->R__W; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->W__R + b->W__W - b->R__R - b->R__W; // Bad: previously wrong words that didn't match. They must be handled by siblings.
#endif
/*
int A1 = a->W__R - a->R__W; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
int B1 = b->W__R - b->R__W;
int A2 = a->W__R + a->R__R; // Good: any rightly lemmatized word
int B2 = b->W__R + b->R__R;
int A3 = a->R__NA - a->W__NA; // Good: previously right words that didn't match. They may return to the parent.
int B3 = b->R__NA - b->W__NA; // Bad: previously wrong words that didn't match. They must be handled by siblings.
*/
/* Lexicographic comparison, A1/B1 most significant; bigger is better. */
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
/*
Comparator "affiksFEW": ranks candidate rule vertices best-first
(descending). Returns -1 when a must precede b, 1 when b must precede a,
0 on a tie.
Field naming X__Y: count of training pairs that were X before the rule
(W = wrong, R = right) and Y after it (W, R, or NA = not applicable).
The key order depends on how much support the vertex has: with few
supporting pairs (N < 3) the small-sample key order is used, otherwise
the large-sample order.
NOTE(review): N is computed from a's counts only, so the branch taken is
governed by a alone even though it ranks both a and b — kept as-is; only
the tertiary large-N key had a defect (see below).
Benchmark notes from the original experiments (_affiksFEW):
  Dutch 88.138224, 39943.5 flexrules cutoff 1 (old 89.656164, 47277.75 flexrules, cutoff 1)
  German 90.266461 +/- 0.509202 at 0.9856 of dataset, 7 iterations, 21539.428571 = 6.930653% rules, cutoff = 1
  AMBI:
  French ok 82.617834 ambi1 2.455414 ambi2 1.872611 diff 13.054140 rules 7360.125000 2.740283% cutoff 2
*/
static int comp_affiksFEW(const vertex * a,const vertex * b)
{
    /* Total number of training pairs that reached vertex a. */
    int N = a->W__W + a->W__R + a->W__NA + a->R__W + a->R__R + a->R__NA;
    int A1;
    int B1;
    int A2;
    int B2;
    int A3;
    int B3;
    if(N < 3)
        {
        /* Key order that works well for small support counts. */
        A1 = a->W__R + a->R__R;
        B1 = b->W__R + b->R__R;
        A2 = a->W__R + a->R__R + a->R__NA;
        B2 = b->W__R + b->R__R + b->R__NA;
        A3 = a->W__R + a->W__W + a->R__NA;
        B3 = b->W__R + b->W__W + b->R__NA;
        }
    else
        {
        /* Key order that works well for large support counts. */
        A1 = a->W__R + a->R__R + a->R__NA; // Good: previously wrong words got it right. Bad: previously right words got it wrong.
        B1 = b->W__R + b->R__R + b->R__NA;
        A2 = a->W__R + a->R__R;            // Good: any rightly lemmatized word
        B2 = b->W__R + b->R__R;
        A3 = a->R__NA + a->W__R + a->W__W; // Good: previously right words that didn't match. They may return to the parent.
        B3 = b->R__NA + b->W__R + b->W__W; // Bad: previously wrong words that didn't match. They must be handled by siblings.
        /* BUG FIX: the last term above read a->W__W (copy-paste error),
           so b's tertiary key mixed in a's count and the tie-break on
           the third key compared inconsistent quantities. */
        }
    /* Lexicographic comparison, A1/B1 most significant; bigger is better. */
    return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
/*
Comparator "affiksFEW2": ranks candidate rule vertices best-first
(descending) on three lexicographic keys; returns -1 / 0 / 1 in
qsort-comparator convention (negated so better vertices sort first).
Field naming X__Y: count of training pairs that were X before the rule
(W = wrong, R = right) and Y after it (W, R, or NA = not applicable).
The long comment blocks below are experiment logs kept from the original
tuning runs.
*/
static int comp_affiksFEW2(const vertex * a,const vertex * b)
{
//const vertex * a = *(const vertex **)A;
//const vertex * b = *(const vertex **)B;
//_affiksFEW2
// (OK) BEST Dutch 90.452096 +/- 0.655431 at 0.9856 of dataset, 7 iterations, 53607.714286 = 18.015948% rules, cutoff = 0
// (OK) Norwegian 86.776860 +/- 0.642621 at 0.9856 of dataset, 6 iterations, 112374.000000 = 23.807698% rules, cutoff = 0
// (OK) English 88.424242 +/- 1.191106 at 0.9856 of dataset, 15 iterations, 1383.000000 = 1.838240% rules, cutoff = 2
// (OK) Icelandic 71.304226 +/- 1.453643 at 0.9856 of dataset, 17 iterations, 25635.000000 = 44.707011% rules, cutoff = 0
// (OK) German 91.156762 +/- 0.348391 at 0.9856 of dataset, 7 iterations, 48816.571429 = 15.707506% rules, cutoff = 0
// (OK) Slovene 86.537639 +/- 0.559484 at 0.9856 of dataset, 9 iterations, 40643.444444 = 20.773759% rules, cutoff = 0
// (OK) Swedish 91.907598 +/- 0.224888 at 0.9856 of dataset, 6 iterations, 27958.000000 = 5.935415% rules, cutoff = 1
// (OK) Greek 90.741209 +/- 0.312526 at 0.9856 of dataset, 5 iterations, 125306.400000 = 23.156860% rules, cutoff = 0
// (OK) Danish 92.994605 +/- 0.210674 at 0.9856 of dataset, 5 iterations, 67278.800000 = 12.332763% rules, cutoff = 0
// (?)ALMOST BEST Polish 93.398015 +/- 0.045642 at 0.9856 of dataset, 2 iterations, 165511.500000 = 4.877770% rules, cutoff = 1
//AMBI:
// French ok 84.194268 ambi1 2.277070 ambi2 1.576433 diff 11.952229 rules 6453.250000 2.402640% cutoff 2
/* 20140922,
nohup nice /home/zgk261/bin/testrules -I -D /home/zgk261/sandkasse/nl/dict_nl_non_ambiguous -L nl -C affiksFEW2 -A >/home/zgk261/sandkasse/nl/testout 2>/home/zgk261/sandkasse/nl/testerr &
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 297719.000000 41163.857143 50127.285714 13.826413 16.837113 3830.428571 0.000000 0.000000 0.000000 519.571429 3922.428571 0.000000 0.000000 0.000000 427.571429 0.539105 0.000000 0.000000 0.000000 0.539105 0.440368 0.000000 0.000000 0.000000 0.440368 88.055829 0.000000 0.000000 0.000000 11.944171 0.000000 90.170772 0.000000 0.000000 0.000000 9.829228 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
1 0.985600 7 297719.000000 11178.571429 22834.000000 3.754739 7.669648 3803.285714 0.000000 0.000000 0.000000 546.714286 3914.285714 0.000000 0.000000 0.000000 435.714286 0.587252 0.000000 0.000000 0.000000 0.587252 0.570207 0.000000 0.000000 0.000000 0.570207 87.431856 0.000000 0.000000 0.000000 12.568144 0.000000 89.983580 0.000000 0.000000 0.000000 10.016420 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
2 0.985600 7 297719.000000 6855.000000 8014.285714 2.302507 2.691896 3757.857143 0.000000 0.000000 0.000000 592.142857 3895.857143 0.000000 0.000000 0.000000 454.142857 0.549072 0.000000 0.000000 0.000000 0.549072 0.530304 0.000000 0.000000 0.000000 0.530304 86.387521 0.000000 0.000000 0.000000 13.612479 0.000000 89.559934 0.000000 0.000000 0.000000 10.440066 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
3 0.985600 7 297719.000000 5109.142857 5404.142857 1.716096 1.815182 3726.285714 0.000000 0.000000 0.000000 623.714286 3868.571429 0.000000 0.000000 0.000000 481.428571 0.586802 0.000000 0.000000 0.000000 0.586802 0.407759 0.000000 0.000000 0.000000 0.407759 85.661741 0.000000 0.000000 0.000000 14.338259 0.000000 88.932677 0.000000 0.000000 0.000000 11.067323 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
4 0.985600 7 297719.000000 4082.571429 4193.857143 1.371283 1.408663 3694.285714 0.000000 0.000000 0.000000 655.714286 3844.714286 0.000000 0.000000 0.000000 505.285714 0.695342 0.000000 0.000000 0.000000 0.695342 0.391416 0.000000 0.000000 0.000000 0.391416 84.926108 0.000000 0.000000 0.000000 15.073892 0.000000 88.384236 0.000000 0.000000 0.000000 11.615764 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
5 0.985600 7 297719.000000 3462.714286 3482.571429 1.163081 1.169751 3667.000000 0.000000 0.000000 0.000000 683.000000 3827.000000 0.000000 0.000000 0.000000 523.000000 0.788006 0.000000 0.000000 0.000000 0.788006 0.407141 0.000000 0.000000 0.000000 0.407141 84.298851 0.000000 0.000000 0.000000 15.701149 0.000000 87.977011 0.000000 0.000000 0.000000 12.022989 0.000000 0.000000 0.000000 0.000000 100.000000 0.000000 -nan 0.000000 0.000000 0.000000 100.000000 0.000000 -nan
cutoff 0 Affix a 0.676984 b 0.804745: N(rules)= 1.967934*N(trainpairs)^0.804745
Suffix a 0.181309 b 0.828157: N(rules)= 1.198786*N(trainpairs)^0.828157
cutoff 1 Affix a 0.020304 b 0.794314: N(rules)= 1.020512*N(trainpairs)^0.794314
Suffix a -1.577537 b 0.865510: N(rules)= 0.206483*N(trainpairs)^0.865510
cutoff 2 Affix a -1.037201 b 0.799804: N(rules)= 0.354446*N(trainpairs)^0.799804
Suffix a -2.496919 b 0.900882: N(rules)= 0.082338*N(trainpairs)^0.900882
cutoff 3 Affix a -1.918028 b 0.840841: N(rules)= 0.146896*N(trainpairs)^0.840841
Suffix a -2.960479 b 0.913754: N(rules)= 0.051794*N(trainpairs)^0.913754
cutoff 4 Affix a -2.344658 b 0.853928: N(rules)= 0.095880*N(trainpairs)^0.853928
Suffix a -3.219691 b 0.916661: N(rules)= 0.039967*N(trainpairs)^0.916661
cutoff 5 Affix a -2.716219 b 0.868745: N(rules)= 0.066124*N(trainpairs)^0.868745
Suffix a -3.525562 b 0.927406: N(rules)= 0.029435*N(trainpairs)^0.927406
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 0
fraction 9856.000000
iterations 7
trainlines 297719.000000
rules 50127.285714 ( 41163.857143)
rules% 16.837113 ( 13.826413)
same%stdev 0.440368
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.440368
same% 90.170772 ( 88.055829)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.829228 ( 11.944171)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 0.000000 ( 0.000000)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 100.000000 ( 100.000000)
precision 0.000000 ( 0.000000)
recall -nan ( -nan)
*/
/* 20140922, same as above, except that data are ambiguous.
nohup nice /home/zgk261/bin/testrules -I -D /home/zgk261/sandkasse/nl/dict_nl_without_doubles_UTF8 -L nl -C affiksFEW2 -A >/home/zgk261/sandkasse/nl/testout 2>/home/zgk261/sandkasse/nl/testerr &
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 7 306848.000000 73132.000000 50311.000000 23.833299 16.396066 3635.857143 78.000000 117.000000 4.714286 648.428571 3843.428571 0.000000 0.000000 0.000000 640.571429 0.368578 0.192276 0.216604 0.027956 0.556177 0.484660 0.000000 0.000000 0.000000 0.484660 81.085128 1.739518 2.609277 0.105136 14.460940 4.982796 85.714286 0.000000 0.000000 0.000000 14.285714 0.000000 4.552695 6.203008 0.430101 88.814197 0.045105 0.064841 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
1 0.985600 7 306848.000000 18391.714286 22871.857143 5.993754 7.453807 3743.714286 0.000000 0.000000 0.000000 740.285714 3897.142857 0.000000 0.000000 0.000000 586.857143 0.341564 0.000000 0.000000 0.000000 0.341564 0.373271 0.000000 0.000000 0.000000 0.373271 83.490506 0.000000 0.000000 0.000000 16.509494 0.000000 86.912196 0.000000 0.000000 0.000000 13.087804 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
2 0.985600 7 306848.000000 11355.142857 8029.142857 3.700576 2.616652 3696.857143 0.000000 0.000000 0.000000 787.142857 3909.714286 0.000000 0.000000 0.000000 574.285714 0.396739 0.000000 0.000000 0.000000 0.396739 0.301886 0.000000 0.000000 0.000000 0.301886 82.445521 0.000000 0.000000 0.000000 17.554479 0.000000 87.192558 0.000000 0.000000 0.000000 12.807442 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
3 0.985600 7 306848.000000 8112.285714 5438.857143 2.643747 1.772492 3652.571429 0.000000 0.000000 0.000000 831.428571 3888.428571 0.000000 0.000000 0.000000 595.571429 0.255206 0.000000 0.000000 0.000000 0.255206 0.205149 0.000000 0.000000 0.000000 0.205149 81.457882 0.000000 0.000000 0.000000 18.542118 0.000000 86.717854 0.000000 0.000000 0.000000 13.282146 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
4 0.985600 7 306848.000000 6239.285714 4213.000000 2.033347 1.372992 3615.000000 0.000000 0.000000 0.000000 869.000000 3864.000000 0.000000 0.000000 0.000000 620.000000 0.273440 0.000000 0.000000 0.000000 0.273440 0.204397 0.000000 0.000000 0.000000 0.204397 80.619982 0.000000 0.000000 0.000000 19.380018 0.000000 86.173060 0.000000 0.000000 0.000000 13.826940 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
5 0.985600 7 306848.000000 5044.285714 3494.285714 1.643904 1.138768 3592.714286 0.000000 0.000000 0.000000 891.285714 3847.428571 0.000000 0.000000 0.000000 636.571429 0.262209 0.000000 0.000000 0.000000 0.262209 0.203118 0.000000 0.000000 0.000000 0.203118 80.122977 0.000000 0.000000 0.000000 19.877023 0.000000 85.803492 0.000000 0.000000 0.000000 14.196508 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000 0.000000 6.633108 0.000000 93.366892 0.000000 0.000000
cutoff 0 Affix a 0.884193 b 0.790803: N(rules)= 2.421031*N(trainpairs)^0.790803
Suffix a -0.706998 b 0.933881: N(rules)= 0.493123*N(trainpairs)^0.933881
cutoff 1 Affix a 0.242492 b 0.779144: N(rules)= 1.274421*N(trainpairs)^0.779144
Suffix a -2.482111 b 0.967669: N(rules)= 0.083567*N(trainpairs)^0.967669
cutoff 2 Affix a -0.969721 b 0.794266: N(rules)= 0.379189*N(trainpairs)^0.794266
Suffix a -3.356905 b 0.996574: N(rules)= 0.034843*N(trainpairs)^0.996574
cutoff 3 Affix a -1.776403 b 0.828451: N(rules)= 0.169246*N(trainpairs)^0.828451
Suffix a -3.794027 b 1.003571: N(rules)= 0.022505*N(trainpairs)^1.003571
cutoff 4 Affix a -2.351608 b 0.854877: N(rules)= 0.095216*N(trainpairs)^0.854877
Suffix a -4.020226 b 1.000475: N(rules)= 0.017949*N(trainpairs)^1.000475
cutoff 5 Affix a -2.757390 b 0.871710: N(rules)= 0.063457*N(trainpairs)^0.871710
Suffix a -4.205819 b 0.997875: N(rules)= 0.014909*N(trainpairs)^0.997875
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
cutoff 2
fraction 9856.000000
iterations 7
trainlines 306848.000000
rules 8029.142857 ( 11355.142857)
rules% 2.616652 ( 3.700576)
same%stdev 0.301886
ambi1%stdev 0.000000
ambi2%stdev 0.000000
ambi3%stdev 0.000000
diff%stdev 0.301886
same% 87.192558 ( 82.445521)
ambi1% 0.000000 ( 0.000000)
ambi2% 0.000000 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 12.807442 ( 17.554479)
amb.rules% 0.000000 ( 0.000000)
false_amb% 0.000000 ( 0.000000)
false_not_amb% 6.633108 ( 6.633108)
true_amb% 0.000000 ( 0.000000)
true_not_amb% 93.366892 ( 93.366892)
precision 0.000000 ( 0.000000)
recall 0.000000 ( 0.000000)
*/
/* The #if 1 branch is the active key set (revised 20090511); the #else
   branch is the earlier equivalent formulation kept for reference. */
#if 1 // 20090511
int A1 = a->W__R + 2*a->R__R + a->R__NA; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr + 2rr + rn - r = ww + rr - rw
int B1 = b->W__R + 2*b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
// wr + rr + rn - r = wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
int A3 = a->W__R + a->W__W + a->R__NA;
// wr + ww + rn - w = -wn + rn
int B3 = b->W__R + b->W__W + b->R__NA;
#else
int A1 = a->W__R + a->R__R - a->R__W; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr + 2rr + rn - r = ww + rr - rw
int B1 = b->W__R + b->R__R - b->R__W;
int A2 = a->W__R - a->R__W;
// wr + rr + rn - r = wr - rw
int B2 = b->W__R - b->R__W;
int A3 = a->W__R + a->W__W - a->R__R - a->R__W;
// wr + ww + rn - w = -wn + rn
int B3 = b->W__R + b->W__W - b->R__R - b->R__W;
#endif
/* int A1 = a->W__R + a->R__R - a->R__W; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
int B1 = b->W__R + b->R__R - b->R__W;
int A2 = a->W__R - a->R__W;
int B2 = b->W__R - b->R__W;
int A3 = a->R__NA - a->W__NA;
int B3 = b->R__NA - b->W__NA;
*/
/* Lexicographic comparison, A1/B1 most significant; bigger is better. */
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
}
#endif
#if _NA
/*
Comparator "affiksFEW2org": original affiksFEW2 key set. Ranks candidate
rule vertices best-first (descending); returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  BEST Norwegian 87.494563 +/- 0.217147 at 0.9856 of dataset, 6 iterations, 101814.500000 = 21.570549% rules, cutoff = 0
  English 88.260606 +/- 0.826699 at 0.9856 of dataset, 15 iterations, 7362.466667 = 9.785960% rules, cutoff = 1
  Icelandic 70.651411 +/- 1.565857 at 0.9856 of dataset, 17 iterations, 23232.941176 = 40.517860% rules, cutoff = 0
  German 90.307358 +/- 0.355867 at 0.9856 of dataset, 7 iterations, 50595.857143 = 16.280019% rules, cutoff = 0
  Dutch 90.274675 +/- 0.462929 at 0.9856 of dataset, 7 iterations, 23452.142857 = 7.881563% rules, cutoff = 1
  Slovene 86.417162 +/- 0.540735 at 0.9856 of dataset, 9 iterations, 40847.666667 = 20.878142% rules, cutoff = 0
  Swedish 91.982663 +/- 0.250703 at 0.9856 of dataset, 6 iterations, 28998.000000 = 6.156204% rules, cutoff = 1
  Greek 90.258032 +/- 0.234665 at 0.9856 of dataset, 5 iterations, 43156.000000 = 7.975310% rules, cutoff = 1 (but exactly the same as cutoff = 0)
  Danish 92.425041 +/- 0.374415 at 0.9856 of dataset, 5 iterations, 73177.800000 = 13.414099% rules, cutoff = 0
  AMBI:
  French ok 84.761146 ambi1 2.015924 ambi2 1.665605 diff 11.557325 rules 7262.500000 2.703935% cutoff 2
*/
static int comp_affiksFEW2org(const vertex * a,const vertex * b)
{
    /* Primary key: good: all words that are lemmatised correctly;
       bad: all previously right words that got it wrong. */
    int keyA = a->W__R + a->R__R - a->R__W;
    int keyB = b->W__R + b->R__R - b->R__W;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Secondary key: net gain on previously wrong words. */
    keyA = a->W__R - a->R__W;
    keyB = b->W__R - b->R__W;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Tertiary key: right-not-applicable minus wrong-not-applicable. */
    keyA = a->R__NA - a->W__NA;
    keyB = b->R__NA - b->W__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
/*
Comparator "fixNA": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 47.982267 (at 0.8488 of dataset)
  AMBI:
  French: stopped because of very bad results. (> 25% wrong results)
*/
static int comp_fixNA(const vertex * a,const vertex * b)
{
    /* Lexicographic keys, most significant first. */
    int ka[3];
    int kb[3];
    ka[0] = a->W__R + a->W__NA + a->R__NA;
    kb[0] = b->W__R + b->W__NA + b->R__NA;
    ka[1] = a->W__R + a->R__R + a->R__NA;
    kb[1] = b->W__R + b->R__R + b->R__NA;
    ka[2] = a->W__R + a->R__NA;
    kb[2] = b->W__R + b->R__NA;
    for(int i = 0; i < 3; ++i)
        {
        if(ka[i] != kb[i])
            return ka[i] > kb[i] ? -1 : 1;
        }
    return 0;
}
#endif
#if _NA
/*
Comparator "fruit": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 71.344041 at 0.939 of dataset
  ALMOST BEST Icelandic 71.521831 +/- 1.988737 at 0.9856 of dataset, 17 iterations, 23539.352941 = 41.052237% rules
  Slovene 85.900276 +/- 0.456532 at 0.9856 of dataset, 9 iterations, 42167.333333 = 21.552652% rules
  English 87.626771 +/- 0.060148 at 0.4928 (!) of dataset, 3 iterations, 933.000000 = 2.480262% rules
  AMBI:
  French ok 85.382166 ambi1 1.359873 ambi2 1.089172 diff 12.168790 rules 7259.125000 2.899075% cutoff 2
*/
static int comp_fruit(const vertex * a,const vertex * b)
{
    /* Primary key: correctly lemmatised pairs. */
    int keyA = a->W__R + a->R__R;
    int keyB = b->W__R + b->R__R;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Secondary key: also credit right->not-applicable. */
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Tertiary key: wrong->right plus the not-applicable counts. */
    keyA = a->W__R + a->W__NA + a->R__NA;
    keyB = b->W__R + b->W__NA + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
/*
Comparator "ice": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 60.242322 at 0.939 of dataset
  AMBI:
  French ok 82.557325 ambi1 2.522293 ambi2 1.866242 diff 13.054140 rules 8556.625000 3.185757% cutoff 2
*/
static int comp_ice(const vertex * a,const vertex * b)
{
    /* Lexicographic keys, most significant first. */
    int ka[3];
    int kb[3];
    ka[0] = a->W__R + a->R__R + a->R__NA;
    kb[0] = b->W__R + b->R__R + b->R__NA;
    ka[1] = a->W__R + a->R__R;
    kb[1] = b->W__R + b->R__R;
    ka[2] = a->W__R + a->W__NA + a->R__NA;
    kb[2] = b->W__R + b->W__NA + b->R__NA;
    for(int i = 0; i < 3; ++i)
        {
        if(ka[i] != kb[i])
            return ka[i] > kb[i] ? -1 : 1;
        }
    return 0;
}
#endif
#if _NA
/*
Comparator "pisang": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 71.287687 at 0.939 of dataset
  AMBI:
  French ok 85.414013 ambi1 1.359873 ambi2 1.085987 diff 12.140127 rules 7848.375000 2.922065% cutoff 2
*/
static int comp_pisang(const vertex * a,const vertex * b)
{
    /* Primary key: correctly lemmatised pairs. */
    int keyA = a->W__R + a->R__R;
    int keyB = b->W__R + b->R__R;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Secondary key: wrong->right plus both not-applicable counts. */
    keyA = a->W__R + a->W__NA + a->R__NA;
    keyB = b->W__R + b->W__NA + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Tertiary key: correctly lemmatised plus right->not-applicable. */
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
/*
Comparator "kiwi": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 70.865032 at 0.939 of dataset
  AMBI:
  French ok 85.410828 ambi1 1.378981 ambi2 1.035032 diff 12.175159 rules 7676.875000 2.858213% cutoff 2
*/
static int comp_kiwi(const vertex * a,const vertex * b)
{
    /* Lexicographic keys, most significant first. */
    int ka[3];
    int kb[3];
    ka[0] = a->W__R + a->R__R;
    kb[0] = b->W__R + b->R__R;
    ka[1] = a->W__R + a->W__NA + a->R__R + a->R__NA;
    kb[1] = b->W__R + b->W__NA + b->R__R + b->R__NA;
    ka[2] = a->W__R + a->R__NA;
    kb[2] = b->W__R + b->R__NA;
    for(int i = 0; i < 3; ++i)
        {
        if(ka[i] != kb[i])
            return ka[i] > kb[i] ? -1 : 1;
        }
    return 0;
}
#endif
#if _NA
/*
Comparator "carrot": ranks candidate rule vertices best-first (descending)
on three lexicographic keys; the primary key weights correctly lemmatised
pairs four times as heavily as right->not-applicable. Returns -1 / 0 / 1.
Benchmark notes from the original experiments:
  Icelandic 71.090448 at 0.939 of dataset
  AMBI:
  French ok 85.060510 ambi1 1.328025 ambi2 1.041401 diff 12.570064 rules 7241.625000 2.696163% cutoff 2
*/
static int comp_carrot(const vertex * a,const vertex * b)
{
    /* Primary key: 4 * (correctly lemmatised) + right->not-applicable. */
    int keyA = 4*(a->W__R + a->R__R) + a->R__NA;
    int keyB = 4*(b->W__R + b->R__R) + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Secondary key: correctly lemmatised plus right->not-applicable. */
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    /* Tertiary key: wrong->right plus both not-applicable counts. */
    keyA = a->W__R + a->W__NA + a->R__NA;
    keyB = b->W__R + b->W__NA + b->R__NA;
    if(keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
static int comp_peen(const vertex * a,const vertex * b)
{
// Benchmark results for this weight function:
// Icelandic 71.344041 at 0.939 of dataset
// ALMOST BEST Icelandic 71.507792 +/- 1.645702 at 0.9856 of dataset, 17 iterations, 25240.882353 = 44.019676% rules
// Slovene 86.133458 +/- 0.549185 at 0.9856 of dataset, 9 iterations, 40898.777778 = 20.904266% rules
// English 87.803261 +/- 0.106156 at 0.4928 (!) of dataset, 3 iterations, 889.333333 = 2.364179% rules
// Dutch 89.837692 +/- 0.412795 at 0.9856 of dataset, 7 iterations, 56640.285714 = 19.035104% rules, cutoff = 0
// ALMOST BEST German 91.288892 +/- 0.670828 at 0.9856 of dataset, 7 iterations, 50584.857143 = 16.276480% rules, cutoff = 0
// Swedish 91.873698 +/- 0.367967 at 0.9856 of dataset, 6 iterations, 9066.166667 = 1.924725% rules, cutoff = 2
// ALMOST BEST Norwegian 87.535644 +/- 0.344659 at 0.9856 of dataset, 6 iterations, 48468 = 10.268492% rules, cutoff = 1
// ALMOST BEST Greek 90.414875+/- 0.385254 at 0.9856 of dataset, 5 iterations, 120691.4 = 22.303999% rules, cutoff = 0
// BEST Danish 92.796387 +/- 0.214267 at 0.9856 of dataset, 5 iterations, 67807 = 12.429587% rules, cutoff = 0
// ALMOST BEST Russian 80.484806 +/- 0.409391 at 0.9856 of dataset, 6 iterations, 54630 = 14.022614% rules, cutoff = 1
// BEST Polish 93.880103 +/- 0.077021 at 0.9856 of dataset, 2 iterations, 344944.5 = 10.165818% rules, cutoff = 0
//AMBI:
// French ok 84.993631 ambi1 1.388535 ambi2 1.085987 diff 12.531847 rules 7318.375000 2.724738% cutoff 2
// (Raw per-cutoff measurement rows removed; they can be regenerated by
// rerunning the training. Summary of the logged best run, cutoff 2,
// fraction 0.9856, 2 iterations, 3490123 trainlines:
// rules 57342 (1.642979%), same% 91.197811, ambi1% 0.376522,
// ambi2% 0.480458, diff% 7.945208, amb.rules% 0.999157,
// false_amb% 0.543212, true_amb% 0.455945, precision 0.295613,
// recall 0.065337. New algorithm, least wrongly lemmatised (MIN(diff)),
// suffix only: no.)
// Comment: If comparing by diff%, comp_peen is marginally worse than best_pl
// (best_pl uses automatically computed parameters).
// If compared by same%, comp_peen is 0.3% better. Reason: best_pl has many more
// ambiguous rules and generates more false ambiguous results, but also more true
// ambiguous results.
    // Key 1: 3wr + 3rr + rn - r = 3wr + 2rr - rw
    int keyA = 3 * (a->W__R + a->R__R) + a->R__NA;
    int keyB = 3 * (b->W__R + b->R__R) + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    // Key 2: wr + rr + rn - r = wr - rw
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    // Key 3: wr + wn + rn - w = -ww + rn
    keyA = a->W__R + a->W__NA + a->R__NA;
    keyB = b->W__R + b->W__NA + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
#if _NA
static int comp_sugar(const vertex * a,const vertex * b)
{
// Weight function mixing the first two keys of affixFEW2 with the last
// key of comp_peen. Returns -1 if 'a' should sort before 'b' (descending).
// Slovene 86.273367 +/- 0.410931 at 0.9856 of dataset, 9 iterations, 17254.777778 = 8.819297% rules (cutoff = 1)
// BEST English 89.060606 +/- 1.320829 at 0.9856 of dataset, 3 iterations, 1318.266667 = 1.752199% rules, cutoff=2
// Icelandic 70.925172 +/- 1.858255 at 0.9856 of dataset, 17 iterations, 27151.294118 = 47.351402% rules, cutoff = 0
// Dutch 90.172822 +/- 0.307911 at 0.9856 of dataset, 7 iterations, 57761.142857 = 19.411791% rules, cutoff = 0
// BEST Greek 90.422464 +/- 0.437009 at 0.9856 of dataset, 5 iterations, 132765.6 = 24.535334% rules, cutoff = 0
// BEST German 91.461918 +/- 0.167574 at 0.9856 of dataset, 7 iterations, 50986 = 16.405554% rules, cutoff = 0
// BEST Swedish 92.265969 +/- 0.277289 at 0.9856 of dataset, 6 iterations, 25935.333333 = 5.506008% rules, cutoff = 1
// Norwegian 86.665700 +/- 0.676264 at 0.9856 of dataset, 6 iterations, 46685.5 = 9.890849% rules, cutoff = 1
// Danish 92.585623 +/- 0.171327 at 0.9856 of dataset, 5 iterations, 30422.400000 = 5.576679% rules, cutoff = 1
// BEST Russian 80.815622 +/- 0.450500 at 0.9856 of dataset, 6 iterations, 47079.166667 = 12.084440% rules, cutoff = 1
//AMBI:
// French ok 75.472316 ambi1 4.615600 ambi2 3.493266 diff 16.418818 rules 4162.909091 3.129560% cutoff 2
#if 1
// next lines from affixFEW2
int A1 = a->W__R + 2*a->R__R + a->R__NA; // good: all words that are lemmatised correctly. bad: all previously right words that got it wrong
// wr - rw + rr
int B1 = b->W__R + 2*b->R__R + b->R__NA;
int A2 = a->W__R + a->R__R + a->R__NA;
// wr - rw
int B2 = b->W__R + b->R__R + b->R__NA;
// next lines from peen
int A3 = a->W__R + a->W__NA + a->R__NA;
// -ww + rn
int B3 = b->W__R + b->W__NA + b->R__NA;
return (A1>B1)?-1:(A1<B1)?1:(A2>B2)?-1:(A2<B2)?1:(A3>B3)?-1:(A3<B3)?1:0;
#else
// Equivalent formulation (keys differ from the above only by terms that
// are constant over all vertices, so the ordering is identical).
int AA1 = a->W__R - a->R__W + a->R__R;
int AA2 = - a->R__R;
int AA3 = - a->W__R - 2*a->W__W;//a->R__NA - a->W__W;
int BB1 = b->W__R - b->R__W + b->R__R;
int BB2 = - b->R__R;
int BB3 = - b->W__R - 2*b->W__W;//b->R__NA - b->W__W;
// Fix: this branch previously had no return statement, so flipping the
// '#if 1' above made a non-void function fall off its end (undefined
// behaviour) and left AA*/BB* unused.
return (AA1>BB1)?-1:(AA1<BB1)?1:(AA2>BB2)?-1:(AA2<BB2)?1:(AA3>BB3)?-1:(AA3<BB3)?1:0;
#endif
}
#endif
static int comp_honey(const vertex * a,const vertex * b)
{
// Weight function that ignores all *_NA counts entirely.
// (OK) Dutch 90.179393 +/- 0.589662 at 0.9856 of dataset, 7 iterations, 73324.571429 = 24.642193% rules, cutoff = 0
// (OK) Norwegian 87.272244 +/- 0.267729 at 0.9856 of dataset, 6 iterations, 141038.666667 = 29.880630% rules, cutoff = 0
// (OK) English 88.315152 +/- 1.097312 at 0.9856 of dataset, 3 iterations, 5285.466667 = 7.025276% rules, cutoff=1
// (OK) Icelandic 70.742665 +/- 1.686147 at 0.9856 of dataset, 17 iterations, 29857.000000 = 52.070108% rules, cutoff = 0
// (?) Slovene 86.273367 +/- 0.410931 at 0.9856 of dataset, 9 iterations, 17254.777778 = 8.819297% rules (cutoff = 1)
// (?) BEST Greek 90.422464 +/- 0.437009 at 0.9856 of dataset, 5 iterations, 132765.6 = 24.535334% rules, cutoff = 0
// (?) BEST German 91.461918 +/- 0.167574 at 0.9856 of dataset, 7 iterations, 50986 = 16.405554% rules, cutoff = 0
// (?) BEST Swedish 92.265969 +/- 0.277289 at 0.9856 of dataset, 6 iterations, 25935.333333 = 5.506008% rules, cutoff = 1
// (?) Danish 92.585623 +/- 0.171327 at 0.9856 of dataset, 5 iterations, 30422.400000 = 5.576679% rules, cutoff = 1
// (?) BEST Russian 80.815622 +/- 0.450500 at 0.9856 of dataset, 6 iterations, 47079.166667 = 12.084440% rules, cutoff = 1
//AMBI:
// French ok 84.477707 ambi1 2.251592 ambi2 1.426752 diff 11.843949 rules 7413.875000 2.760295% cutoff 2
    int keyA = a->W__R + 2 * a->R__R;    // key 1: R->R weighted double
    int keyB = b->W__R + 2 * b->R__R;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->W__R + a->R__R;            // key 2
    keyB = b->W__R + b->R__R;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->W__R;                      // key 3: W->R alone
    keyB = b->W__R;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#if _NA
static int comp_beet(const vertex * a,const vertex * b)
{
//Icelandic 71.034094 at 0.939 of dataset
//AMBI:
// French ok 85.057325 ambi1 1.283439 ambi2 1.057325 diff 12.601911 rules 7260.375000 2.703144% cutoff 2
    // Same structure as comp_carrot, but key 1 weighs correct
    // lemmatisations only twice as heavily as R->NA.
    int keyA = 2 * (a->W__R + a->R__R) + a->R__NA;
    int keyB = 2 * (b->W__R + b->R__R) + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->W__R + a->R__R + a->R__NA;
    keyB = b->W__R + b->R__R + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->W__R + a->W__NA + a->R__NA;
    keyB = b->W__R + b->W__NA + b->R__NA;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
#endif
static int comp_koud(const vertex * a,const vertex * b)
{
// German 91.260578 +/- 0.363285 at 0.9856 of dataset, 7 iterations, 30890.714286 = 9.939577% rules, cutoff = 0
//AMBI:
// French 86.356688 ambi1 0.996815 ambi2 0.796178 diff 11.850318 rules 3335.250000 1.241763% cutoff 3 (!)
// French 85.250493 ambi1 2.333057 ambi2 2.161181 diff 10.255268 rules 28520.250000 10.618597% cutoff 0 (!) paradigms+homographs clumped
// French 85.313973 ambi1 2.050694 ambi2 2.289517 diff 10.345816 rules 28509.250000 10.614432% cutoff 0 (!) homographs clumped
// Dutch.clumped.ph 85.789838 ambi1 1.086067 ambi2 1.256060 diff 11.868035 rules 37400.142857 12.190637% cutoff 0 paradigms+homographs clumped
// Dutch.clumped.h 85.818923 ambi1 1.095507 ambi2 1.060476 diff 12.025095 rules 37411.857143 12.192383% cutoff 0 homographs clumped
// (Dutch.clumped.ph suffix, old algo:
// 83.532708 ambi1 1.948624 ambi2 2.719889 ambi3 0.107033 diff 11.691746 rules 73024.571429 23.802477% cutoff 0 paradigms+homographs clumped
// (Dutch.clumped.h suffix, old algo:
// 83.624725 ambi1 1.859813 ambi2 2.611382 ambi3 0.162415 diff 11.741664 rules 72975.428571 23.782417% cutoff 0 paradigms+homographs clumped
// Russian clumped ph 74.983460 ambi1 0.517762 ambi2 0.558033 diff 23.940745 rules 95077.500000 24.184389% cutoff 0 paradigms+homographs clumped
// (old algo:)
// 79.485114 ambi1 0.218611 ambi2 0.342298 ambi3 0.005753 diff 19.948224 rules 94247.166667 23.973181% cutoff 0 paradigms+homographs clumped
// The A1 vs B1 condition is pretty close to what later was found as the
// best factors using automatic factor setting (comp_parms).
// These factors were found by manual optimizing.
    int keyA = 6 * a->W__R - 5 * a->R__W + a->W__W;  // key 1 (manually tuned)
    int keyB = 6 * b->W__R - 5 * b->R__W + b->W__W;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->W__R - 6 * a->R__W;                    // key 2: punish R->W hard
    keyB = b->W__R - 6 * b->R__W;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    keyA = a->R__R - a->W__W;                        // key 3
    keyB = b->R__R - b->W__W;
    if (keyA != keyB)
        return keyA > keyB ? -1 : 1;
    return 0;
}
// Pointer to the weight function actually used during training; defaults
// to the automatically parameterised comparator. Reassigned elsewhere to
// select one of the hand-tuned comp_* variants above.
int (*comp)(const vertex * a,const vertex * b) = comp_parms;
// returns b > a ? 1 : b < a ? -1 : 0
// (Chosen like this to let qsort sort in descending order.)
// You can find a local optimum for the parameters by using comp_parms as the
// weight function and setting compute_parms = true. The parameters parms.Matrix[]
// can be seeded with non-zero values by hard coding. The file parms.txt
// will contain the currently best parameters.
// Optimal parameters == resulting in smallest rule set.
// Hypothesis: small rule sets give (almost) best lemmatization results.
// Optimizing for the size of rule sets is computationally MUCH cheaper
// than optimizing for accuracy.
// If you have found a good set of parameters (presumably with a small subset
// of the training data), you can hard code them (as is done below) and run
// the program with the full set of the training data. In that case,
// set compute_parms = false
//bool compute_parms = false;
// A parameter vector (weights for the transition counts) plus metadata
// describing the training run that produced it.
struct rotation
{
    double Matrix[6];   // weights, one per transition count (R_R, W_R, ...)
    int ROWPARMS;       // number of entries of Matrix actually in use
    // metadata:
    const char * Language;
    const char * Xparm;
    int NumberOfRules;
    int CutOffForFewestErrors;
    double FractionErroneousPedictionOOV;
    bool suffixOnly;
    // Load the current parameter vector from the option settings.
    void init(optionStruct * options)
    {
        ROWPARMS = options->numberOfParms();
        for (int k = 0; k < ROWPARMS; ++k)
            Matrix[k] = options->parm(k);
    }
    // Store this (better) parameter vector back into the option settings.
    void better(optionStruct * options)
    {
        for (int k = 0; k < ROWPARMS; ++k)
            options->setParm(k, Matrix[k]);
    }
} rotation;
// Seed parameters (manually chosen): reward W->R, punish R->W, mildly
// reward W->W; R->R, R->NA and W->NA start neutral. The trailing member
// (suffixOnly) is intentionally left to value-initialisation (false).
static struct rotation parms =
/* R_R W_R R_W W_W R_NA W_NA */
{{ 0.0, 3.0, -2.0, 1.0, 0.0, 0.0},6,"","",0,0,0.0}
;
// Scale ROW (parms.ROWPARMS entries) to unit Euclidean length in place.
// Fix: an all-zero row previously caused division by zero, filling the
// row with NaNs; such a row is now left unchanged.
static void normalise(double * ROW)
{
    double modulus = 0.0;
    for (int i = 0; i < parms.ROWPARMS; ++i)
        modulus += ROW[i] * ROW[i];
    modulus = sqrt(modulus);
    if (modulus == 0.0)
        return; // zero vector: nothing to normalise
    for (int i = 0; i < parms.ROWPARMS; ++i)
        ROW[i] /= modulus;
}
// Inner (dot) product of two parameter vectors of parms.ROWPARMS entries.
static double inner(double * a, double * b)
{
    double sum = 0;
    for (int k = 0; k < parms.ROWPARMS; ++k)
        sum += a[k] * b[k];
    return sum;
}
// Scale a parameter vector (parms.ROWPARMS entries) by factor f in place.
static void times(double * a, double f)
{
    for (int k = 0; k < parms.ROWPARMS; ++k)
        a[k] *= f;
}
// A recorded best parameter set for one language/configuration, used to
// hard-code previously found optima (see the best_* definitions below).
struct bestParms
{
bool suffixonly; // true: train suffix-only rules
const char * langbase; // language code, e.g. "is"
int rowss; // number of parameter rows stored in val
struct rotation val; // the parameter vector(s) and run metadata
// Each row:
// R__R W__R R__W W__W R__NA W__NA
// Generally, good that Wrongs change to Rights (W__R > 0) and that Rights don't change to Wrongs (R__W < 0)
// But what about rules that don't improve lemmatisation? (R__R > 0 or W__W > 0)
// Intuitively difficult to decide!
};
#if 1
// Best suffix-only parameters found for Icelandic ("is"): essentially
// W->R minus R->W with a small W->W bonus; R->R carries no weight.
static bestParms best_is_suffix =
{
true,
"is",
1,
//iteration:18.1
/*weight (not used): 1.41244386452166131e+05 suffix only: yes */
/* number of nodes: 152108, nodes/line: 1.05629895368709495e-01 weight (not used): 1.41244386452166131e+05 blobs 2809220 lines 2873370 * fraction 5.01187233627272799e-01 = 1440009 lines*/
{{
0.00000000000000000e+00, 6.94542434383270568e-01, -7.18112257666929654e-01, 4.38815704990783637e-02
}}
};
#elif 1
static bestParms best_is_suffix =
{
true,
"is",
1,
/*
0 0.985600 2 2831993.000000 471048.500000 306036.000000 16.633110 10.806383 36063.500000 712.500000 528.500000 83.000000 3989.500000 36948.000000 188.500000 140.500000 0.000000 4100.000000 0.176020 0.158931 0.029052 0.006836 0.052977 0.102536 0.039306 0.042723 0.000000 0.105954 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 89.295986 0.455567 0.339561 0.000000 9.908887 1.377577 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.424149 34.592890 0.953428 64.029533 0.529175 0.026822
1 0.985600 2 2831993.000000 128789.000000 168125.000000 4.547645 5.936632 36520.000000 0.000000 0.000000 0.000000 4857.000000 37099.500000 273.500000 263.000000 0.000000 3741.000000 0.051268 0.000000 0.000000 0.000000 0.051268 0.032470 0.001709 0.047850 0.000000 0.082029 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 89.662131 0.660995 0.635619 0.000000 9.041255 1.598714 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.493028 34.440631 1.105687 63.960654 0.528596 0.031106
2 0.985600 2 2831993.000000 84003.000000 52867.500000 2.966215 1.866795 36089.500000 0.000000 0.000000 0.000000 5287.500000 37338.500000 236.500000 207.000000 0.000000 3595.000000 0.063231 0.000000 0.000000 0.000000 0.063231 0.029052 0.049559 0.037597 0.000000 0.041014 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 90.239747 0.571574 0.500278 0.000000 8.688402 1.203567 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.420523 34.763274 0.783044 64.033159 0.482143 0.022029
3 0.985600 2 2831993.000000 63704.000000 38416.000000 2.249441 1.356501 35649.000000 0.000000 0.000000 0.000000 5728.000000 37204.000000 219.500000 209.500000 0.000000 3744.000000 0.017089 0.000000 0.000000 0.000000 0.017089 0.088865 0.022216 0.025634 0.000000 0.085447 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.914687 0.530488 0.506320 0.000000 9.048505 1.185441 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.422940 34.783817 0.762501 64.030742 0.474080 0.021451
4 0.985600 2 2831993.000000 51623.000000 31338.500000 1.822851 1.106588 35334.000000 0.000000 0.000000 0.000000 6043.000000 37065.500000 224.000000 188.000000 0.000000 3899.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.097409 0.006836 0.023925 0.000000 0.080320 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 89.579960 0.541364 0.454359 0.000000 9.424318 1.152814 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.435024 34.828528 0.717790 64.018658 0.452055 0.020193
5 0.985600 2 2831993.000000 43987.500000 27450.500000 1.553235 0.969300 35034.500000 0.000000 0.000000 0.000000 6342.500000 36936.000000 219.500000 179.000000 0.000000 4042.500000 0.093992 0.000000 0.000000 0.000000 0.093992 0.082029 0.015380 0.003418 0.000000 0.100827 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 89.266984 0.530488 0.432607 0.000000 9.769920 1.125021 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.408439 34.829736 0.716582 64.045243 0.467297 0.020159
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only yes
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 52867.500000 (84003.000000)
rules% 1.866795 (2.966215)
same%stdev 0.029052
ambi1%stdev 0.049559
ambi2%stdev 0.037597
ambi3%stdev 0.000000
diff%stdev 0.041014
same% 90.239747 (87.221162)
ambi1% 0.571574 (0.000000)
ambi2% 0.500278 (0.000000)
ambi3% 0.000000 (0.000000)
diff% 8.688402 (12.778838)
amb.rules% 1.203567 (0.000000)
false_amb% 0.420523 (0.000000)
false_not_amb% 34.763274 (35.546318)
true_amb% 0.783044 (0.000000)
true_not_amb% 64.033159 (64.453682)
precision 0.482143 (0.000000)
recall 0.022029 (0.000000)
bests[11].suffixonly == [true]
bests[11].langbase == [is]
comp = comp_parms0_off
bests[11].rows == [1]
R->R W->R R->W W->W
0.0085050.669878-0.7383730.077434
*/
//iteration:14.-1
/* number of nodes: 88858, nodes/line: 1.20004915909585078e-01 weight: 8.44399637102287234e+04 blobs 1 lines 5881633 * fraction 1.25892541179416839e-01 = 740453 lines*/
{ // # decisions
8.50547688621742723e-03, 6.69877760720549498e-01, -7.38373491250877478e-01, 7.74340362692699236e-02, //1177883
-7.82684292973299223e-01, 4.59948960180274258e-01, 3.92892970820137744e-01, -1.46585691805502294e-01, //0
-6.01147073676968957e-01, -5.82239786374368462e-01, -5.42665925463000409e-01, -7.16430060350067843e-02, //0
-1.61106021629986690e-01, -2.66002045826051053e-02, 7.71582415526826937e-02, 9.83556752135442247e-01 //0
} //(0 unresolved comparisons)
// Same as
//iteration:13.11
/* number of nodes: 74744, nodes/line: 1.42586526923832640e-01 weight: 7.11594670680982090e+04 blobs 1 lines 5881633 * fraction 8.91250938133746201e-02 = 524201 lines*/
};
#endif
static bestParms best_isC0 =
{
false,
"isC0",
1,
/* SINGLE SHOT
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 2 2831993.000000 471048.500000 60440.000000 16.633110 2.134186 36063.500000 712.500000 528.500000 83.000000 3989.500000 36357.500000 343.000000 254.000000 0.000000 4422.500000 0.176020 0.158931 0.029052 0.006836 0.052977 0.227289 0.092283 0.003418 0.000000 0.131588 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 87.868864 0.828963 0.613868 0.000000 10.688305 2.130411 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.775793 34.191701 1.354617 63.677889 0.466112 0.038109
1 0.985600 2 2831993.000000 128789.000000 32445.000000 4.547645 1.145660 36520.000000 0.000000 0.000000 0.000000 4857.000000 36779.500000 343.500000 318.000000 0.000000 3936.000000 0.051268 0.000000 0.000000 0.000000 0.051268 0.278557 0.083738 0.064940 0.000000 0.259758 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 88.888755 0.830171 0.768543 0.000000 9.512531 1.969693 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.679121 34.255746 1.290572 63.774561 0.487226 0.036307
2 0.985600 2 2831993.000000 84003.000000 14192.000000 2.966215 0.501131 36089.500000 0.000000 0.000000 0.000000 5287.500000 36974.500000 310.500000 289.500000 0.000000 3802.500000 0.063231 0.000000 0.000000 0.000000 0.063231 0.295646 0.090574 0.029052 0.000000 0.234124 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 89.360031 0.750417 0.699664 0.000000 9.189888 1.688136 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.607826 34.466008 1.080310 63.845856 0.470526 0.030392
3 0.985600 2 2831993.000000 63704.000000 10130.000000 2.249441 0.357699 35649.000000 0.000000 0.000000 0.000000 5728.000000 36863.000000 286.500000 274.000000 0.000000 3953.500000 0.017089 0.000000 0.000000 0.000000 0.017089 0.283684 0.083738 0.013671 0.000000 0.213617 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.090558 0.692414 0.662204 0.000000 9.554825 1.579380 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.566740 34.533678 1.012640 63.886942 0.471847 0.028488
4 0.985600 2 2831993.000000 51623.000000 8154.000000 1.822851 0.287924 35334.000000 0.000000 0.000000 0.000000 6043.000000 36739.500000 269.500000 257.500000 0.000000 4110.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.275139 0.029052 0.008545 0.000000 0.237542 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 88.792083 0.651328 0.622326 0.000000 9.934263 1.498417 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.547406 34.595307 0.951011 63.906276 0.464855 0.026754
5 0.985600 2 2831993.000000 43987.500000 6904.000000 1.553235 0.243786 35034.500000 0.000000 0.000000 0.000000 6342.500000 36631.500000 255.000000 239.500000 0.000000 4251.000000 0.093992 0.000000 0.000000 0.000000 0.093992 0.305900 0.017089 0.015380 0.000000 0.273430 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.531068 0.616284 0.578824 0.000000 10.273824 1.407787 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.526863 34.665394 0.880924 63.926819 0.455340 0.024782
cutoff 0 Affix a 11.009406 b 0.000000: N(rules)= 60440.000000*N(trainpairs)^0.000000
Suffix a -0.112324 b 0.878094: N(rules)= 0.893755*N(trainpairs)^0.878094
cutoff 1 Affix a 10.387302 b 0.000000: N(rules)= 32445.000000*N(trainpairs)^0.000000
Suffix a -0.733958 b 0.836893: N(rules)= 0.480005*N(trainpairs)^0.836893
cutoff 2 Affix a 9.560434 b 0.000000: N(rules)= 14192.000000*N(trainpairs)^0.000000
Suffix a -1.605792 b 0.868583: N(rules)= 0.200730*N(trainpairs)^0.868583
cutoff 3 Affix a 9.223257 b 0.000000: N(rules)= 10130.000000*N(trainpairs)^0.000000
Suffix a -2.263068 b 0.896039: N(rules)= 0.104031*N(trainpairs)^0.896039
cutoff 4 Affix a 9.006264 b 0.000000: N(rules)= 8154.000000*N(trainpairs)^0.000000
Suffix a -2.871889 b 0.924806: N(rules)= 0.056592*N(trainpairs)^0.924806
cutoff 5 Affix a 8.839856 b 0.000000: N(rules)= 6904.000000*N(trainpairs)^0.000000
Suffix a -3.315163 b 0.944783: N(rules)= 0.036328*N(trainpairs)^0.944783
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training no
cutoff 2
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 14192.000000 ( 84003.000000)
rules% 0.501131 ( 2.966215)
same%stdev 0.295646
ambi1%stdev 0.090574
ambi2%stdev 0.029052
ambi3%stdev 0.000000
diff%stdev 0.234124
same% 89.360031 ( 87.221162)
ambi1% 0.750417 ( 0.000000)
ambi2% 0.699664 ( 0.000000)
ambi3% 0.000000 ( 0.000000)
diff% 9.189888 ( 12.778838)
amb.rules% 1.688136 ( 0.000000)
false_amb% 0.607826 ( 0.000000)
false_not_amb% 34.466008 ( 35.546318)
true_amb% 1.080310 ( 0.000000)
true_not_amb% 63.845856 ( 64.453682)
precision 0.470526 ( 0.000000)
recall 0.030392 ( 0.000000)
bests[16].suffixonly == [false]
bests[16].langbase == [isC0]
comp = comp_parms0_off
bests[16].rows == [1]
R->R W->R R->W W->W
0.247669 0.636764 -0.729558 0.022230 -0.004214 0.020625
*/
/* REDO
cutoff fraction iterations trainlines suffixrules affixrules suffix% affix% s-same s-ambiguous s-different a-same a-ambiguous a-different s-same-stddev% s-amb-stddev% s-diff-stddev% a-same-stddev% a-amb-stddev% a-diff-stddev% s-same% s-ambiguous% s-different% s-amb.rules% a-same% a-ambiguous% a-different% a-amb.rules% s_false_amb s_false_not_amb s_true_amb s_true_not_amb s_precision s_recall a_false_amb a_false_not_amb a_true_amb a_true_not_amb a_precision a_recall
0 0.985600 2 2831993.000000 471048.500000 60440.000000 16.633110 2.134186 36063.500000 712.500000 528.500000 83.000000 3989.500000 38009.500000 563.500000 543.000000 0.000000 2261.000000 0.176020 0.158931 0.029052 0.006836 0.052977 5.419039 0.845924 0.991183 0.000000 7.256146 87.158325 1.721971 1.277280 0.200595 9.641830 3.905551 91.861421 1.361868 1.312323 0.000000 5.464388 3.111632 1.841603 33.482369 2.063949 62.612079 0.359125 0.058064 0.374604 32.809290 2.737028 64.079078 0.785095 0.076999
1 0.985600 2 2831993.000000 128789.000000 32445.000000 4.547645 1.145660 36520.000000 0.000000 0.000000 0.000000 4857.000000 37620.000000 466.500000 460.000000 0.000000 2830.500000 0.051268 0.000000 0.000000 0.000000 0.051268 2.594166 0.504136 0.420398 0.000000 3.518701 88.261595 0.000000 0.000000 0.000000 11.738405 0.000000 90.920076 1.127438 1.111729 0.000000 6.840757 2.505015 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.546197 33.587500 1.958818 63.907485 0.641980 0.055106
2 0.985600 2 2831993.000000 84003.000000 14192.000000 2.966215 0.501131 36089.500000 0.000000 0.000000 0.000000 5287.500000 37357.500000 328.500000 317.500000 0.000000 3373.500000 0.063231 0.000000 0.000000 0.000000 0.063231 1.013400 0.152095 0.066649 0.000000 1.232143 87.221162 0.000000 0.000000 0.000000 12.778838 0.000000 90.285666 0.793919 0.767335 0.000000 8.153080 1.789642 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.541364 34.298040 1.248278 63.912318 0.535511 0.035117
3 0.985600 2 2831993.000000 63704.000000 10130.000000 2.249441 0.357699 35649.000000 0.000000 0.000000 0.000000 5728.000000 37147.500000 295.000000 281.500000 0.000000 3653.000000 0.017089 0.000000 0.000000 0.000000 0.017089 0.688702 0.112790 0.011963 0.000000 0.813454 86.156560 0.000000 0.000000 0.000000 13.843440 0.000000 89.778138 0.712956 0.680330 0.000000 8.828576 1.604756 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.513570 34.455132 1.091186 63.940112 0.515117 0.030698
4 0.985600 2 2831993.000000 51623.000000 8154.000000 1.822851 0.287924 35334.000000 0.000000 0.000000 0.000000 6043.000000 36962.000000 285.000000 261.500000 0.000000 3868.500000 0.061522 0.000000 0.000000 0.000000 0.061522 0.485338 0.082029 0.022216 0.000000 0.589583 85.395268 0.000000 0.000000 0.000000 14.604732 0.000000 89.329821 0.688788 0.631994 0.000000 9.349397 1.540711 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.525654 34.531261 1.015057 63.928028 0.491228 0.028556
5 0.985600 2 2831993.000000 43987.500000 6904.000000 1.553235 0.243786 35034.500000 0.000000 0.000000 0.000000 6342.500000 36809.500000 273.000000 235.500000 0.000000 4059.000000 0.093992 0.000000 0.000000 0.000000 0.093992 0.302482 0.078611 0.001709 0.000000 0.382802 84.671436 0.000000 0.000000 0.000000 15.328564 0.000000 88.961259 0.659787 0.569157 0.000000 9.809798 1.437997 0.000000 35.546318 0.000000 64.453682 0.000000 0.000000 0.506320 34.614641 0.931677 63.947362 0.479180 0.026210
cutoff 0 Affix a 11.009406 b 0.000000: N(rules)= 60440.000000*N(trainpairs)^0.000000
Suffix a -0.112324 b 0.878094: N(rules)= 0.893755*N(trainpairs)^0.878094
cutoff 1 Affix a 10.387302 b 0.000000: N(rules)= 32445.000000*N(trainpairs)^0.000000
Suffix a -0.733958 b 0.836893: N(rules)= 0.480005*N(trainpairs)^0.836893
cutoff 2 Affix a 9.560434 b 0.000000: N(rules)= 14192.000000*N(trainpairs)^0.000000
Suffix a -1.605792 b 0.868583: N(rules)= 0.200730*N(trainpairs)^0.868583
cutoff 3 Affix a 9.223257 b 0.000000: N(rules)= 10130.000000*N(trainpairs)^0.000000
Suffix a -2.263068 b 0.896039: N(rules)= 0.104031*N(trainpairs)^0.896039
cutoff 4 Affix a 9.006264 b 0.000000: N(rules)= 8154.000000*N(trainpairs)^0.000000
Suffix a -2.871889 b 0.924806: N(rules)= 0.056592*N(trainpairs)^0.924806
cutoff 5 Affix a 8.839856 b 0.000000: N(rules)= 6904.000000*N(trainpairs)^0.000000
Suffix a -3.315163 b 0.944783: N(rules)= 0.036328*N(trainpairs)^0.944783
New (old) algorithm, least wrongly lemmatised (MIN(diff)).
Suffix only no
Redo training yes
cutoff 0
fraction 9856.000000
iterations 2
trainlines 2831993.000000
rules 60440.000000 ( 471048.500000)
rules% 2.134186 ( 16.633110)
same%stdev 5.419039
ambi1%stdev 0.845924
ambi2%stdev 0.991183
ambi3%stdev 0.000000
diff%stdev 7.256146
same% 91.861421 ( 87.158325)
ambi1% 1.361868 ( 1.721971)
ambi2% 1.312323 ( 1.277280)
ambi3% 0.000000 ( 0.200595)
diff% 5.464388 ( 9.641830)
amb.rules% 3.111632 ( 3.905551)
false_amb% 0.374604 ( 1.841603)
false_not_amb% 32.809290 ( 33.482369)
true_amb% 2.737028 ( 2.063949)
true_not_amb% 64.079078 ( 62.612079)
precision 0.785095 ( 0.359125)
recall 0.076999 ( 0.058064)
bests[16].suffixonly == [false]
bests[16].langbase == [isC0]
comp = comp_parms0_off
bests[16].rows == [1]
R->R W->R R->W W->W
0.247669 0.636764 -0.729558 0.022230 -0.004214 0.020625
*/
//iteration:20.6
/*weight ( used): 1.05436295090904787e+04 suffix only: no */
/* number of nodes: 336797, nodes/line: inf weight ( used): 1.05436295090904787e+04 blobs 1 lines 0 * fraction 1.00000000000000000e+00 = 0 lines*/
{{
2.47669087481595079e-01, 6.36764047976876468e-01, -7.29557569755324042e-01, 2.22303428808458027e-02, -4.21447897318842114e-03, 2.06245665463890698e-02
}}
};
#if 1
// Best (suffix + affix) parameters found for Icelandic ("is"); same
// structure as best_is_suffix but trained without the suffix-only limit.
static bestParms best_is =
{
false,
"is",
1,
//iteration:18.2
/*weight (not used): 1.34340843669173279e+05 suffix only: no */
/* number of nodes: 145852, nodes/line: 1.01285478076873131e-01 weight (not used): 1.34340843669173279e+05 blobs 2809220 lines 2873370 * fraction 5.01187233627272799e-01 = 1440009 lines*/
{{
0.00000000000000000e+00, 6.96451349087997107e-01, -7.13849249589145862e-01, 7.33128038921041919e-02
}}
};
#elif 1
static bestParms best_is =
{
false,