diff --git a/grobid-home/models/citation-BidLSTM_CRF_FEATURES/model_weights.hdf5 b/grobid-home/models/citation-BidLSTM_CRF_FEATURES/model_weights.hdf5 index 9242b66524..0619143b63 100644 Binary files a/grobid-home/models/citation-BidLSTM_CRF_FEATURES/model_weights.hdf5 and b/grobid-home/models/citation-BidLSTM_CRF_FEATURES/model_weights.hdf5 differ diff --git a/grobid-home/models/header-BidLSTM_CRF_FEATURES/model_weights.hdf5 b/grobid-home/models/header-BidLSTM_CRF_FEATURES/model_weights.hdf5 index 80661ad181..679d81e95c 100644 Binary files a/grobid-home/models/header-BidLSTM_CRF_FEATURES/model_weights.hdf5 and b/grobid-home/models/header-BidLSTM_CRF_FEATURES/model_weights.hdf5 differ diff --git a/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.7.3-SNAPSHOT-Glutton-DeLFT_BidLSTM-CRF_citation_reference_segmenter-header-02.12.2022 b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.7.3-SNAPSHOT-Glutton-DeLFT_BidLSTM-CRF_citation_reference_segmenter-header-02.12.2022 new file mode 100644 index 0000000000..f4b0ce078a --- /dev/null +++ b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.7.3-SNAPSHOT-Glutton-DeLFT_BidLSTM-CRF_citation_reference_segmenter-header-02.12.2022 @@ -0,0 +1,393 @@ +-------------> GROBID failed on 0 PDF + +1943 PDF files processed in 1920.476 seconds, 0.988407617086979 seconds per PDF file + + + +Evaluation header 100% │█████████████│ 1943/1943 (0:00:55 / 0:00:00) + + +Evaluation citation 100% │███████████│ 1943/1943 (0:13:12 / 0:00:00) + + +Evaluation full text 100% │██████████│ 1943/1943 (0:00:26 / 0:00:00) +Evaluation metrics produced in 874.772 seconds +> :grobid-trainer:jatsEval +======= Header metadata ======= + +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 82.47 16.69 16.38 16.53 1911 +authors 98.61 93.55 93.46 93.51 1941 +first_author 99.29 96.75 96.65 96.7 1941 +keywords 93.69 61.6 59.86 60.71 1380 +title 97.15 86.84 86.57 86.7 1943 + +all (micro avg.) 94.24 72.1 71.42 71.76 9116 +all (macro avg.) 94.24 71.09 70.58 70.83 9116 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 91.93 62.83 61.64 62.23 1911 +authors 98.74 94.17 94.08 94.12 1941 +first_author 99.32 96.91 96.81 96.86 1941 +keywords 94.93 70.02 68.04 69.02 1380 +title 98.62 93.8 93.52 93.66 1943 + +all (micro avg.) 96.71 84.59 83.8 84.19 9116 +all (macro avg.) 96.71 83.55 82.82 83.18 9116 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 97.39 89.44 87.76 88.59 1911 +authors 99.23 96.49 96.39 96.44 1941 +first_author 99.39 97.22 97.11 97.16 1941 +keywords 96.76 82.55 80.22 81.37 1380 +title 99.58 98.35 98.04 98.2 1943 + +all (micro avg.) 98.47 93.51 92.64 93.07 9116 +all (macro avg.) 98.47 92.81 91.91 92.35 9116 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 96.6 85.6 83.99 84.79 1911 +authors 99.03 95.51 95.41 95.46 1941 +first_author 99.29 96.75 96.65 96.7 1941 +keywords 95.95 77.03 74.86 75.93 1380 +title 99.32 97.11 96.81 96.96 1943 + +all (micro avg.) 98.04 91.32 90.47 90.89 9116 +all (macro avg.) 98.04 90.4 89.54 89.97 9116 + +===== Instance-level results ===== + +Total expected instances: 1943 +Total correct instances: 215 (strict) +Total correct instances: 883 (soft) +Total correct instances: 1392 (Levenshtein) +Total correct instances: 1260 (ObservedRatcliffObershelp) + +Instance-level recall: 11.07 (strict) +Instance-level recall: 45.45 (soft) +Instance-level recall: 71.64 (Levenshtein) +Instance-level recall: 64.85 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.67 83.65 76 79.64 85778 +date 99.17 94.35 83.65 88.68 87067 +first_author 98.57 90.12 81.85 85.78 85778 +inTitle 96.16 73 71.05 72.01 81007 +issue 99.6 88.5 82.98 85.65 16635 +page 98.8 95.56 84.83 89.87 80501 +title 97.27 80.16 75.04 77.52 80736 +volume 99.36 95.63 89.18 92.29 80067 + +all (micro avg.) 98.32 87.38 80.3 83.69 597569 +all (macro avg.) 98.32 87.62 80.57 83.93 597569 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.74 84.15 76.44 80.11 85778 +date 99.17 94.35 83.65 88.68 87067 +first_author 98.6 90.3 82.02 85.96 85778 +inTitle 97.85 84.91 82.64 83.76 81007 +issue 99.6 88.5 82.98 85.65 16635 +page 98.8 95.56 84.83 89.87 80501 +title 98.84 91.64 85.78 88.62 80736 +volume 99.36 95.63 89.18 92.29 80067 + +all (micro avg.) 98.74 90.77 83.41 86.93 597569 +all (macro avg.) 98.74 90.63 83.44 86.87 597569 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.54 89.88 81.65 85.57 85778 +date 99.17 94.35 83.65 88.68 87067 +first_author 98.63 90.52 82.22 86.17 85778 +inTitle 98.04 86.21 83.91 85.05 81007 +issue 99.6 88.5 82.98 85.65 16635 +page 98.8 95.56 84.83 89.87 80501 +title 99.16 93.98 87.98 90.88 80736 +volume 99.36 95.63 89.18 92.29 80067 + +all (micro avg.) 98.91 92.12 84.66 88.23 597569 +all (macro avg.) 98.91 91.83 84.55 88.02 597569 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.09 86.68 78.75 82.52 85778 +date 99.17 94.35 83.65 88.68 87067 +first_author 98.58 90.14 81.86 85.8 85778 +inTitle 97.65 83.5 81.27 82.37 81007 +issue 99.6 88.5 82.98 85.65 16635 +page 98.8 95.56 84.83 89.87 80501 +title 99.1 93.56 87.58 90.47 80736 +volume 99.36 95.63 89.18 92.29 80067 + +all (micro avg.) 98.79 91.16 83.78 87.32 597569 +all (macro avg.) 98.79 90.99 83.76 87.21 597569 + +===== Instance-level results ===== + +Total expected instances: 90125 +Total extracted instances: 85176 +Total correct instances: 38757 (strict) +Total correct instances: 50856 (soft) +Total correct instances: 55697 (Levenshtein) +Total correct instances: 52253 (RatcliffObershelp) + +Instance-level precision: 45.5 (strict) +Instance-level precision: 59.71 (soft) +Instance-level precision: 65.39 (Levenshtein) +Instance-level precision: 61.35 (RatcliffObershelp) + +Instance-level recall: 43 (strict) +Instance-level recall: 56.43 (soft) +Instance-level recall: 61.8 (Levenshtein) +Instance-level recall: 57.98 (RatcliffObershelp) + +Instance-level f-score: 44.22 (strict) +Instance-level f-score: 58.02 (soft) +Instance-level f-score: 63.54 (Levenshtein) +Instance-level f-score: 59.62 (RatcliffObershelp) + +Matching 1 : 67684 + +Matching 2 : 4228 + +Matching 3 : 2051 + +Matching 4 : 735 + +Total matches : 74698 + +======= Citation context resolution ======= + +Total expected references: 90125 - 46.38 references per article +Total predicted references: 85176 - 43.84 references per article + +Total expected citation contexts: 139835 - 71.97 citation contexts per article +Total predicted citation contexts: 116677 - 60.05 citation contexts per article + +Total correct predicted citation contexts: 98449 - 50.67 citation contexts per article +Total wrong predicted citation contexts: 18228 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 84.38 +Recall citation contexts: 70.4 +fscore citation contexts: 76.76 + +======= Fulltext structures ======= + +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 96.75 32.15 25.43 28.4 7058 +reference_citation 59.13 57.48 58.63 58.05 134196 +reference_figure 95.02 64.47 63.07 63.76 19330 +reference_table 99.11 82.72 83.62 83.17 7327 +section_title 94.12 71.58 67.65 69.56 27619 +table_title 98.84 57.56 54.84 56.16 3784 + +all (micro avg.) 90.49 60.23 59.98 60.11 199314 +all (macro avg.) 90.49 60.99 58.87 59.85 199314 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 98.57 78.83 62.35 69.63 7058 +reference_citation 61.84 61.72 62.96 62.34 134196 +reference_figure 94.9 65.04 63.62 64.32 19330 +reference_table 99.08 82.87 83.77 83.32 7327 +section_title 94.8 76.13 71.95 73.98 27619 +table_title 99.44 81.78 77.91 79.79 3784 + +all (micro avg.) 91.44 65.57 65.3 65.43 199314 +all (macro avg.) 91.44 74.39 70.43 72.23 199314 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 140 + CONTENT_SIZE_TOO_SMALL: 78 + CONTENT_WIDTH_TOO_SMALL: 20 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 1991 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 148 + HEADER_NOT_CONSECUTIVE: 983 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 11 + HEADER_AND_CONTENT_INTERSECT: 564 + FEW_TOKENS_IN_HEADER: 1 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + UNMATCHED_REF_MARKERS: 9998 + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 3270 + STYLE_AUTHORS: 37197 + STYLE_NUMBERED: 52094 + MANY_CANDIDATES: 4776 + MANY_CANDIDATES_AFTER_POST_FILTERING: 604 + NO_CANDIDATES: 18385 + INPUT_REF_STRINGS_CNT: 91415 + MATCHED_REF_MARKERS: 116677 + NO_CANDIDATES_AFTER_POST_FILTERING: 500 + STYLE_OTHER: 2124 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 659 + SKIPPED_DUE_TO_MISMATCH_OF_CAPTIONS_AND_VECTOR_AND_BITMAP_GRAPHICS: 3 + SKIPPED_SMALL_STANDALONE_FIGURES: 526 + SKIPPED_BIG_STANDALONE_FIGURES: 133 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 2897 + CITATION_TITLE: 81615 + HEADER_DATE: 1131 + HEADER_KEYWORD: 1429 + NAME-HEADER_MIDDLENAME: 5839 + TABLE_FIGDESC: 4343 + NAME-HEADER_SURNAME: 13903 + NAME-CITATION_OTHER: 437103 + CITATION_BOOKTITLE: 7123 + HEADER_FUNDING: 148 + HEADER_ADDRESS: 6017 + HEADER_AFFILIATION: 6186 + CITATION_NOTE: 2842 + FULLTEXT_CITATION_MARKER: 181115 + TABLE_NOTE: 2999 + HEADER_EMAIL: 2210 + FULLTEXT_TABLE_MARKER: 14699 + CITATION_WEB: 1375 + HEADER_GROUP: 4 + TABLE_LABEL: 3399 + FULLTEXT_SECTION: 55218 + NAME-HEADER_FORENAME: 14116 + DATE_YEAR: 86760 + TABLE_CONTENT: 5357 + CITATION_COLLABORATION: 42 + CITATION_ISSUE: 17151 + HEADER_MEETING: 24 + HEADER_EDITOR: 114 + CITATION_SERIES: 224 + CITATION_JOURNAL: 77697 + NAME-CITATION_SURNAME: 330684 + TABLE_FIGURE_HEAD: 4837 + FULLTEXT_EQUATION_MARKER: 1651 + CITATION_OTHER: 450295 + FULLTEXT_FIGURE_MARKER: 37742 + HEADER_TITLE: 2041 + CITATION_TECH: 383 + FIGURE_CONTENT: 3283 + FIGURE_LABEL: 5990 + FULLTEXT_EQUATION_LABEL: 1962 + HEADER_OTHER: 10807 + FULLTEXT_EQUATION: 4418 + TABLE_OTHER: 1 + CITATION_DATE: 86066 + CITATION_AUTHOR: 86094 + FULLTEXT_FIGURE: 14313 + FULLTEXT_TABLE: 10073 + CITATION_EDITOR: 2699 + FULLTEXT_OTHER: 509 + HEADER_SUBMISSION: 1207 + NAME-HEADER_OTHER: 17369 + FIGURE_FIGDESC: 7505 + NAME-HEADER_SUFFIX: 20 + HEADER_AVAILABILITY: 13 + CITATION_VOLUME: 76292 + CITATION_LOCATION: 7896 + NAME-CITATION_SUFFIX: 394 + NAME-HEADER_TITLE: 735 + DATE_MONTH: 3107 + HEADER_WEB: 344 + HEADER_ABSTRACT: 2305 + CITATION_INSTITUTION: 1685 + HEADER_REFERENCE: 3047 + CITATION_PAGES: 80522 + HEADER_AUTHOR: 4272 + NAME-HEADER_MARKER: 8104 + DATE_OTHER: 4721 + NAME-CITATION_FORENAME: 319284 + CITATION_PUBLISHER: 6061 + HEADER_PUBNUM: 1730 + NAME-CITATION_MIDDLENAME: 66214 + CITATION_PUBNUM: 10886 + HEADER_COPYRIGHT: 2379 + FULLTEXT_PARAGRAPH: 381002 + FIGURE_FIGURE_HEAD: 9715 + DATE_DAY: 2836 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 491 + ASSIGNED_GRAPHICS_TO_FIGURES: 3777 +==================================================================================== +==================================================================================== diff --git a/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.7-3-SNAPSHOT-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-03.12.2022 b/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.7-3-SNAPSHOT-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-03.12.2022 new file mode 100644 index 0000000000..0c1e87c337 --- /dev/null +++ b/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.7-3-SNAPSHOT-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-03.12.2022 @@ -0,0 +1,401 @@ +PDF processing 100% │████████████████│ 2000/2000 (0:37:54 / 0:00:00) + +-------------> GROBID failed on 0 PDF + +2000 PDF files processed in 2292.139 seconds, 1.1460695 seconds per PDF file + +Evaluation metrics produced in 988.221 seconds + +======= Header metadata ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 78.01 2.22 2.16 2.19 1990 +authors 96.19 84.02 83.09 83.55 1999 +first_author 99.05 96.81 95.84 96.33 1997 +keywords 95.68 57.71 58.4 58.06 839 +title 95.31 79.96 79.2 79.58 2000 + +all (micro avg.) 92.85 65.24 64.5 64.87 8825 +all (macro avg.) 92.85 64.14 63.74 63.94 8825 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 90.43 58.94 57.49 58.2 1990 +authors 96.3 84.52 83.59 84.05 1999 +first_author 99.11 97.07 96.09 96.58 1997 +keywords 96.19 63.02 63.77 63.39 839 +title 96.11 83.63 82.75 83.19 2000 + +all (micro avg.) 95.63 79.38 78.46 78.91 8825 +all (macro avg.) 95.63 77.43 76.74 77.08 8825 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 95.02 79.91 77.94 78.91 1990 +authors 98.09 92.56 91.55 92.05 1999 +first_author 99.16 97.32 96.34 96.83 1997 +keywords 97.64 78.21 79.14 78.67 839 +title 97.88 91.56 90.6 91.08 2000 + +all (micro avg.) 97.56 89.2 88.17 88.68 8825 +all (macro avg.) 97.56 87.91 87.11 87.51 8825 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 94.3 76.61 74.72 75.66 1990 +authors 97.13 88.27 87.29 87.78 1999 +first_author 99.05 96.81 95.84 96.33 1997 +keywords 96.84 69.85 70.68 70.26 839 +title 97.26 88.78 87.85 88.31 2000 + +all (micro avg.) 96.92 85.93 84.94 85.43 8825 +all (macro avg.) 96.92 84.06 83.28 83.67 8825 + +===== Instance-level results ===== + +Total expected instances: 2000 +Total correct instances: 34 (strict) +Total correct instances: 724 (soft) +Total correct instances: 1197 (Levenshtein) +Total correct instances: 1037 (ObservedRatcliffObershelp) + +Instance-level recall: 1.7 (strict) +Instance-level recall: 36.2 (soft) +Instance-level recall: 59.85 (Levenshtein) +Instance-level recall: 51.85 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.35 87.8 82.3 84.96 97183 +date 98.76 91.26 85.11 88.08 97630 +doi 99.12 70.44 82.73 76.09 16894 +first_author 99.25 94.68 88.66 91.57 97183 +inTitle 97.59 82.07 78.31 80.14 96430 +issue 99.54 94.17 89.63 91.84 30312 +page 97.67 96.43 79.55 87.18 88597 +pmcid 99.94 65.54 79.18 71.72 807 +pmid 99.87 69.23 82.9 75.45 2093 +title 97.94 84.56 82.76 83.65 92463 +volume 99.38 95.72 94.54 95.13 87709 + +all (micro avg.) 98.86 89.61 84.57 87.02 707301 +all (macro avg.) 98.86 84.72 84.15 84.16 707301 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.5 88.98 83.4 86.1 97183 +date 98.76 91.26 85.11 88.08 97630 +doi 99.26 75.23 88.36 81.27 16894 +first_author 99.31 95.12 89.07 92 97183 +inTitle 98.86 91.49 87.3 89.35 96430 +issue 99.54 94.17 89.63 91.84 30312 +page 97.67 96.43 79.55 87.18 88597 +pmcid 99.96 74.87 90.46 81.93 807 +pmid 99.88 72.83 87.2 79.37 2093 +title 99.02 92.72 90.75 91.73 92463 +volume 99.38 95.72 94.54 95.13 87709 + +all (micro avg.) 99.1 92.41 87.21 89.73 707301 +all (macro avg.) 99.1 88.07 87.76 87.63 707301 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 99.19 94.17 88.27 91.13 97183 +date 98.76 91.26 85.11 88.08 97630 +doi 99.32 77.53 91.06 83.75 16894 +first_author 99.33 95.28 89.23 92.15 97183 +inTitle 98.99 92.51 88.27 90.34 96430 +issue 99.54 94.17 89.63 91.84 30312 +page 97.67 96.43 79.55 87.18 88597 +pmcid 99.96 74.87 90.46 81.93 807 +pmid 99.88 72.83 87.2 79.37 2093 +title 99.4 95.61 93.58 94.58 92463 +volume 99.38 95.72 94.54 95.13 87709 + +all (micro avg.) 99.22 93.74 88.47 91.03 707301 +all (macro avg.) 99.22 89.12 88.81 88.68 707301 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.79 91.17 85.46 88.22 97183 +date 98.76 91.26 85.11 88.08 97630 +doi 99.28 76.1 89.38 82.21 16894 +first_author 99.26 94.72 88.71 91.62 97183 +inTitle 98.68 90.19 86.06 88.07 96430 +issue 99.54 94.17 89.63 91.84 30312 +page 97.67 96.43 79.55 87.18 88597 +pmcid 99.94 65.54 79.18 71.72 807 +pmid 99.87 69.23 82.9 75.45 2093 +title 99.3 94.88 92.86 93.86 92463 +volume 99.38 95.72 94.54 95.13 87709 + +all (micro avg.) 99.14 92.76 87.55 90.08 707301 +all (macro avg.) 99.14 87.22 86.67 86.67 707301 + +===== Instance-level results ===== + +Total expected instances: 98799 +Total extracted instances: 98106 +Total correct instances: 43435 (strict) +Total correct instances: 54390 (soft) +Total correct instances: 58658 (Levenshtein) +Total correct instances: 55355 (RatcliffObershelp) + +Instance-level precision: 44.27 (strict) +Instance-level precision: 55.44 (soft) +Instance-level precision: 59.79 (Levenshtein) +Instance-level precision: 56.42 (RatcliffObershelp) + +Instance-level recall: 43.96 (strict) +Instance-level recall: 55.05 (soft) +Instance-level recall: 59.37 (Levenshtein) +Instance-level recall: 56.03 (RatcliffObershelp) + +Instance-level f-score: 44.12 (strict) +Instance-level f-score: 55.24 (soft) +Instance-level f-score: 59.58 (Levenshtein) +Instance-level f-score: 56.23 (RatcliffObershelp) + +Matching 1 : 77974 + +Matching 2 : 4494 + +Matching 3 : 4656 + +Matching 4 : 2334 + +Total matches : 89458 + +======= Citation context resolution ======= + +Total expected references: 98797 - 49.4 references per article +Total predicted references: 98106 - 49.05 references per article + +Total expected citation contexts: 142862 - 71.43 citation contexts per article +Total predicted citation contexts: 137088 - 68.54 citation contexts per article + +Total correct predicted citation contexts: 116658 - 58.33 citation contexts per article +Total wrong predicted citation contexts: 20430 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 85.1 +Recall citation contexts: 81.66 +fscore citation contexts: 83.34 + +======= Fulltext structures ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.88 0 0 0 0 +figure_title 92.94 4.19 3.55 3.84 13172 +funding_stmt 99.84 0 0 0 0 +reference_citation 74.32 71.09 71.31 71.2 147470 +reference_figure 91.78 74.25 66.19 69.99 47984 +reference_table 98.13 48.8 80.69 60.82 5957 +section_title 94.45 72.68 69.35 70.98 32399 +table_title 98.38 4.37 3.92 4.13 2961 + +all (micro avg.) 91.67 67.18 65.93 66.55 249943 +all (macro avg.) 91.67 45.9 49.17 46.83 249943 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.87 0 0 0 0 +figure_title 97.03 68.63 58.07 62.91 13172 +funding_stmt 99.83 0 0 0 0 +reference_citation 83.68 83.1 83.36 83.23 147470 +reference_figure 91.23 74.97 66.84 70.67 47984 +reference_table 97.99 49.21 81.38 61.34 5957 +section_title 94.75 76.6 73.09 74.8 32399 +table_title 99.04 50.77 45.53 48.01 2961 + +all (micro avg.) 93.95 78.49 77.03 77.75 249943 +all (macro avg.) 93.95 67.21 68.04 66.83 249943 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 189 + CONTENT_SIZE_TOO_SMALL: 58 + CONTENT_WIDTH_TOO_SMALL: 2 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 2884 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 191 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 91 + HEADER_NOT_CONSECUTIVE: 368 + HEADER_AND_CONTENT_INTERSECT: 205 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 3212 + UNMATCHED_REF_MARKERS: 6290 + STYLE_AUTHORS: 40616 + STYLE_NUMBERED: 55630 + MANY_CANDIDATES: 6038 + MANY_CANDIDATES_AFTER_POST_FILTERING: 657 + NO_CANDIDATES: 7307 + INPUT_REF_STRINGS_CNT: 98077 + MATCHED_REF_MARKERS: 137088 + NO_CANDIDATES_AFTER_POST_FILTERING: 974 + STYLE_OTHER: 1831 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 5293 + SKIPPED_DUE_TO_MISMATCH_OF_CAPTIONS_AND_VECTOR_AND_BITMAP_GRAPHICS: 16 + SKIPPED_SMALL_STANDALONE_FIGURES: 2699 + SKIPPED_BIG_STANDALONE_FIGURES: 2594 + TOO_MANY_FIGURES_PER_PAGE: 5 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 99 + CITATION_TITLE: 96066 + HEADER_DATE: 158 + HEADER_KEYWORD: 1003 + NAME-HEADER_MIDDLENAME: 5296 + TABLE_FIGDESC: 3093 + NAME-HEADER_SURNAME: 13687 + NAME-CITATION_OTHER: 633110 + CITATION_BOOKTITLE: 4989 + HEADER_FUNDING: 102 + HEADER_ADDRESS: 7460 + HEADER_AFFILIATION: 7901 + FULLTEXT_SECTION_MARKER: 2 + CITATION_NOTE: 1931 + FULLTEXT_CITATION_MARKER: 191792 + TABLE_NOTE: 2028 + HEADER_EMAIL: 2607 + FULLTEXT_TABLE_MARKER: 19482 + CITATION_WEB: 7136 + HEADER_GROUP: 8 + TABLE_LABEL: 2300 + FULLTEXT_SECTION: 68125 + DATE_YEAR: 100832 + NAME-HEADER_FORENAME: 14070 + TABLE_CONTENT: 4699 + CITATION_COLLABORATION: 145 + CITATION_ISSUE: 30567 + HEADER_MEETING: 1 + CITATION_SERIES: 101 + CITATION_JOURNAL: 90753 + NAME-CITATION_SURNAME: 406137 + TABLE_FIGURE_HEAD: 4717 + FULLTEXT_EQUATION_MARKER: 3831 + CITATION_OTHER: 543253 + FULLTEXT_FIGURE_MARKER: 84993 + HEADER_TITLE: 2086 + CITATION_TECH: 530 + FIGURE_CONTENT: 4942 + FIGURE_LABEL: 12326 + FULLTEXT_EQUATION_LABEL: 6817 + HEADER_OTHER: 7714 + FULLTEXT_EQUATION: 16943 + CITATION_DATE: 101308 + CITATION_AUTHOR: 97965 + FULLTEXT_FIGURE: 32007 + FULLTEXT_TABLE: 9187 + CITATION_EDITOR: 936 + FULLTEXT_OTHER: 931 + HEADER_SUBMISSION: 46 + NAME-HEADER_OTHER: 16306 + FIGURE_FIGDESC: 15749 + HEADER_AVAILABILITY: 86 + NAME-HEADER_SUFFIX: 11 + CITATION_VOLUME: 88339 + CITATION_LOCATION: 3784 + NAME-CITATION_SUFFIX: 167 + NAME-HEADER_TITLE: 509 + DATE_MONTH: 5934 + HEADER_WEB: 38 + HEADER_ABSTRACT: 2316 + CITATION_INSTITUTION: 908 + HEADER_REFERENCE: 174 + CITATION_PAGES: 90417 + HEADER_AUTHOR: 2852 + NAME-HEADER_MARKER: 11922 + DATE_OTHER: 8350 + NAME-CITATION_FORENAME: 405525 + CITATION_PUBLISHER: 3875 + HEADER_PUBNUM: 115 + CITATION_PUBNUM: 22308 + NAME-CITATION_MIDDLENAME: 90642 + FULLTEXT_PARAGRAPH: 504548 + HEADER_COPYRIGHT: 34 + FIGURE_FIGURE_HEAD: 23639 + DATE_DAY: 3987 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 1752 + ASSIGNED_GRAPHICS_TO_FIGURES: 4110 +==================================================================================== +====================================================================================