# Best practices to work with your data

## Filter for documents that belong to one category

Suppose you want to get all Documents in Project 46 that belong to the Category with ID 63.

In [3]:
import re

from konfuzio_sdk.data import Project
from konfuzio_sdk.regex import regex_spans, suggest_regex_for_string, merge_regex

# Load the Project with ID 46 and look up its Category with ID 63.
prj = Project(id_=46)
category = prj.get_category_by_id(63)

# Show every Document assigned to this Category (rendered as the cell output).
category.documents()


[Gehalt.pdf: 44823,
 Festlohn.pdf: 44834,
 vermögenswirksame Leistungen.pdf: 44839,
 betriebliche Altersvorsorge AG finanziert.pdf: 44840,
 Weihnachtsgeld.pdf: 44841,
 Stundenlohn.pdf: 44842,
 Fahrtkostenzuschuss pauschal versteuert.pdf: 44843,
 Betirebliche Altersvorsorge Mischfinanzierung.pdf: 44845,
 Darlehen.pdf: 44846,
 Dienstwagen mit Gehaltsverzicht.pdf: 44847,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_1.pdf: 44848,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_2.pdf: 44850,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_4.pdf: 44851,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_3.pdf: 44852,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_5.pdf: 44853,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_6.pdf: 44854,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_7.pdf: 44855,
 Auswertungspaket - unterschiedliche B_N-Auswertungen.pdf_8.pdf: 44856,
 Auswertungspaket - unterschiedliche B_N-Auswertungen

## Edit an Annotation that is online

In [4]:
# Fetch a single Document of the Project by its ID.
doc = prj.get_document_by_id(44823)
# Restrict the returned Annotations to the character offsets 10 to 200.
# NOTE(review): the default filtering (e.g. only correct Annotations) presumably still applies here — confirm in the SDK.
annotations = doc.annotations(start_offset=10, end_offset=200)
annotations


[Annotation Austellungsdatum (66, 78), (159, 169)]

In [5]:
# Inspect the attributes stored on the Document instance.
vars(doc)

{'id_local': 23579,
 'id_': 44823,
 'file_path': None,
 '_annotations': [Annotation Austellungsdatum (66, 78), (159, 169),
  Annotation Betrag (86, 88),
  Annotation Auszahlungsbetrag (236, 237),
  Annotation Personalausweis (352, 357),
  Annotation Steuerklasse (365, 366),
  Annotation Personalausweis (1194, 1199),
  Annotation Gesamt-Brutto (1498, 1504), (1582, 1587),
  Annotation Vorname (1507, 1518),
  Annotation Nachname (1519, 1527),
  Annotation Lohnart (1758, 1762),
  Annotation Bezeichnung (1763, 1769),
  Annotation Betrag (1831, 1839),
  Annotation Gesamt-Brutto (2111, 2119),
  Annotation Sozialversicherung (2255, 2262),
  Annotation Sozialversicherung (2269, 2274),
  Annotation Sozialversicherung (2281, 2285),
  Annotation Sozialversicherung (2292, 2296),
  Annotation Steuerrechtliche Abzüge (2324, 2330),
  Annotation Auszahlungsbetrag (2619, 2624),
  Annotation Netto-Verdienst (3004, 3012),
  Annotation Steuer-Brutto (3141, 3149),
  Annotation Auszahlungsbetrag (3777, 3785)

Let's look into the first Annotation

In [6]:
# Take the first Annotation from the filtered list and inspect its attributes.
first_annotation = annotations[0]
vars(first_annotation)

{'id_local': 23585,
 'is_correct': True,
 'revised': False,
 'normalized': None,
 'translated_string': None,
 'document': Gehalt.pdf: 44823,
 '_spans': [Span (66, 78), Span (159, 169)],
 'id_': 9208623,
 'confidence': 1,
 'label': Austellungsdatum,
 'label_set': Lohnabrechnung (63),
 'annotation_set': AnnotationSet(78730) of Lohnabrechnung (63) in Gehalt.pdf: 44823.,
 'selection_bbox': {'bottom': 49.786,
  'page_index': 0,
  'top': 20.481,
  'x0': 457.991,
  'x1': 533.908,
  'y0': 791.894,
  'y1': 821.199},
 'page_number': None,
 'top': 23.849,
 'x0': 48.48,
 'x1': 568.801,
 'y0': 797.311,
 'y1': 817.831,
 'bottom': 44.369,
 'bboxes': [{'bottom': 32.849,
   'end_offset': 78,
   'line_number': 1,
   'offset_string': '328927/10103',
   'offset_string_original': '328927/10103',
   'page_index': 0,
   'start_offset': 66,
   'top': 23.849,
   'x0': 462.48,
   'x1': 533.36,
   'y0': 808.831,
   'y1': 817.831},
  {'bottom': 44.369,
   'end_offset': 169,
   'line_number': 2,
   'offset_string'

We want to change the revised status to False.

In [7]:
# Set the revised flag on the local Annotation object; nothing is sent online by this assignment.
first_annotation.revised = False

The change now exists only locally, not online. Call `save()` to store it online.

In [8]:
# Try to persist the local Annotation online; the return value is shown as the cell output.
first_annotation.save()

2022-02-16 22:12:09,565 [konfuzio_sdk.data   ] [MainThread] [ERROR   ] [save                ][0915] You cannot update Annotations once saved online.


False

In [9]:
# Inspect the Annotation's attributes again after the save attempt.
vars(first_annotation)

{'id_local': 23585,
 'is_correct': True,
 'revised': False,
 'normalized': None,
 'translated_string': None,
 'document': Gehalt.pdf: 44823,
 '_spans': [Span (66, 78), Span (159, 169)],
 'id_': 9208623,
 'confidence': 1,
 'label': Austellungsdatum,
 'label_set': Lohnabrechnung (63),
 'annotation_set': AnnotationSet(78730) of Lohnabrechnung (63) in Gehalt.pdf: 44823.,
 'selection_bbox': {'bottom': 49.786,
  'page_index': 0,
  'top': 20.481,
  'x0': 457.991,
  'x1': 533.908,
  'y0': 791.894,
  'y1': 821.199},
 'page_number': None,
 'top': 23.849,
 'x0': 48.48,
 'x1': 568.801,
 'y0': 797.311,
 'y1': 817.831,
 'bottom': 44.369,
 'bboxes': [{'bottom': 32.849,
   'end_offset': 78,
   'line_number': 1,
   'offset_string': '328927/10103',
   'offset_string_original': '328927/10103',
   'page_index': 0,
   'start_offset': 66,
   'top': 23.849,
   'x0': 462.48,
   'x1': 533.36,
   'y0': 808.831,
   'y1': 817.831},
  {'bottom': 44.369,
   'end_offset': 169,
   'line_number': 2,
   'offset_string'

## Assume you want to test how well your Tokenizer covers any text

Let's assume you use a very simple tokenizer that captures elements delimited by surrounding whitespace.

In [10]:
# All Annotations of the Document, with the not yet labeled parts of the text filled in.
filled_annotations = set(doc.annotations(fill=True, use_correct=False))
# The Annotations that already exist without filling.
existing_annotations = set(doc.annotations(use_correct=False))
# What remains are the filled-in elements, i.e. the parts of the Document that are not yet labeled.
filled_annotations - existing_annotations



{Annotation None (0, 66),
 Annotation None (78, 84), (84, 85), (85, 86),
 Annotation None (88, 159),
 Annotation None (169, 177), (177, 178), (178, 179), (179, 180), (180, 236),
 Annotation None (237, 351), (351, 352),
 Annotation None (357, 365),
 Annotation None (366, 434), (434, 435), (435, 436), (436, 437), (437, 561), (561, 562), (562, 607), (607, 608), (608, 669), (669, 670), (670, 671), (671, 672), (672, 791), (791, 792), (792, 837), (837, 838), (838, 920), (920, 921), (921, 1027), (1027, 1028), (1028, 1043), (1043, 1044), (1044, 1132), (1132, 1133), (1133, 1167), (1167, 1168), (1168, 1194),
 Annotation None (1199, 1210), (1210, 1211), (1211, 1283), (1283, 1284), (1284, 1329), (1329, 1330), (1330, 1412), (1412, 1413), (1413, 1419), (1419, 1420), (1420, 1498),
 Annotation None (1504, 1505), (1505, 1507),
 Annotation None (1518, 1519),
 Annotation None (1527, 1582),
 Annotation None (1587, 1588), (1588, 1605), (1605, 1606), (1606, 1622), (1622, 1623), (1623, 1636), (1636, 1637), (

In [11]:
# You might be curious why the Annotation starting at offset 169 has five Spans even though it is one sequence of text.
doc.text[169:236]
# We divide any sequence into Spans wherever there are line breaks. Line breaks cannot be annotated, so they are
# separated into Spans of their own.

' Bat:  1\n \nPersonal-Nr.  Geburtsdatum ski Faktor  Ki,Frbtr.Konfessi'

## Build an easy tokenizer by examples
We use the correct Annotations in one document to build a tokenizer for the overall model.

In [12]:
# Collect the text of every Span of every Annotation returned by doc.annotations().
positive_examples = [
    doc.text[span.start_offset: span.end_offset]
    for annotation in doc.annotations()
    for span in annotation.spans
]

positive_examples

['328927/10103',
 '22.05.2018',
 '00104',
 '1',
 '00104',
 '198,34',
 '10,89',
 'Erna-Muster',
 'Eiermann',
 '2000',
 'Gehalt',
 '3.120,00',
 '3.120,00',
 '3.12000',
 '25732',
 '2062',
 '1417',
 '292,11',
 '2.189,07',
 '42.42950',
 '2.189,07']

In [13]:
# Suggest one regex per example string; the set drops duplicate suggestions.
tokens = {suggest_regex_for_string(example) for example in positive_examples}
# Combine all suggested regexes into one pattern that serves as the tokenizer.
tokenizer = merge_regex(tokens)
tokenizer

'(?:\\d\\d\\d\\d\\d\\d/\\d\\d\\d\\d\\d|\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d|\\d\\d\\.\\d\\d\\d\\d\\d|\\d\\.\\d\\d\\d\\,\\d\\d|\\d\\.\\d\\d\\d\\d\\d|Erna[-]Muster|\\d\\d\\d\\,\\d\\d|\\d\\d\\d\\d\\d|\\d\\d\\,\\d\\d|\\d\\d\\d\\d|Eiermann|Gehalt|\\d)'

In [14]:
# Run the tokenizer regex over the full Document text and keep each match as
# (value, start_offset, end_offset); the set drops duplicate tuples.
{
    (match['value'], match['start_offset'], match['end_offset'])
    for match in regex_spans(doc.text, tokenizer)
}



{('0', 61, 62),
 ('0', 408, 409),
 ('0', 415, 416),
 ('0', 416, 417),
 ('0', 657, 658),
 ('0', 668, 669),
 ('0', 1208, 1209),
 ('0', 1386, 1387),
 ('0', 4395, 4396),
 ('00104', 79, 84),
 ('00104', 352, 357),
 ('00104', 1194, 1199),
 ('1', 176, 177),
 ('1', 365, 366),
 ('1', 619, 620),
 ('1', 652, 653),
 ('1', 656, 657),
 ('1', 658, 659),
 ('1', 665, 666),
 ('1', 1040, 1041),
 ('1', 1105, 1106),
 ('1', 1236, 1237),
 ('1', 4338, 4339),
 ('1', 4394, 4395),
 ('1', 4407, 4408),
 ('10,89', 1582, 1587),
 ('10103', 1247, 1252),
 ('10103', 1868, 1873),
 ('1111', 660, 664),
 ('12345', 1047, 1052),
 ('12345', 1095, 1100),
 ('1417', 2292, 2296),
 ('15,83', 1562, 1567),
 ('15035', 358, 363),
 ('17029', 883, 888),
 ('198,34', 1498, 1504),
 ('2', 62, 63),
 ('2', 1041, 1042),
 ('2', 1209, 1210),
 ('2', 1237, 1238),
 ('2', 3907, 3908),
 ('2', 4008, 4009),
 ('2.189,07', 3004, 3012),
 ('2.189,07', 3777, 3785),
 ('2000', 1758, 1762),
 ('2018', 136, 140),
 ('2062', 2281, 2285),
 ('22.05.2018', 159, 169),
 

## More Details of Annotations

Keep the information of Text in a Document.

Todo: the endpoint test_get_project_labels no longer includes the document annotation_sets, as the relation of
a label and an annotation_set can be configured by a user while labeling. We might need to model the relation of many
Annotations to one AnnotationSet in a more explicit way.

Example document: "I earn 15 Euro per hour."

Assume the word "15" should be labeled. The project contains the labels "Amount" and "Tax".

# CREATE

Annotations can be created by:

- Human: Who is using the web interface
- Import: A human user imports extractions and uses "Copy extractions to annotations" admin action
- Training: Using the konfuzio package you create an annotation online, via a Bot user
- Text FB: Text Feedback - External API user, sends new extraction without ID, which contains only the offset string
- Extraction: Internal Process after we receive a new document from an External API user
- Extraction FB: External Feedback - External API user, sends feedback to existing extraction incl. ID

ID column: relates to the Annotation instance created in the database

`is_revised`: A human revisor has had a look at this annotation

`correct`: A human claims that this annotation should be extracted in future documents

The KONFUZIO package will use annotations which are revised or correct (inclusive OR, not XOR).

| ID | Creator       | is_revised  | correct       | User      | Label   | Action  | Note |
|:---|:--------------|:------------|:------------- |:----------|:--------|:--------|:-----|
| 1  | Human         | False       | True          | Human     | Amount  | ALLOWED |
| 2  | Import        | False       | False         | None      | Amount  | ALLOWED | Extraction.created_by_import
| 3  | Training      | False       | False         | Bot       | Amount  | ALLOWED |
| 4  | Extraction    | False       | False         | External  | Amount  | ALLOWED | one annotation per extraction
| X  | Text FB       | -----       | -----         | ---       | Amount  | see 2   | only create extraction

# REVISE

Annotations, once they have been created, can be revised by:

- Human: Who is using the web interface
- Revise Feedback: ?

## Positive Feedback will change

| ID | Revisor       | is_revised  | correct       | User      | Label   | Action  | Note |
|:---|:--------------|:------------|:------------- |:----------|:--------|:--------|:-----|
| 1  | Human         | NA          | NA            | NA        | Amount  | HIDDEN  |
| 2  | Human         | True        | True          | Human     | Amount  | ALLOWED |
| 3  | Human         | True        | True          | Bot       | Amount  | ALLOWED | -> ? does PUT update User
| 4  | Human         | NA          | NA            | External  | Amount  | HIDDEN  |
| 1  | Extraction FB | True        | True          | Human     | Amount  | ALLOWED |
| 2  | Extraction FB | ----        | ----          | ----      | ----    | ----    | External user does not get ID
| 3  | Extraction FB | ----        | ----          | ----      | ----    | ----    | External user does not get ID
| 4  | Extraction FB | True        | True          | Bot       | Amount  | ALLOWED |

Because positive feedback keeps the annotation displayed in the interface and stores it as a correct example, the
word "15" should NOT be labeled anew. This time the creator might choose between the labels "Amount" and "Tax".

| ID | Creator       | is_revised  | correct       | User      | Label   | Action  | Note |
|:---|:--------------|:------------|:------------- |:----------|:--------|:--------|:-----|
| 5  | Human         | False       | True          | Human     | Amount  | DENIED  |
| 6  | Import        | True        | False         | None      | Amount  | DENIED  |
| 7  | Training      | False       | False         | Bot       | Amount  | DENIED  |
| 8  | Extraction FB | ?           | ?             | ?         | Amount  | DENIED  |
| 9  | Human         | False       | True          | Human     | Tax     | DENIED  |
| 10 | Import        | ----        | ----          | ----      | Tax     | DENIED  | External user does not get ID
| 11 | Training      | ----        | ----          | ----      | Tax     | DENIED  | External user does not get ID
| 12 | Extraction FB | ?           | ?             | ?         | Tax     | DENIED  |

## Negative Feedback will change

- The user clicks on delete button next to the annotation in the web interface.
- Incorrect or deleted annotations will no longer be displayed in the web interface.

| ID | Revisor       | is_revised  | correct       | User      | Label   | Action  | Note |
|:---|:--------------|:------------|:------------- |:----------|:--------|:--------|:-----|
| 1  | Human         | DELETED     | DELETED       | DELETED   | Amount  | ALLOWED | delete revised=F, correct=T
| 2  | Human         | True        | False         | None      | Amount  | ALLOWED | Update three fields
| 3  | Human         | True        | False         | Bot       | Amount  | ALLOWED | Does update is_revised field
| 4  | Human         | ?           | ?             | ?         | Amount  | ALLOWED |
| 1  | Extraction FB | True        | False         | ?         | Amount  | ALLOWED |
| 2  | Extraction FB | ----        | ----          | ----      | Amount  | ALLOWED | External user does not get ID
| 3  | Extraction FB | ----        | ----          | ----      | Amount  | ALLOWED | External user does not get ID
| 4  | Extraction FB | True        | False         | External  | Amount  | ALLOWED |

Because negative feedback removes the annotation from the web interface but stores it as an incorrect example, the
word "15" can be labeled anew. This time the creator might choose between the labels "Amount" and "Tax".

| ID | Creator       | is_revised  | correct       | User      | Label   | Action  | Note |
|:---|:--------------|:------------|:------------- |:----------|:--------|:--------|:-----|
| 5  | Human         | False       | True          | Human     | Amount  | ?DENIED | -> in contrast to annotation 1
| 6  | Import        | ---         | ---           | ---       | ---     | DENIED  |
| 7  | Training      | ---         | ---           | ---       | ---     | DENIED  |
| 8  | Extraction FB | ?           | ?             | ?         | Amount  | NA      | Need to send new document
| 9  | Human         | False       | True          | Human     | Tax     | ALLOWED | now we have 2 annotations
| 10 | Import        | False       | False         | None      | Tax     | ALLOWED |
| 11 | Training      | False       | False         | Bot       | Tax     | ALLOWED |
| 12 | Extraction FB | ?           | ?             | ?         | Tax     | NA      | Need to send new document