In [10]:
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [11]:
def convert_doccano_to_spacy(input_file, output_file):
    nlp = spacy.blank("de")
    db = DocBin()
    with open(input_file, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    for entry in tqdm(data):
        text = entry["text"]
        entities = entry["label"]
        doc = nlp.make_doc(text)
        ents = []  
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span:
                ents.append(span)      
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_file)
    print(f"Saved spaCy-formatted data to {output_file}")
convert_doccano_to_spacy("/Users/jan/Documents/Promotion/BIBB GW/hdd/training_data.jsonl", "train.spacy")
print("Done")

100%|██████████████████████████████████████████| 57/57 [00:00<00:00, 296.18it/s]


Saved spaCy-formatted data to train.spacy
Done


In [12]:
!python -m spacy init config config.cfg --lang de --pipeline ner --optimize efficiency
print("Done")


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m

Done


In [13]:
from spacy.cli.train import train

train("config.cfg", output_path="output", overrides={"paths.train": "train.spacy", "paths.dev": "train.spacy"})

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    157.33    0.00    0.00    0.00    0.00
  3     200       2359.02   6104.86   90.15   87.28   93.21    0.90
  7     400         97.91    288.04   93.58   91.07   96.23    0.94
 10     600        135.14    139.20   98.68   98.86   98.49    0.99
 14     800        405.95     99.98   98.88   98.14   99.62    0.99
 17    1000        730.63     89.35   99.07   98.15  100.00    0.99
 21    1200         72.34     41.20   99.43   99.62   99.25    0.99
 25    1400         48.97     33.57   99.06   98.51   99.62    0.99
 28    1600         93.27     46.02   99.62   99.62   99.62    1.00
 32    1800         44.48     32.31   99.62  1

In [14]:
nlp = spacy.load("output/model-last")
text = "Wir suchen einen Altenpfleger/in (m/w/d) zum nächstmöglichen Zeitpunkt."
doc = nlp(text)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])

Entities: [('Altenpfleger/in', 'Searched_Name')]


In [15]:
input_file = "/Users/jan/Documents/Promotion/BIBB GW/hdd/To_Doccano/validierung_2016.jsonl"  # Your input file
output_file = "annotated_data.json"  # Output file
years = ["2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
for year in years:
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/To_Doccano/doccano_{year}.jsonl", "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    for entry in data:
        text = entry["text"]
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entry["label"] = entities
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_data_names/doccano_{year}.jsonl", "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Annotated data saved {year} doccano")
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/To_Doccano/validierung_{year}.jsonl", "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    for entry in data:
        text = entry["text"]
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entry["label"] = entities
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_data_names/validierung_{year}.jsonl", "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Annotated data saved {year} validierung")
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/To_Doccano/rest_daten_{year}.jsonl", "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    for entry in data:
        text = entry["text"]
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entry["label"] = entities
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_data_names/rest_daten_{year}.jsonl", "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Annotated data saved {year} rest_daten")

Annotated data saved 2016 doccano
Annotated data saved 2016 validierung
Annotated data saved 2016 rest_daten
Annotated data saved 2017 doccano
Annotated data saved 2017 validierung
Annotated data saved 2017 rest_daten
Annotated data saved 2018 doccano
Annotated data saved 2018 validierung
Annotated data saved 2018 rest_daten
Annotated data saved 2019 doccano
Annotated data saved 2019 validierung
Annotated data saved 2019 rest_daten
Annotated data saved 2020 doccano
Annotated data saved 2020 validierung
Annotated data saved 2020 rest_daten
Annotated data saved 2021 doccano
Annotated data saved 2021 validierung
Annotated data saved 2021 rest_daten
Annotated data saved 2022 doccano
Annotated data saved 2022 validierung
Annotated data saved 2022 rest_daten
Annotated data saved 2023 doccano
Annotated data saved 2023 validierung
Annotated data saved 2023 rest_daten


In [18]:
import json
import glob
global_path = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_data_names/"
global_out = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/"
for year in years:
    input_files = []
    for pattern in [f"{global_path}doccano_{year}.jsonl", f"{global_path}validierung_{year}.jsonl", f"{global_path}rest_daten_{year}.jsonl"]:
        input_files.extend(glob.glob(pattern))
    output_file = f"{global_out}all_annotations_{year}.jsonl"
    with open(output_file, "w", encoding="utf-8") as outfile:
        for file in input_files:
            with open(file, "r", encoding="utf-8") as f:
                for line in f:  # Read each line (each is a JSON object)
                    outfile.write(line)  # Write it directly
    
    print(f"✅ Merged {len(input_files)} files into {output_file}")

✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2016.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2017.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2018.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2019.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2020.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2021.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2022.jsonl
✅ Merged 3 files into /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_2023.jsonl


In [23]:
for year in years:
    input_file = f"{global_out}all_annotations_{year}.jsonl"
    output_file = f"{global_out}filtered_annotations_{year}.jsonl"
    
    with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            entry = json.loads(line)
            new_labels = []
    
            for entity_text, entity_label in entry.get("label", []):
                if len(entity_text.split()) <= 3:  # Check if entity has 3 words or fewer
                    new_labels.append([entity_text, entity_label])
    
            entry["label"] = new_labels  # Update labels in the entry
            f_out.write(json.dumps(entry) + "\n")  # Write the modified entry back
    
    print(f"✅ Saved filtered data to {output_file}")


✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2016.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2017.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2018.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2019.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2020.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2021.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2022.jsonl
✅ Saved filtered data to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/filtered_annotations_2023.jsonl


In [25]:
for year in years:
    entities = set()
    with open(f"{global_out}filtered_annotations_{year}.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            for entity_text, entity_label in entry.get("label", []):
                if entity_label == "Searched_Name":
                    entities.add(entity_text)
    entities = sorted(entities)
    print(f"Unique entities with label 'Searched_Name' in {year}:")
    for entity in entities[:20]:
        print(entity)
    print("")

Unique entities with label 'Searched_Name' in 2016:
(Gesundheits-) u. Kranken-Pfleger/in
(Kinder)Krankenpfleger/in
(Kinder-) Krankenpfleger
(Kinder-) Krankenpfleger/in
(Kinder-)Gesundheits- und Krankenpfleger/in
(Kinder-)Krankenschwester
+
+Pflegefachkraft
-/Gesundheitspfleger
-Altenpfleger
-Altenpfleger/in
-Berufsanfänger/innen
-Gesundheits- und Krankenpfleger/in
-Krankenpfleger
-Pfleger
-Wiedereinsteiger/in
-in
-krankenpfleger
-pfleger
-pfleger/ (Heil-)Erzieher/in

Unique entities with label 'Searched_Name' in 2017:
#
(Gesundheits-) u. Kranken-Pfleger/in
) Pflegekräfte
-Altenpfleger
-Altenpfleger/in
-Berufsanfänger/innen
-Berufseinsteiger/innen
-Familienpfleger/in
-Gesundheits- und Krankenpfleger/in
-Gesundheitspfleger
-Heilerziehungspfleger/in
-Kinderkrankenpfleger
-Pfleger
-in
-krankenpfleger
-pfleger
-pfleger /
-pfle­ger
-wiedereinsteiger/in
. Pflegefachkraft

Unique entities with label 'Searched_Name' in 2018:
#
(Alten)-Pflegefachkraft (m/w) in
(Fach-)Gesundheits-und Krankenpfleg

In [44]:
search_texts = ["", ""] # Falsche Berufsbezeichnungen (erkennen anhand der Ausgabe der Zeile davor)
for year in years:
    searched_entries = []
    remaining_entries = []
    input_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/all_annotations_{year}.jsonl"
    false_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_{year}.jsonl"
    correct_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/correct_annotations_{year}.jsonl"
    valid_substrings = ['Pflegefach', 'Krankenschwester', 'Krankenpflege', 'Altenpflege', 'Kinderkrankenschwester', 'Kinderkrankenpflege'] 
    with open(input_file, "r", encoding="utf-8") as f_in, \
         open(correct_file, "w", encoding="utf-8") as f_correct, \
         open(false_file, "w", encoding="utf-8") as f_false:
    
        for line in f_in:
            entry = json.loads(line)
            labels = entry.get("label", [])
            
            # Check if ANY "Searched_Name" entity contains a valid substring
            matches = any(
                any(substring.lower() in entity_text.lower() for substring in valid_substrings)
                for entity_text, entity_label in labels if entity_label == "Searched_Name"
            )
    
            # Write entry to the correct file if a match is found, otherwise to the false file
            if matches:
                f_correct.write(json.dumps(entry) + "\n")
            else:
                f_false.write(json.dumps(entry) + "\n")
    
    print(f"✅ Entries with valid 'Searched_Name' labels saved to {correct_file}")
    print(f"❌ Entries without valid 'Searched_Name' labels saved to {false_file}")

# Nächster Schritt: SteAs in checked_annotations_year.jsonl händisch ein-/aussortieren via doccano

✅ Entries with valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/correct_annotations_2016.jsonl
❌ Entries without valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl
✅ Entries with valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/correct_annotations_2017.jsonl
❌ Entries without valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2017.jsonl
✅ Entries with valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/correct_annotations_2018.jsonl
❌ Entries without valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2018.jsonl
✅ Entries with valid 'Searched_Name' labels saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all

In [45]:
false_filtered_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_{year}.jsonl"
for year in years:
    false_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_{year}.jsonl"
    false_filtered_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_{year}.jsonl"
    with open(false_file, "r", encoding="utf-8") as f_in, \
         open(false_filtered_file, "w", encoding="utf-8") as f_out:
    
        for line in f_in:
            entry = json.loads(line)
            
            # Check if "azubiyo" is in the text (case-insensitive)
            if "azubiyo" not in entry.get("text", "").lower():
                f_out.write(json.dumps(entry) + "\n")  # Keep the entry if it doesn't contain "azubiyo"
    
    print(f"✅ Filtered entries saved to {false_filtered_file} (removed 'azubiyo' occurrences).")

✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2016.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2017.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2018.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2019.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2020.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_2021.jsonl (removed 'azubiyo' occurrences).
✅ Filtered entries saved to /Users/jan/Documents/Pro

In [46]:
list_two = ["leitung", "management"]
for year in years:
    # File paths
    false_filtered_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/real_check_annotations_{year}.jsonl"
    check_really_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_{year}.jsonl"
    
    # Open files for processing
    with open(false_filtered_file, "r", encoding="utf-8") as f_in, \
         open(check_really_file, "w", encoding="utf-8") as f_out:
    
        for line in f_in:
            entry = json.loads(line)
            labels = entry.get("label", [])
    
            # Check if ANY "Searched_Name" entity matches partially with list_two
            matches = any(
                any(substring.lower() in entity_text.lower() for substring in list_two)
                for entity_text, entity_label in labels if entity_label == "Searched_Name"
            )
    
            # If NO matches are found, write the entry to check_really_{year}.jsonl
            if not matches:
                f_out.write(json.dumps(entry) + "\n")
    
    print(f"✅ Filtered entries saved to {check_really_file} (for manual review).")


✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2016.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2017.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2018.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2019.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2020.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_2021.jsonl (for manual review).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_a

In [47]:

for year in years:
    # File paths
    false_filtered_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_forreal_annotations_{year}.jsonl"
    check_really_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_{year}.jsonl"
    
    # Open files for processing
    with open(false_filtered_file, "r", encoding="utf-8") as f_in, \
         open(check_really_file, "w", encoding="utf-8") as f_out:
    
        for line in f_in:
            entry = json.loads(line)
            labels = entry.get("label", [])

            # Check if ANY "Searched_Name" entity contains "helf"
            contains_helf = any(
                "helf" in entity_text.lower()
                for entity_text, entity_label in labels if entity_label == "Searched_Name"
            )

            # Skip entries that contain "helf"
            if contains_helf:
                continue  
            
            # Otherwise, keep the entry
            f_out.write(json.dumps(entry) + "\n")
    
    print(f"✅ Filtered entries saved to {check_really_file} (entries containing 'helf' removed).")


✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2016.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2017.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2018.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2019.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2020.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_2021.jsonl (entries containing 'helf' removed).
✅ Filtered entries sav

In [48]:

for year in years:
    # File paths
    false_filtered_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_helfer_annotations_{year}.jsonl"
    check_really_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_{year}.jsonl"
    
    # Open files for processing
    with open(false_filtered_file, "r", encoding="utf-8") as f_in, \
         open(check_really_file, "w", encoding="utf-8") as f_out:
    
        for line in f_in:
            entry = json.loads(line)
            labels = entry.get("label", [])

            # Check if ANY "Searched_Name" entity contains "helf"
            contains_helf = any(
                "Ausbildung" in entity_text.lower()
                for entity_text, entity_label in labels if entity_label == "Searched_Name"
            )

            # Skip entries that contain "helf"
            if contains_helf:
                continue  
            
            # Otherwise, keep the entry
            f_out.write(json.dumps(entry) + "\n")
    
    print(f"✅ Filtered entries saved to {check_really_file} (entries containing 'helf' removed).")


✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2016.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2017.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2018.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2019.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2020.jsonl (entries containing 'helf' removed).
✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_2021.jsonl (entries containing 'helf' removed).
✅ Filtered entries sav

In [49]:
for year in years:
    # File paths
    input_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/ohne_azubis_annotations_{year}.jsonl"
    output_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_{year}.jsonl"

    # Read, modify, and write back
    with open(input_file, "r", encoding="utf-8") as f_in, \
         open(output_file, "w", encoding="utf-8") as f_out:
        
        for line in f_in:
            entry = json.loads(line)
            entry["label"] = ""  # Remove labels
            f_out.write(json.dumps(entry) + "\n")

    print(f"✅ Labels removed in {output_file} (all entries now have 'label': '').")


✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2016.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2017.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2018.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2019.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2020.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2021.jsonl (all entries now have 'label': '').
✅ Labels removed in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_2022.jsonl (all entries now have

In [50]:

for year in years:
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/annotate_again_{year}.jsonl", "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    for entry in data:
        text = entry["text"]
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entry["label"] = entities
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_{year}.jsonl", "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Annotated data saved {year}")

Annotated data saved 2016
Annotated data saved 2017
Annotated data saved 2018
Annotated data saved 2019
Annotated data saved 2020
Annotated data saved 2021
Annotated data saved 2022
Annotated data saved 2023


In [52]:
# Define substrings to filter out
exclude_substrings = ["helf", "hilf", "management", "leitung", "Pflegekraft"]
valid_substrings = [v.lower() for v in valid_substrings]
for year in years:
    # File paths
    input_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_{year}.jsonl"
    output_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_filtered_{year}.jsonl"

    # Read, filter, and write back
    with open(input_file, "r", encoding="utf-8") as f_in, \
         open(output_file, "w", encoding="utf-8") as f_out:

        for line in f_in:
            entry = json.loads(line)
            labels = entry.get("label", [])

            # Extract "Searched_Name" entities
            searched_names = [entity_text.lower() for entity_text, entity_label in labels if entity_label == "Searched_Name"]

            # Condition 1: Exclude if any label contains an excluded substring
            if any(any(excl in name for excl in exclude_substrings) for name in searched_names):
                print(searched_names)
                continue  # Skip this entry
            
           # Condition 2: Exclude if none of the labels match valid_substrings
            if not any(any(valid.lower() in name for valid in valid_substrings) for name in searched_names):
                if not searched_names:
                    f_out.write(json.dumps(entry) + "\n")
                continue  # Skip this entry

            # If entry passed both checks, keep it
            f_out.write(json.dumps(entry) + "\n")
            print ("kept entry")

    print(f"✅ Filtered entries saved to {output_file} (entries matching excluded substrings or not in valid_substrings removed).")
# Define paths
global_output_path = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/"
final_output_file = f"{global_output_path}spacy_two_filtered_all.jsonl"

# Get all filtered files
filtered_files = glob.glob(f"{global_output_path}spacy_two_filtered_*.jsonl")

# Merge all files into one
with open(final_output_file, "w", encoding="utf-8") as f_out:
    for file in filtered_files:
        with open(file, "r", encoding="utf-8") as f_in:
            for line in f_in:
                f_out.write(line)

print(f"✅ Merged {len(filtered_files)} files into {final_output_file}")


['aushilfsfahrer']
['pflegehilfskraft']
['psychologe', 'erzieher', 'hilfskraft']
['pflegehilfskraft']
['pflegehilfskraft']
['pflegehilfskraft']
['aushilfe\n\n  wir bieten:\n\n  * eine verantwortungsvolle mitarbeit in einem netten team.\n  * einen sicheren arbeitsplatz.\n  * eine gute arbeitsatmosphäre mit ausreichend zeit für die betreuung unserer kunden.\n  * eine leistungsgerechte vergütung mit weihnachts- und urlaubsgeld\n  * zusätzliche altersversorgung.\n  * regelmäßige fort- und weiterbildungen.\n  * zusätzliche bonuszahlung.\n\n  wenn sie interesse haben, bewerben sie sich gerne! sie können dies postalisch oder per e-mail machen. bei fragen rufen sie uns gerne auch an.\n\n  kontakt:\n\n\n  zentrale für private fürsorge\n\n\n  hrn. dr. vosteen\n\n\n  arberger straße 8\n\n\n  28205 bremen\n\n\n  telefon: 0421 70 00 55\n\n\n  e-mail: diese e-mail-adresse ist vor spambots geschützt! zur anzeige muss javascript eingeschaltet sein!\n\n  aktuelle veranstaltungen']
✅ Filtered entries sa

In [53]:

# Define file paths
input_file = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_filtered_all.jsonl"
output_file = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_filtered_valid.jsonl"

# Read the input file, filter based on valid_substrings, and write to the output file
with open(input_file, "r", encoding="utf-8") as f_in, \
     open(output_file, "w", encoding="utf-8") as f_out:
    
    for line in f_in:
        entry = json.loads(line)
        text = entry.get("text", "")
        
        # Check if any of the valid substrings is a partial match in the text
        if any(valid.lower() in text.lower() for valid in valid_substrings):
            f_out.write(json.dumps(entry) + "\n")  # Write to output file if match is found
            print('+1')
print(f"✅ Filtered entries saved to {output_file} (only entries matching valid substrings).")


+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+

In [64]:
nlp = spacy.load("output/model-best")
with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_two_filtered_all.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
    for entry in data:
        text = entry["text"]
        entry["label"] = ""
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        entry["label"] = entities
    with open(f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_three.jsonl", "w", encoding="utf-8") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Annotated data saved :)")

Annotated data saved :)


In [65]:
# Define substrings to exclude
exclude_substrings = ["leitung", "management", "helf", "hilf"]

# Define file paths
input_file = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_three.jsonl"
output_file = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_three_filtered.jsonl"
nothing = 0
something = 0
# Read the input file, filter based on conditions, and write to the output file
with open(input_file, "r", encoding="utf-8") as f_in, \
     open(output_file, "w", encoding="utf-8") as f_out:
    
    for line in f_in:
        entry = json.loads(line)
        labels = entry.get("label", [])

        # Extract the "Search_Entry" entities
        search_entries = [entity_text.lower() for entity_text, entity_label in labels if entity_label == "Search_Entry"]
        # Check if "Search_Entry" exists
        if not search_entries:
            nothing += 1
            continue  # Skip if no "Search_Entry" entities

        # Check for partial match with valid_substrings
        valid_match = any(any(valid.lower() in entity for valid in valid_substrings) for entity in search_entries)

        # Check for exclusion (if it partially matches any of the exclude substrings)
        exclude_match = any(any(excl in entity for excl in exclude_substrings) for entity in search_entries)

        # If no valid match and no exclusion, keep the entry
        if valid_match and not exclude_match:
            f_out.write(json.dumps(entry) + "\n")  # Write to the output file if both conditions are satisfied
            somehting += 1

print(f"✅ Filtered entries saved to {output_file} (entries matching valid substrings and not matching excluded substrings).")
print(something)
print(nothing)


✅ Filtered entries saved to /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/spacy_three_filtered.jsonl (entries matching valid substrings and not matching excluded substrings).
0
19541


In [68]:
import json

# Define the file path
false_file = f"/Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_{year}.jsonl"

# Initialize the counter
entry_count = 0
total = 0
for year in years:
    # Open the file and count the number of entries
    with open(false_file, "r", encoding="utf-8") as f:
        for line in f:
            entry_count += 1
    
    # Print the result
    print(f"Number of entries in {false_file}: {entry_count}")
    total += entry_count
    print(total)

Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 1660
1660
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 3320
4980
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 4980
9960
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 6640
16600
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 8300
24900
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 9960
34860
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 11620
46480
Number of entries in /Users/jan/Documents/Promotion/BIBB GW/hdd/Annotated_names_all/check_annotations_2016.jsonl: 13280
59760
