In [None]:
import pandas as pd
import stanza
import re
import os
from tqdm import tqdm


In [None]:
# Build the Dutch Stanza pipeline (tokenizer + named-entity recogniser).
# The model download below is left disabled, as in the original cell.
# stanza.download('nl')
nlp = stanza.Pipeline(processors='tokenize,ner', lang='nl')


In [None]:

# Read the cleaned dataset.
# NOTE(review): `directory_in` is not defined in any cell visible in this
# notebook; it has to exist in the kernel before this cell runs, otherwise a
# fresh "Restart & Run All" stops here with a NameError.
# os.path.join keeps path handling consistent with the export cells and also
# accepts pathlib.Path values, which plain string concatenation does not.
cleaned_df = pd.read_excel(os.path.join(directory_in, "cleaned_df4llm.xlsx"))

In [None]:

# Look-up table: column label -> positional index within cleaned_df
column_dict = dict(zip(cleaned_df.columns, range(len(cleaned_df.columns))))
column_dict

In [None]:
# Column of cleaned_df whose free text is going to be anonymized
column_name = "text1"

In [None]:
# Module-level bookkeeping shared with anonymize().
iteration_counter = 1        # 1-based number of the next anonymize() call
changed_counter = 1          # not referenced by anonymize(); kept from the original cell
changes_per_iteration = {}   # call number -> list of change records for that text

# Strings the NER model tags as persons (type 'PER') that must NOT be replaced.
# Built once instead of on every loop iteration.
_NON_PERSON_ENTITIES = frozenset([
    'Doktor', 'Corona', 'Coeliakie', 'Vingerprik', 'Hemelvaart', 'Had',
    'CT-Scan', 'Covid', 'J3', 'A', 'Alrijne', 'Glas', 'Allen', 'A4tje', 'Orthoptist',
    'Poli', 'OD', 'Chirurgie', 'COVID', 'Zoon', 'Holter', 'God', 'Arts', 'Lumc', 'AIOS',
    'Tijd', 'Patiént', 'Toppie', 'Anthonie van Leeuwenhoek', 'Sinterklaas',
    'Hodgkin', 'Tem\r\nPo', 'Dr', 'dr.', 'LUMC', 'CTscan', 'Hellp', 'Moeder', 'Marsh 3b',
    'Marsh 3a', 'Marsh 3A', 'Marsh 3C', 'Coeliaki', 'Reuma', 'Vegan', 'Pfeiffer', 'Glutenvrij',
    'Osteopenie', 'Dietist', 'Crohn', 'Marsh 3', 'Marsh 1', 'GV', 'Alzheimer',
    'HB', 'Pfeifer', 'Graves', 'Dokter', 'Turner', 'Duhring', 'Brain Fog', 'Fog', 'Fysiek',
    'Albert Heijn', 'Marsh', 'Vega', 'Non Hodgkin(', 'Brinta', 'Brainfog', 'Hebbik',
    'Duringh',
])

# Ordered scrubbing rules: (change type, log message, placeholder, patterns).
# The order is the same as in the original cell and matters, because earlier
# rules consume digits that later rules could otherwise match.
# Lookarounds are used for the bare-number rules so the whitespace/punctuation
# around a number is kept; consuming it glued the placeholder to neighbouring
# words and made the second of two adjacent numbers unmatchable.
_IDENTIFIER_RULES = (
    ('Patient Number', 'Removed patient numbers:', '[patientnr]',
     (r'(?<!\S)\d{7}(?!\w)',)),
    ('BSN', 'Removed BSNs:', '[BSN]',
     (r'(?<!\S)\d{9}(?!\w)',)),
    ('Date', 'Anonymized dates:', '[date]',
     (r'\d{1,2}[/-]\d{1,2}[/-]\d{4}',
      r'\d{4}[/-]\d{1,2}[/-]\d{1,2}')),
    ('Phone Number', 'Removed phone numbers:', '[tel]',
     (r'06[- ]{0,1}\d{8}',
      r'0\d{2}[- ]\d{7}',
      r'0\d{3}[- ]\d{6}',
      r'0\d{3}[- ]\d{3}\s\d{3}',
      r'0\d{4}[- ]\d{5}',
      r'\d{3}\s\d{3}\s\d{4}')),
    ('Email', 'Removed email addresses:', '[email]',
     (r'\S+@[\.\w]+\.[^\d\W]{2,3}',)),
    ('Document Number', 'Removed document numbers:', '[docnr]',
     (r'(?<!\S)\d{10}',)),
)


# Define function for anonymization
def anonymize(text):
    """Replace person names and identifying numbers in ``text`` with placeholders.

    Steps, in order: person names found by the module-level ``nlp`` pipeline
    (entities of type ``'PER'`` that are not in the exclusion set) become
    ``[name]``; then 7-digit numbers -> ``[patientnr]``, 9-digit numbers ->
    ``[BSN]``, numeric dates -> ``[date]``, phone numbers -> ``[tel]``,
    e-mail addresses -> ``[email]`` and 10-digit numbers -> ``[docnr]``.

    Side effects: prints progress, increments the global ``iteration_counter``
    on every call, and for string input stores the list of change records in
    ``changes_per_iteration`` under the number of this call (the same number
    that is printed).

    NOTE(review): the printed output and the stored ``'original'`` values
    contain the detected names themselves; clear cell output and do not export
    ``changes_per_iteration`` if that is a concern.

    Parameters
    ----------
    text : object
        The text to anonymize. Anything that is not a ``str`` (e.g. NaN for an
        empty cell) is treated as missing.

    Returns
    -------
    str
        The anonymized text with surrounding whitespace stripped, or ``''``
        when ``text`` is not a string.
    """
    global iteration_counter, changes_per_iteration

    # Remember the number of THIS call before advancing the counter, so the
    # change log is keyed by the same number that is printed.
    current_iteration = iteration_counter
    print('Iteration number:', current_iteration)
    iteration_counter += 1

    if not isinstance(text, str):
        return ''

    changes = []

    # Perform named entity recognition (NER)
    doc = nlp(text)
    names = [ent.text for ent in doc.ents if ent.type == 'PER']
    if names:
        print('Names to be anonymized: ', names)

    # De-duplicate and replace the longest names first: replacing "Jan" before
    # "Jan Jansen" would turn the latter into "[name] [name]sen" and leak part
    # of the surname. sorted() is stable, so equal-length names keep NER order.
    for name in sorted(dict.fromkeys(names), key=len, reverse=True):
        if name in _NON_PERSON_ENTITIES:
            continue
        if name not in text:
            # Already removed as part of a longer name handled earlier.
            continue
        print('Replaced: ', name)
        text = text.replace(name, '[name]')
        # Record the change
        changes.append({'type': 'Name', 'original': name, 'anonymized': '[name]'})

    for change_type, message, placeholder, patterns in _IDENTIFIER_RULES:
        # Sum over every pattern of the rule; keeping only the count of the
        # last pattern under-reported (or entirely dropped) phone numbers.
        total = 0
        for pattern in patterns:
            text, replaced = re.subn(pattern, placeholder, text)
            total += replaced
        if total > 0:
            print(message, total)
            # Record the change
            changes.append({'type': change_type, 'count': total})

    # Store changes for this iteration
    changes_per_iteration[current_iteration] = changes

    return text.strip()


In [None]:
# Run anonymize() on every value of the selected column; the result is a
# Series aligned with cleaned_df's index.
anon_text = cleaned_df.loc[:, column_name].apply(func=anonymize)



In [None]:
# Export just the anonymized series (index included) to its own workbook.
output_path = os.path.join(directory_out, "anon_text.xlsx")
anon_text.to_excel(excel_writer=output_path)


In [None]:
# Work on a copy: a plain assignment only aliases cleaned_df, so adding the
# new column would silently modify the frame loaded earlier and make this cell
# non-idempotent with respect to earlier cells.
anon_cleaned_df = cleaned_df.copy()
anon_cleaned_df['anon_text1'] = anon_text
# NOTE(review): the original, non-anonymized text column is still part of this
# frame and is written to the export as well — confirm that this is intended.
# os.path.join matches how the anon_text export path is built.
anon_cleaned_df.to_excel(os.path.join(directory_out, "anon_cleaned_df.xlsx"), index=False)