In [None]:
%pip install ipython matplotlib seaborn numpy pandas openpyxl

In [None]:
%config InlineBackend.figure_formats = ['svg']

import re
import itertools
import typing
from pathlib import Path
import shutil
import csv
import warnings
warnings.filterwarnings('ignore')

import IPython.display as ipd                               
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import openpyxl


# Plot styling
sns.set_style("whitegrid")

# Directory layout:
#   output  - result files (created if missing)
#   input   - raw exports; must exist and contain at least one entry
#   workdir - intermediate files; wiped and recreated on every run
output = Path("output")
output.mkdir(exist_ok=True)

input = Path('input')
if not (input.exists() and any(input.iterdir())):
    raise Exception(f"Expected input data in {input.name} not found")

workdir = Path("workdir")
if workdir.is_dir():
    shutil.rmtree(workdir)
workdir.mkdir(exist_ok=True)

# Locations, ordered by priority: when duplicates are searched, the original
# is always taken from the location that comes first in this list.
AMS = "AMS"
locations = [AMS, 'MZ', 'STG', 'BAD', 'KA', 'TUE', 'MA', 'FR']



In [None]:
# Configuration for reading the CSV exports

# Cell contents that are treated as missing values when parsing.
na_values = [
    '0\'00000"',
    '',
    'UNBEKANNT',
    'Unbekannt',
    '-',
    'o. A',
    'o.A.',
    # BNR: placeholder values
    '[]',
    '[-]',
    '[unbekannt]',
    '[PROMO]',
    '[- PROMO]',
    '[[ PROMO]]',
    '[ PROMO]',
    '[Promo]',
    '[Promo-CD]',
    '[o. A.]',
    '[ohne Nummer]',
    '[ohne Nr.]',
    # The trailing comma matters: without it Python concatenates this literal
    # with the next one and neither value is recognised as missing.
    '[o. Nr.]',
    # RHTI: empty values
    ' "',
    ' " ',
    # MIT: empty values
    # 'Diverse',
    'Nicht genannt',
    'nicht genannt',
]

# Generic MIT values, grouped by preference: entries of the first group are
# only used when nothing more specific exists, entries of the second group
# only when nothing else exists at all.
black_list_mit = [['Ensemble', 'Chor', 'Diverse', 'Orchester', 'Original Cast'], ['u.a.']]

# Other values that occur unusually often; open question whether they should
# be treated as missing as well:
#   RHTI (generic titles): 'Live', 'Streichquartette', 'Lieder', 'Kammermusik',
#       'Live USA', 'Greatest hits', 'Greatest Hits', 'Die großen Erfolge'
#   MIT (group designations): 'Diverse', 'Ensemble', 'Orchester', 'Original Cast'

# Also match the all-lower-case and all-upper-case spelling of every value.
# dict.fromkeys drops the duplicates this produces while keeping the order.
na_values = list(dict.fromkeys(
    variant
    for value in na_values
    for variant in (value, value.lower(), value.upper())
))

# Column dtypes. Identifiers are read as strings so that leading zeros and
# non-numeric characters survive parsing.
dtypes = {
         'BEST': 'string',
         'ANR': 'string',  # archive numbers sometimes contain characters, despite the name
         'EAN': 'string',
         'PEAN': 'string',
         'LC': 'string',
         'LN': 'string',
         'BNR':'string',
         'TTS': 'string',
         'MIT': 'string',
         'MIT_TYP': 'string',
         'ABS_DAUER': 'string',
         'T-ISRC': 'string',
         'T-RHTI': 'string',
         'RHTI': 'string',
         'TRÄGER': 'Int64',
         'SEITE': 'Int64',
         'TAKE': 'Int64',
         'AMO_ID': 'string',
         'KAT_GATT': 'string',
         'TTYP': 'string',
         'TPRIO': 'string',
         'TITEL': 'string',
         'T-TITEL': 'string'
}

# Every column may also appear with a "Q_" prefix; give it the same dtype.
for dtype in list(dtypes.keys()):
    dtypes[f"Q_{dtype}"] = dtypes[dtype]

# Keyword arguments shared by all pd.read_csv calls. `usecols` skips every
# column whose name contains 'l_' or (case-insensitively) 'unnamed'.
pd_args = dict(encoding='latin1',
               sep=';',
               na_values=na_values,
               low_memory=False,
               dtype=dtypes,
               usecols=lambda x: 'l_' not in x and 'unnamed' not in x.lower())

In [None]:
# Read the input data and bring it into an easier-to-handle shape:
# one combined CSV per location in `workdir` (this is what `get_data` reads).

input_dirs = [entry for entry in input.iterdir() if entry.is_dir()]

# Every input directory must be named after a known location ...
unknown_dirs = [entry.name for entry in input_dirs if entry.name not in locations]
if unknown_dirs:
    raise Exception(f"Data not as expected in {input}")

# ... and every location needs a directory, because `get_data` reads one
# collected CSV per entry in `locations`. Fail here with a clear message
# instead of later with a missing-file error.
found_names = {entry.name for entry in input_dirs}
missing_locations = [name for name in locations if name not in found_names]
if missing_locations:
    raise Exception(f"Data not as expected in {input}: no directory for {missing_locations}")

for location_dir in input_dirs:
    location = location_dir.name
    dst = f"{workdir/location}.csv"

    # Sorted so that the concatenation order (and with it the later
    # keep-first de-duplication) does not depend on the file system.
    csv_files = sorted(f for f in location_dir.glob('**/*') if f.is_file() and f.suffix == '.csv')
    if not csv_files:
        raise Exception(f"Couldn't collect data for {location}")

    frames = []
    for csv_file in csv_files:
        frame = pd.read_csv(csv_file, index_col=1, **pd_args)
        # The cover flags are derived from the file name only.
        is_ams_file = 'AMS' in csv_file.name
        frame['COVER_PDF'] = is_ams_file and 'Mit_PDF' in csv_file.name
        frame['COVER_KEIN_PDF'] = is_ams_file and 'Ohne_PDF' in csv_file.name
        frames.append(frame)

    pd.concat(frames).to_csv(dst, encoding='latin1', sep=';')
    print(f"Collected {location} -> {dst}")





In [None]:
# Daten einlesen

def get_data():
    """Load the collected per-location CSVs and reduce them to one row per ANR.

    Reads ``<workdir>/<location>.csv`` for every entry of the module-level
    ``locations`` using ``pd_args``, derives the comparison columns
    (``EAN/PEAN``, ``LC``, ``RHTI``, ``T-RHTI``, ``MIT``), drops repeated
    index values (keeping the first row) and removes helper columns.

    Side effects: writes ``rohdaten.csv`` (before de-duplication) and
    ``putzdaten.csv`` (after) to the current directory and prints progress.

    Returns:
        pd.DataFrame indexed by the ANR (as ``str``).

    Raises:
        AssertionError / KeyError: if the hard-coded plausibility record
        ``'1254061'`` is missing or has unexpected values.
    """
    dfs = []
    for location in locations:
        src = f"{workdir/location}.csv"
        tmp_df = pd.read_csv(src, index_col=0, **pd_args)
        tmp_df.index = tmp_df.index.astype(str)
        dfs.append(tmp_df)
    data = pd.concat(dfs)

    # BEST becomes an ordered category so that sorting follows `locations`.
    data['BEST'] = data['BEST'].astype('category')
    data['BEST'] = data['BEST'].cat.set_categories(locations, ordered=True)

    # Combine EAN and PEAN: EAN wins, PEAN fills the gaps. Assigning the
    # result (instead of fillna(inplace=True) on a column selection) works
    # regardless of pandas' copy-on-write behaviour.
    data['EAN/PEAN'] = data['EAN'].fillna(data['PEAN'])

    # Label code: fall back to LN where LC is missing.
    data['LC'] = data['LC'].fillna(data['LN'])

    # Titles: T-RHTI is the T-TITEL; RHTI is the TITEL of rows with TPRIO 1-3.
    # NOTE(review): each row only receives its own TITEL here and the later
    # de-duplication keeps the first row per ANR, so the "highest priority"
    # title is only kept if it is on that first row - confirm this is intended.
    # NOTE(review): rows with another TPRIO keep the empty string, which is
    # not a missing value for dropna() in the comparisons - confirm.
    data['RHTI'] = ""
    data['T-RHTI'] = data['T-TITEL']
    for prio in ["3", "2", "1"]:
        mask = data['TPRIO'] == prio
        data.loc[mask, 'RHTI'] = data.loc[mask, 'TITEL']

    # MIT and MIT_TYP are swapped in the source data.
    data['MIT'] = data['MIT_TYP']

    def correct_mit(dd: pd.Series):
        """Pick the preferred MIT among all MIT values of one ANR.

        Preference: a specific value first, then a value from
        black_list_mit[0], then one from black_list_mit[1]. Missing values
        are only returned when the group has nothing else.
        """
        candidates = dd.dropna()
        if len(candidates) == 0:
            return dd.iloc[0]
        if len(candidates) == 1:
            return candidates.iloc[0]
        is_prio_2 = candidates.isin(black_list_mit[0])
        is_prio_3 = candidates.isin(black_list_mit[1])
        is_prio_1 = ~(is_prio_2 | is_prio_3)
        for tier in (is_prio_1, is_prio_2, is_prio_3):
            if tier.any():
                return candidates[tier].iloc[0]
        return candidates.iloc[0]

    # Group by the index (the ANR) by position rather than by name, so this
    # does not depend on how the index happens to be called.
    data['MIT'] = data['MIT'].groupby(level=0).apply(correct_mit)

    data.to_csv('rohdaten.csv')
    print(f"Total before deduplication: {len(data)}")
    # Remove repeated ANRs (caused by several titles, MIT etc. per unit).
    data = data[~data.index.duplicated(keep='first')]
    print(f"Total after deduplication: {len(data)} \n")
    data.to_csv('putzdaten.csv')

    # Remove columns that are no longer needed, including their "Q_" variants.
    cols_to_delete = ['TTS', 'DAUER', 'ABS_DAUER', 'TPRIO', 'TITEL',
                      'T-TITEL', 'MIT_TYP', 'LN', 'PEAN', 'EAN',
                      'TTYP', 'KAT_GATT']
    deleted_cols = []
    for col_to_delete in cols_to_delete:
        del data[col_to_delete]
        deleted_cols.append(col_to_delete)
        q_col = f"Q_{col_to_delete}"
        if q_col in data:
            del data[q_col]
            deleted_cols.append(q_col)

    print(f"Deleted cols: {deleted_cols} \n")
    cols = list(data.columns)
    print(f"Kept cols: {cols} \n")

    # Plausibility check on one known record: MIT and RHTI are set correctly.
    assert data.loc['1254061', 'RHTI'] == 'Stark'
    assert data.loc['1254061', 'MIT'] == 'Feller, Linda'

    return data

# Load once up front: the missing-value plots and the column checks in
# `compare` use this frame. The comparison cell loads the data again itself.
data = get_data()


In [None]:
# Visualise, per location, how many values are present / missing per column
missing_values_dir = output / Path("fehlende_werte")
missing_values_dir.mkdir(exist_ok=True)
for location in locations:
    # Select the location's rows once instead of once per column.
    location_rows = data.loc[data['BEST'] == location]
    compare_cols = pd.DataFrame({
        'fehlt': location_rows.isnull().sum(),
        'vorhanden': location_rows.count(),
    })
    ax = compare_cols.plot(kind='barh', stacked=True, title=f'{location}: Fehlende Werte')
    ax.set(xlabel='Einheiten', ylabel='Spalten')
    plt.savefig(f"{missing_values_dir / location}.svg")
    plt.show()
    # One figure per location: release it so figures do not pile up.
    plt.close(ax.figure)

In [None]:
# Helper for registering comparisons

ALLOWED_CONDITIONS = [
    'ignore_minimal_distance', # ndw: include a duplicate even if the minimal distance is not met
    'no_tst', # tmf: TRÄGER, SEITE, TAKE have no value
    'minimal_bnr', # BNR must have a minimal length (see MINIMAL_BNR_LEN)
    'shorten_z_bnr', # drop the first digit of Z-BNR if Z-BNR has 6 or more digits
    'no_bnr', # bnf: BNR is missing
    'check_q_locators'
]

LOCALISATORS = ['TRÄGER', 'SEITE', 'TAKE']

# Columns that are only added later (by the data preparation)
ADDITIONAL_COLS = ['Z-BNR']



def compare(df: pd.DataFrame, to_compare: dict, use_cols: list, name_prefix = None, check_q_compare = True, use_conditions: typing.Optional[typing.Dict[str, bool]] = None, name_suffix: typing.Optional[str] = None) -> None:
    """Register a comparison (a set of columns plus conditions) in `to_compare`.

    Args:
        df: Data frame whose columns are used to validate `use_cols`.
        to_compare: Registry that is extended in place. The new entry is
            stored under the generated name as ``{'cols': ..., 'conditions': ...}``.
        use_cols: Columns that must all be equal for two rows to match.
        name_prefix: Optional text put in front of the generated name.
        check_q_compare: Currently not evaluated.
        use_conditions: Optional conditions; keys must be in ALLOWED_CONDITIONS.
        name_suffix: Optional text appended to the generated name.

    Raises:
        Exception: if a comparison with the same set of columns is already
            registered, or if a column or condition is unknown.
    """
    name = ", ".join(use_cols)
    if name_prefix is not None:
        name = f'{name_prefix}: {name}'
    if name_suffix is not None:
        name = f'{name} {name_suffix}'

    # Two comparisons count as the same if they use the same set of columns,
    # independent of column order and conditions.
    wanted = set(use_cols)
    if any(wanted == set(registered['cols']) for registered in to_compare.values()):
        raise Exception(f'Comparision already included: {name}')

    known_cols = {*df.columns, *ADDITIONAL_COLS}
    unknown_cols = [col for col in use_cols if col not in known_cols]
    if len(unknown_cols) > 0:
        raise Exception(f'Column(s) ({unknown_cols}) unknown.')

    # A fresh dict per registration: a shared mutable default would make all
    # comparisons without explicit conditions point at the same object.
    conditions = dict(use_conditions) if use_conditions is not None else dict()
    unknown_conds = [cond for cond in conditions if cond not in ALLOWED_CONDITIONS]
    if len(unknown_conds) > 0:
        raise Exception(f'Condition(s) ({unknown_conds}) unknown.')

    to_compare[name] = dict()
    to_compare[name]['cols'] = list(use_cols)  # copy, so later changes by the caller do not leak in
    to_compare[name]['conditions'] = conditions
    print(f"Added: {name}")


In [None]:
# Register the comparisons. They are applied in exactly this order, so
# stricter comparisons come before looser ones.
to_compare = dict()

tst = ['TRÄGER', 'SEITE', 'TAKE']

# One entry per comparison: (columns, extra keyword arguments for `compare`).
comparison_specs = [
    (['AMO_ID'],
     dict(use_conditions=dict(ignore_minimal_distance=True))),
    ([*tst, 'EAN/PEAN', 'LC', 'Z-BNR', 'MIT', 'RHTI', 'T-ISRC', 'T-RHTI'],
     dict(name_prefix="alles", use_conditions=dict(ignore_minimal_distance=True))),
    (['LC', 'Z-BNR', 'T-ISRC'],
     dict(use_conditions=dict(ignore_minimal_distance=True))),
    (['EAN/PEAN', 'T-ISRC'],
     dict(use_conditions=dict(ignore_minimal_distance=True))),
    (['RHTI', 'T-ISRC'],
     dict(use_conditions=dict(ignore_minimal_distance=True))),
    ([*tst, 'EAN/PEAN', 'LC', 'Z-BNR', 'MIT', 'RHTI'],
     dict()),
    ([*tst, 'EAN/PEAN', 'T-RHTI', 'RHTI'],
     dict()),
    ([*tst, 'EAN/PEAN', 'LC', 'Z-BNR', 'RHTI'],
     dict()),
    ([*tst, 'EAN/PEAN', 'RHTI'],
     dict()),
    (['EAN/PEAN', 'LC', 'Z-BNR', 'RHTI'],
     dict(use_conditions=dict(no_tst=True))),
    (['EAN/PEAN', 'LC', 'Z-BNR'],
     dict(use_conditions=dict(no_tst=True))),
    # Tübingen method
    (['MIT', 'LC', *tst, 'Z-BNR'],
     dict(name_prefix="Tübinger Methode", use_conditions=dict(shorten_z_bnr=True))),
    ([*tst, 'EAN/PEAN', 'LC'],
     dict()),
    ([*tst, 'LC', 'Z-BNR', 'T-RHTI'],
     dict(use_conditions=dict(ignore_minimal_distance=True))),
    (['LC', 'Z-BNR', 'T-RHTI'],
     dict(use_conditions=dict(no_tst=True))),
    (['LC', 'Z-BNR', 'RHTI'],
     dict(use_conditions=dict(no_tst=True))),
    ([*tst, 'LC', 'Z-BNR', 'RHTI'],
     dict(use_conditions=dict(minimal_bnr=True))),
    ([*tst, 'MIT', 'RHTI'],
     dict()),
    (['RHTI', 'T-RHTI'],
     dict(use_conditions=dict(no_tst=True, no_bnr=True))),
    (['LC', 'MIT', 'RHTI'],
     dict(use_conditions=dict(no_tst=True, no_bnr=True))),
    (['LC', 'RHTI', 'T-RHTI'],
     dict()),
]

for spec_cols, spec_kwargs in comparison_specs:
    compare(data, to_compare, spec_cols, **spec_kwargs)


In [None]:
# Helper: prepare the data for comparison

PATTERN_NON_ALPHANUMERIC = re.compile(r'\W+')  # runs of non-word characters
PATTERN_ONLY_NUMBERS = re.compile(r'\D')       # any non-digit; removing these leaves only digits
MINIMAL_DISTANCE = 80
MINIMAL_BNR_LEN = 4
minimal_distance_col = f'distance >= {MINIMAL_DISTANCE}?'

def prepare_data(data: pd.DataFrame):
    """Add the result columns and normalise the comparison columns.

    The frame is modified in place and also returned.

    Added columns: 'hat Dubletten?', 'ist Dublette?', the minimal-distance
    flag (`minimal_distance_col`), 'Original ANR', 'Original BEST',
    'Original RHTI', 'Fundmethode', 'AMO geteilt', 'Abstand', 'Z-BNR'
    (digits of BNR), 'O-PEAN' (unshortened EAN/PEAN) and 't_ANR' (copy of
    the index).

    Normalised columns: MIT, RHTI and T-RHTI lose all non-word characters and
    are lower-cased; EAN/PEAN values longer than 12 characters are reduced to
    characters 2-13.

    Args:
        data: Frame with at least the columns MIT, RHTI, T-RHTI, BNR, EAN/PEAN.

    Returns:
        The same frame object.
    """
    data['hat Dubletten?'] = False
    data['ist Dublette?'] = False
    data[minimal_distance_col] = False
    # Text columns start empty with object dtype, so that writing strings
    # into them later does not have to change a float column's dtype.
    for text_col in ('Original ANR', 'Original BEST', 'Original RHTI', 'Fundmethode'):
        data[text_col] = np.nan
        data[text_col] = data[text_col].astype('object')
    data['AMO geteilt'] = False
    data['Abstand'] = np.nan
    data['Abstand'] = data['Abstand'].astype('Int64')

    # regex=True is required: pandas >= 2.0 defaults to literal replacement
    # and rejects a compiled pattern in that mode.
    for text_col in ('MIT', 'RHTI', 'T-RHTI'):
        data[text_col] = data[text_col].str.replace(PATTERN_NON_ALPHANUMERIC, '', regex=True).str.lower()

    print('Generate Z-BNR (numbers only)')
    data['Z-BNR'] = data['BNR'].str.replace(PATTERN_ONLY_NUMBERS, '', regex=True).str.lower()

    # EAN/PEAN: ignore the first character if longer than 12 characters.
    print('Shorten PEAN if necessary')
    data['O-PEAN'] = data['EAN/PEAN']
    column_name = 'EAN/PEAN'
    mask = (data[column_name].str.len() > 12).fillna(False)  # missing values are never shortened
    data.loc[mask, column_name] = data.loc[mask, column_name].str[1:13]
    data['t_ANR'] = data.index

    return data


In [None]:
# Helper: visualise how the results are distributed

def visualize_results():
    """Plot and save summary charts for the duplicate search.

    Reads the module-level ``data`` (needs the columns 'ist Dublette?',
    'Fundmethode', 'BEST' and 'Original BEST'), ``output`` and ``locations``.
    Writes SVG files to ``<output>/vergleich_ergebnis`` and shows each figure:

    - duplicates per find method,
    - location of the original per duplicate location,
    - originals vs. duplicates per location (absolute),
    - originals vs. duplicates per location (percent, one chart each).
    """
    result_dir = output / Path("vergleich_ergebnis")
    result_dir.mkdir(exist_ok=True, parents=True)

    def _save_and_show(file_name):
        """Save the current figure under `file_name`, show it and release it."""
        plt.savefig(result_dir / file_name, bbox_inches='tight')
        plt.show()
        plt.close()

    def _original_vs_dublette(counts):
        """Pivot counts indexed by (BEST, 'ist Dublette?') into the columns False/True.

        Both columns are always present and in this order, so the legend
        labels ['Original', 'Dublette'] stay correct even if a location has
        only originals or only duplicates.
        """
        return counts.unstack(level=1).reindex(columns=[False, True]).fillna(0)

    is_dublette = data['ist Dublette?'] == True

    method_counts = data.loc[is_dublette, 'Fundmethode'].value_counts()
    ax = sns.barplot(x=method_counts, y=method_counts.index)
    ax.set(ylabel='Reihenfolge: Vergleichsmethode', xlabel='Dubletten (abs.)')
    plt.title('Fundmethoden nach Fundmenge')
    _save_and_show('fundmethoden.svg')

    original_locations = data.loc[is_dublette, ['BEST', 'Original BEST']].value_counts().unstack().sort_values('BEST')
    ax = original_locations.plot(kind='bar')
    plt.title('Wo stehen die Originale der Dubletten?')
    ax.legend(title='Standort des Originals')
    ax.set(ylabel='Originale', xlabel='Standort der Dublette')
    _save_and_show('dubletten_originale.svg')

    per_location = data.loc[:, ['BEST', 'ist Dublette?']].value_counts()
    ax = _original_vs_dublette(per_location).plot(kind='bar', stacked=True)
    ax.legend(['Original', 'Dublette'])
    plt.title('Dubletten nach Standort (bezogen auf die jeweilige Gesamtmenge)')
    ax.set(ylabel='Einheiten', xlabel='Standort')
    _save_and_show('dubletten_nach_standort.svg')

    for location in locations:
        in_location = data['BEST'] == location
        if not in_location.any():
            # Nothing to plot for a location without rows; only report its name.
            print(location)
            continue
        share = data.loc[in_location, ['BEST', 'ist Dublette?']].value_counts(normalize=True) * 100
        ax = _original_vs_dublette(share).plot(kind='bar', stacked=True)
        ax.legend(['Original', 'Dublette'])
        plt.title(f'{location}: Dubletten im Bestand (prozentual)')
        ax.set(ylabel='Einheiten (%)', xlabel=None)
        _save_and_show(f'{location}_dubletten_nach_standort.svg')
        

In [None]:
# CURRENT: run the duplicate search
#
# Two runs over all registered comparisons (`to_compare`, in order):
#   run 1 - physical locations only (AMS rows are left out),
#   run 2 - AMS rows plus the rows of the physical locations that are still
#           originals; only AMS <-> physical matches are kept.
# Results are written into `data`; per-method counts go into run["found"].

data = prepare_data(get_data())

# Run 1: physical locations only
physical_locations = {
        "comparisons": to_compare,
        "found": dict(),
}

# Run 2: AMS + physical locations
ams_and_results = {
        "comparisons": to_compare,
        "found": dict(),
}
# AMS rows only (snapshot taken before the runs)
ams_data = data[(data['BEST'] == AMS) == True]
# The runs, in order
runs = [physical_locations, ams_and_results]

idx_originals = []

# Counter key for duplicates that were discarded because the unit had already
# been registered as an original by an earlier comparison.
ignored = 'ignorierte Dubletten (mit weniger Kriterien anderes Original gefunden, zur Sicherheit verworfen (Ergebnisse ohne Abstandsprüfung!)'

tst = ['TRÄGER', 'SEITE', 'TAKE']

for run_num, run in enumerate(runs, 1):
    # Configure the run
    is_ams_run = run_num == 2

    # Print the location priority used for picking originals
    print("Comparing...")
    print("Priority")
    comp_locations = locations if is_ams_run else [l for l in locations if l != AMS]
    for i, location in enumerate(comp_locations):
        print(f"{i}. {location}")
    print('\n')

    # Prepare the result counters
    run["found"]['total'] = dict()
    run["found"][ignored] = 0
    run["found"]['total']['total'] = 0
    run["found"]['total'][minimal_distance_col] = 0
    run["found"]['total']['marked'] = 0

    # Apply the comparisons one after the other
    for position, method in enumerate(to_compare, 1):

        # Data to compare: always rebuilt from the current state of `data`,
        # so that results of earlier comparisons are taken into account.
        physical_data = data.drop(data[(data['BEST'] == AMS) == True].index)

        if is_ams_run:
            # Restrict the physical rows to those that are still originals
            # and put the AMS rows in front of them.
            data_originals_only = physical_data[(physical_data['ist Dublette?'] == False)]
            print(f"adding {AMS}")
            print(f"before (originals only) {len(data_originals_only)}")
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            run_data = pd.concat([ams_data, data_originals_only])
            print(f"after {len(run_data)}")
        else:
            run_data = physical_data
            print(f"dropping {AMS}")
            print(f"before {len(data)}")
            print(f"after {len(run_data)}")

        run_data['AMS run?'] = is_ams_run

        found_by_method = f'{position}: {method}' # find method, as written to the results
        cols = to_compare[method]['cols'] # columns to compare
        conditions = to_compare[method]['conditions'] # comparison conditions
        # Identify the AMO_ID comparison by its columns. (The position is
        # 1-based, so a check against position 0 can never be true.)
        is_found_by_amo_id = list(cols) == ['AMO_ID']

        # Print the comparison configuration
        print(f'\n Comparing: : {found_by_method} ({position}/{len(to_compare)})')
        print(f'-> Columns: {cols}')
        print(f"-> Conditions: {conditions}")

        # Only compare candidates that are not yet marked as duplicates, and
        # work on a copy so that `data` stays untouched until results are written.
        comp_data = run_data[(run_data['ist Dublette?'] == False)].copy()

        # Pre-conditions (a condition is active as soon as its key is present)
        if 'minimal_bnr' in conditions:
            # Rows without a BNR do not meet the minimal length either.
            long_enough = (comp_data['BNR'].str.len() >= MINIMAL_BNR_LEN).fillna(False)
            comp_data = comp_data.drop(comp_data[~long_enough].index)
        if 'no_bnr' in conditions:
            comp_data = comp_data.drop(comp_data[comp_data['BNR'].notnull()].index)
        if 'shorten_z_bnr' in conditions:
            long_z_bnr = (comp_data['Z-BNR'].str.len() >= 6).fillna(False)
            comp_data.loc[long_z_bnr, 'Z-BNR'] = comp_data.loc[long_z_bnr, 'Z-BNR'].str[1:]

        # Drop rows that miss a value in any compared column
        comp_data = comp_data.dropna(subset=cols)
        # Sort by location priority so that the first row of each group of
        # equal rows comes from the preferred location.
        # NOTE(review): the default sort is not stable, so the order within
        # one location is not guaranteed to follow the input order.
        comp_data = comp_data.sort_values(['BEST'])

        # Find duplicates and the corresponding originals
        dubletten = comp_data.duplicated(subset=cols)
        originale = comp_data.groupby(cols)['t_ANR'].transform('first').values
        comp_data['ist Dublette?'] = dubletten
        comp_data['Original ANR'] = originale

        # Post-condition: only keep rows without TRÄGER, SEITE and TAKE
        if 'no_tst' in conditions:
            has_tst = ~(comp_data[tst].isnull().all(axis='columns'))
            comp_data = comp_data.drop(comp_data[has_tst].index)

        # Units already registered as originals by earlier comparisons are
        # not counted as duplicates (in the AMS run they still are).
        if not is_ams_run:
            # Look the flag up for exactly the compared rows, so both sides
            # of the mask share one index.
            already_original = data.loc[comp_data.index, 'hat Dubletten?'] == True
            protected = (comp_data['ist Dublette?'] == True) & already_original
            run["found"][ignored] += protected.sum()
            comp_data.loc[protected, 'ist Dublette?'] = False

        if len(comp_data[comp_data['ist Dublette?'] == True]) > 0:

            # Set the metadata of the original
            comp_data['Original BEST'] = data.loc[comp_data['Original ANR'], 'BEST'].values
            comp_data['Original RHTI'] = data.loc[comp_data['Original ANR'], 'RHTI'].values

            # Keep duplicates only
            comp_data = comp_data.drop(comp_data[(comp_data['ist Dublette?'] == False)].index)

            if is_ams_run:
                # Only matches between AMS and a physical location count:
                # ignore AMS-internal duplicates and duplicates within the
                # physical locations.
                is_ams_row = comp_data['BEST'] == AMS
                has_ams_original = comp_data['Original BEST'] == AMS
                same_side = (is_ams_row & has_ams_original) | (~is_ams_row & ~has_ams_original)
                comp_data = comp_data.drop(comp_data[same_side].index)

            # Distance between duplicate and original (difference of the
            # digits of both ANRs)
            comp_data['eigene ANR (numerisch)'] = pd.to_numeric(comp_data.index.str.replace(r'[^\d]', '', regex=True))
            comp_data['originale ANR (numerisch)'] = pd.to_numeric(comp_data['Original ANR'].str.replace(r'[^\d]', '', regex=True))
            comp_data['Abstand'] = abs(comp_data['eigene ANR (numerisch)'] - comp_data['originale ANR (numerisch)'])
            far_enough = comp_data['Abstand'] >= MINIMAL_DISTANCE

            # Mark the duplicates in the result data: all of them if the
            # minimal distance is to be ignored or in the AMS run, otherwise
            # only those that are far enough from their original.
            if ('ignore_minimal_distance' in conditions) or is_ams_run:
                data.loc[comp_data.index, 'ist Dublette?'] = True
            else:
                data.loc[comp_data.index, 'ist Dublette?'] = far_enough

            # Write the metadata of duplicate and original to the result data
            data.loc[comp_data.index, 'Original BEST'] = comp_data['Original BEST']
            data.loc[comp_data.index, 'Original ANR'] = comp_data['Original ANR']
            data.loc[comp_data.index, 'Original RHTI'] = comp_data['Original RHTI']
            data.loc[data.index.isin(comp_data['Original ANR']), 'hat Dubletten?'] = True
            data.loc[comp_data.index, 'Fundmethode'] = found_by_method
            if is_found_by_amo_id:
                data.loc[comp_data.index, 'AMO geteilt'] = True
            data.loc[comp_data.index, 'Abstand'] = comp_data['Abstand']
            data.loc[comp_data.index, minimal_distance_col] = far_enough

        # Store the results of this comparison
        found_total = 0
        found_marked = 0
        found_without_too_near = 0

        has_found_something = len(comp_data[comp_data['ist Dublette?'] == True]) > 0
        if has_found_something:
            found_total = len(comp_data['ist Dublette?'])
            found_marked = (data.loc[comp_data.index, 'ist Dublette?']).sum()
            found_without_too_near = len(comp_data[(comp_data['ist Dublette?'] == True) & (comp_data['Abstand'] >= MINIMAL_DISTANCE)])
        print(f'found (total): {found_total}')
        print(f'found (excluding too near): {found_without_too_near}')
        print(f'found (marked as dublette): {found_marked}\n')

        run["found"][found_by_method] = dict()
        run["found"][found_by_method]['total'] = found_total
        run["found"][found_by_method]['marked'] = found_marked
        run["found"][found_by_method][minimal_distance_col] = found_without_too_near
        run["found"]['total']['total'] += found_total
        run["found"]['total']['marked'] += found_marked
        run["found"]['total'][minimal_distance_col] += found_without_too_near

        comp_originals = comp_data[comp_data['hat Dubletten?']]['t_ANR'].to_list()
        idx_originals.extend(comp_originals)
        del comp_data

    if not is_ams_run:
        # After run 1 no unit may be duplicate and original at the same time.
        assert len(data[ (data['ist Dublette?'] == True) & (data.index.isin(data['Original ANR'])) ]) == 0
        assert len(data[ (data['ist Dublette?'] == True) & (data['hat Dubletten?'] == True) ]) == 0

    ipd.display(run["found"]['total'])

del data['t_ANR']
print('Fertig!')



In [None]:
# Plot and save the result charts; reads the module-level `data`.
visualize_results()

In [None]:
# Result matrix: per run, print the counts of every comparison twice -
# first as bare "total,far-enough" pairs, then with the method name in front.
for run in runs:
    # Everything in run['found'] except the two summary entries
    per_method = {
        label: stats
        for label, stats in run['found'].items()
        if label not in ('total', ignored)
    }
    rows = [
        (label, stats['total'], stats[minimal_distance_col])
        for label, stats in per_method.items()
    ]

    for _, found_total, found_far_enough in rows:
        print(f'{found_total},{found_far_enough}')

    print('\n')
    for label, found_total, found_far_enough in rows:
        print(f'{label}: {found_total},{found_far_enough}')

In [None]:
# Export copy of the results: boolean columns become 'x' (True) / empty (False)
out = data.copy()
bool_cols = list()
for col, col_type in out.dtypes.items():
    if col_type != 'bool':
        continue
    # Keep object dtype even if a column ends up without any 'x'.
    out[col] = out[col].map({True: 'x', False: np.nan}).astype('object')



In [None]:
# Write the results to one Excel workbook: a sheet with everything plus,
# per location, all rows, a random sample and the duplicates.
result_file = output / Path('ergebnis.xlsx')
SAMPLE_SIZE = 10   # rows per random sample sheet
SAMPLE_SEED = 42   # fixed seed so that a re-run produces the same sample
with pd.ExcelWriter(result_file) as writer:
    out.to_excel(writer, sheet_name='Alles')
    for location in locations:
        location_rows = out.loc[out['BEST'] == location, :]
        location_rows.to_excel(writer, sheet_name=f'{location} Alles')
        # A location may hold fewer rows than the sample size; sampling more
        # rows than available would raise.
        sample_rows = location_rows.sample(min(SAMPLE_SIZE, len(location_rows)), random_state=SAMPLE_SEED)
        sample_rows.to_excel(writer, sheet_name=f'{location} Random sample (10)')
        location_rows.loc[location_rows['ist Dublette?'] == 'x', :].to_excel(writer, sheet_name=f'{location} Dubletten')