# iDev for Streamlit app

A scratch space for working interactively with the inputs and filters used in the Streamlit app.

In [None]:
from __future__ import annotations

import sys
if '../' not in sys.path:
    sys.path.append('../')
import os
import glob
import re 
import xml.etree.ElementTree as ET
import json

import pandas as pd
import requests
import pickle

# from z3950.PyZ3950 import zoom
from z3950.Marc.marc_tools import Record

In [None]:
# Folder of the Transkribus "P5" export on the shared G: drive.
# NOTE(review): p5_root is not referenced by any other cell visible in this notebook - confirm it is still needed.
p5_root = (
    r"G:\DigiSchol\Digital Research and Curator Team\Projects & Proposals\00_Current Projects"
    r"\LibCrowds Convert-a-Card (Adi)\OCR\20230504 TKB Export P5 175 GT pp\1016992\P5_for_Transkribus"
)

In [None]:
# Load the pickled cards DataFrame from a user-specific Downloads folder.
# The context manager closes the file handle once the load finishes.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open("C:\\Users\\HLloyd\\Downloads\\cards_df.p", "rb") as pickle_file:
    cards_df = pickle.load(pickle_file)

In [None]:
# Bare expression: renders cards_df as the cell output.
cards_df

In [None]:
# Load the pickled cards DataFrame from the current working directory.
# The context manager closes the file handle once the load finishes.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open("cards_df.p", "rb") as pickle_file:
    cards_df = pickle.load(pickle_file)

In [None]:
# Preview the first 30 rows of cards_df.
cards_df.head(30)

In [None]:
# pickle.dump(cards_df, open("cards_df.p", "wb"))

In [None]:
# Count the cards that cannot be shown, then keep only the displayable ones.
with_result = cards_df.dropna(subset="worldcat_matches")
nulls = len(cards_df) - len(with_result)  # rows with no worldcat_matches value at all
errors = len(cards_df.query("worldcat_matches == 'Error'"))  # rows whose value is the string 'Error'

# Columns kept for display, in display order.
display_columns = (
    "title",
    "author",
    "shelfmark",
    "worldcat_matches",
    "lines",
    "selected_match",
    "match_needs_editing",
)
cards_to_show = (
    cards_df.query("worldcat_matches != 'Error'")
    .dropna(subset="worldcat_matches")
    .loc[:, display_columns]
)

In [None]:
# Inspect the rows at positions 40-59 of the displayable cards.
cards_to_show.iloc[40:60]

I checked whether the Record dicts all have monotonically increasing keys; they do. I wondered whether some records might have been dropped as surrogate diagnostics, but apparently not.

`to_show["worldcat_result"].apply(lambda x: pd.Index(list(x.keys())).is_monotonic_increasing)`

In [None]:
# Index label of the card under inspection; later cells look it up with cards_to_show.loc[card_idx, ...].
card_idx = 157

In [None]:
# Build '+'-separated search terms from the selected card's title and author.
search_ti = cards_to_show.loc[card_idx, 'title'].replace(' ', '+')

author = cards_to_show.loc[card_idx, 'author']
# A missing author may be stored as None, "" or NaN. NaN is truthy, so a plain
# truthiness test would go on to call .replace on a float; anything that is not
# a string is therefore treated as "no author".
search_au = author.replace(' ', '+') if isinstance(author, str) else ""

In [None]:
# True when the author search string is non-empty; referenced as @au_exists in the match filter query.
au_exists = bool(search_au)

In [None]:
# Bare expression: renders cards_to_show as the cell output.
cards_to_show

In [None]:
# One row per candidate record stored in the selected card's worldcat_matches mapping.
worldcat_records = cards_to_show.loc[card_idx, "worldcat_matches"]
match_df = pd.DataFrame({"record": list(worldcat_records.values())})

# Flag records that carry a 245 field (title) and any of the 100/110/111/130 fields (author).
records = match_df["record"]
match_df["has_title"] = records.apply(lambda rec: bool(rec.get_fields("245")))
match_df["has_author"] = records.apply(
    lambda rec: bool(rec.get_fields("100", "110", "111", "130"))
)

In [None]:
# Show the matches that have a title and either have an author or belong to a card with no author (au_exists is False).
match_df.query("has_title == True and (has_author == True or not @au_exists)")

In [None]:
# Print the record stored under index label 3 of match_df.
print(match_df.loc[3, "record"])

In [None]:
# Load the MARC language-code lookup; the "codes" entry is used to map codes onto language names.
# The context manager closes the file handle once parsing finishes.
with open("..\\data\\raw\\marc_lang_codes.json", "r") as lang_file:
    lang_dict = json.load(lang_file)

In [None]:
# Matches the $b subfield of a 040 field's string form, including the "$b" prefix
# and the "$" that starts the next subfield.
lang_040b_re = re.compile(r"\$b[a-z]+\$")


def _language_040b(record):
    """Return the raw ``$b...$`` match from the record's first 040 field, or None.

    None is returned when the record has no 040 field or when the pattern does
    not match, so that one incomplete record does not abort the whole column
    with an IndexError/AttributeError.
    """
    fields_040 = record.get_fields("040")
    if not fields_040:
        return None
    hit = lang_040b_re.search(str(fields_040[0]))
    return hit.group() if hit else None


match_df["language_040$b"] = match_df["record"].apply(_language_040b)
# Strip the leading "$b" and the trailing "$" to get the bare code, then look up its name.
match_df["language"] = match_df["language_040$b"].str[2:-1].map(lang_dict["codes"])

In [None]:
# Keep only the matches catalogued in one of the wanted languages; copy so that
# columns added later do not write into match_df.
wanted_languages = ['English', 'German']
filtered_df = match_df[match_df["language"].isin(wanted_languages)].copy()

In [None]:
# sort options
# Tags counted as subject access fields.
subject_access = ["600", "610", "611", "630", "647", "648", "650", "651", "653", "654", "655", "656", "657", "658", "662", "688"]
# Leader position 17 is the MARC 21 encoding level; 3 (abbreviated), 5 (partial)
# and 7 (minimal) mark less complete records.
poor_encoding_levels = {"3", "5", "7"}

filtered_df["num_subject_access"] = filtered_df["record"].apply(lambda x: len(x.get_fields(*subject_access)))
filtered_df["num_linked"] = filtered_df["record"].apply(lambda x: len(x.get_fields("880")))
filtered_df["has_phys_desc"] = filtered_df["record"].apply(lambda x: bool(x.get_fields("300")))
# The leader is handled as text elsewhere in this notebook, so comparing its
# 18th character with the ints 3/5/7 could never match and every record was
# rated "good". str() makes the test work for characters and ints alike.
filtered_df["good_encoding_level"] = filtered_df["record"].apply(
    lambda x: str(x.get_fields("LDR")[0][17]) not in poor_encoding_levels
)
filtered_df["record_length"] = filtered_df["record"].apply(lambda x: len(x.get_fields()))

In [None]:
def sort_fields_idx(index):
    """
    Sort key for ``sort_index(key=...)`` on the (MARC Field, Repeat Field ID) index.

    The function is called once per index level and dispatches on the level name:

    - "MARC Field": tags are compared numerically, with "LDR" mapped to 0 so it sorts first.
    - "Repeat Field ID": when the ID contains "$", the text after the first "$"
      (up to the next "$", if any) is used; otherwise the ID itself.
    - any other level: returned unchanged, so it keeps its natural ordering.

    @param index: pd.Index holding one level of the index being sorted
    @return: list of sort keys for the two known levels, otherwise the index itself
    """
    if index.name == "MARC Field":
        return [0 if tag == "LDR" else int(tag) for tag in index]
    if index.name == "Repeat Field ID":
        return [rid.split("$")[1] if "$" in rid else rid for rid in index]
    # Previously this fell through and returned None, which sort_index cannot
    # use as a key.
    return index


def gen_unique_idx(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate a unique index from one that contains repeated fields.

    A "Repeat Field ID" level is appended to the index. Fields that occur once
    get an empty ID. Repeated fields get a running counter ("0", "1", ...); for
    650 and 880 the counter is prefixed with the first chunk of the field text
    (split on " " for 650, on "/" for 880) followed by a space.

    The input frame is left untouched; all work is done on a copy.

    @param df: pd.DataFrame indexed by MARC field tag, field text in its first column
    @return: pd.DataFrame with "Repeat Field ID" appended as a second index level
    """
    # Separator that ends the prefix kept in the ID, for tags that get a prefix.
    prefix_separators = {"650": " ", "880": "/"}

    out = df.copy()  # do not add the helper column to the caller's frame
    out["Repeat Field ID"] = ""
    text_col = out.columns[0]

    repeated_tags = out.index[out.index.duplicated()].unique()
    for tag in repeated_tags:
        texts = out.loc[tag, text_col]
        counters = [str(n) for n in range(len(texts))]
        separator = prefix_separators.get(tag)
        if separator is None:
            ids = counters
        else:
            prefixes = texts.str.split(separator).str[0]
            ids = [f"{prefix} {counter}" for prefix, counter in zip(prefixes, counters)]
        # Assign a plain list so pandas does not have to align on the duplicated labels.
        out.loc[tag, "Repeat Field ID"] = ids

    return out.set_index("Repeat Field ID", append=True)

In [None]:
# No sort is applied yet, so this is the same object as filtered_df; the trailing comment keeps the draft sort call.
matches_to_show = filtered_df#.sort_values(by=None, ascending=False)

In [None]:
# Build one single-column frame per match (at most three), indexed by MARC field.
cols = []
# Slicing with iloc[:3] copes with fewer than three matches, whereas indexing
# positions 0-2 directly raises IndexError. Column 0 holds the record objects.
for match_label, record in matches_to_show.iloc[:3, 0].items():
    fields = record.get_fields()
    leader = record.get_fields("LDR")
    col = pd.DataFrame(
        index=pd.Index(["LDR"] + [field.tag for field in fields], name="MARC Field"),
        # [6:] drops the first six characters of each field's string form
        # (presumably the tag prefix - TODO confirm against marc_tools).
        data=leader + [str(field)[6:] for field in fields],
        columns=[match_label],
    )
    cols.append(gen_unique_idx(col))

In [None]:
# Check whether any of the per-record frames still has duplicate index entries.
[x.index.has_duplicates for x in cols]

In [None]:
# Put the per-record frames side by side as columns and order the rows with sort_fields_idx.
pd.concat(cols, axis=1).sort_index(key=sort_fields_idx)