# iDev for Streamlit app

A scratch space for working interactively with the inputs and filters used in the Streamlit app.

In [None]:
from __future__ import annotations

import sys
if '../' not in sys.path:
    sys.path.append('../')
import os
import glob
import re 
import xml.etree.ElementTree as ET
import json

import numpy as np
import pandas as pd
import requests
import pickle

from pymarc import Record
from src.utils import streamlit_utils as st_utils

Disable MathJax so that LaTeX in DataFrame output is not rendered.

In [None]:
pd.options.display.html.use_mathjax = False

In [None]:
p5_root = (
    r"G:\DigiSchol\Digital Research and Curator Team\Projects & Proposals\00_Current Projects"
    r"\LibCrowds Convert-a-Card (Adi)\OCR\20230504 TKB Export P5 175 GT pp\1016992\P5_for_Transkribus"
)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# The context manager closes the file handle once the data has been read.
with open("../data/processed/401_cards.p", "rb") as cards_file:
    cards_df_v0 = pickle.load(cards_file)

# Drop rows whose worldcat_matches_subtyped is NA and number the remaining rows 1..N.
# N is taken from the data instead of being hard-coded, so the cell keeps working
# when the pickle contains a different number of matched cards.
matched_cards = cards_df_v0.dropna(subset="worldcat_matches_subtyped").copy()
cards_df = matched_cards.set_index(np.arange(1, len(matched_cards) + 1))

In [None]:
cards_df.head()

In [None]:
cards_df.loc[40]

In [None]:
# pickle.dump(cards_df, open("cards_df.p", "wb"))

In [None]:
# nulls = len(cards_df) - len(cards_df.dropna(subset="worldcat_matches"))
# Subset of card columns to work with.
# NOTE(review): cards_df was already built with rows lacking worldcat_matches_subtyped dropped,
# so the dropna here looks redundant - confirm before removing.
cards_to_show = cards_df.dropna(subset="worldcat_matches_subtyped").loc[:,("title", "author", "shelfmark", "worldcat_matches_subtyped", "lines", "selected_match", "match_needs_editing")]
# cards_to_show.insert(loc=0, column="card_id", value=range(1, len(cards_to_show) + 1))
# cards_to_show_selections = cards_to_show.copy()
# cards_to_show_selections.insert(loc=1, column="Select", value=False)

In [None]:
cards_to_show

In [None]:
card_idx = 55

In [None]:
# Build '+'-separated search strings from the selected card's title and author.
search_ti = cards_to_show.loc[card_idx, 'title'].replace(' ', '+')

# A missing author may be None, "" or NaN. NaN is truthy, so a plain truthiness
# test would go on to call .replace on a float; only real strings are converted.
card_author = cards_to_show.loc[card_idx, 'author']
search_au = card_author.replace(' ', '+') if isinstance(card_author, str) else ""

In [None]:
# Forward slashes work on Windows as well as POSIX and match the path style used
# for the pickle above; the context manager closes the file after parsing.
with open("../data/raw/marc_lang_codes.json", "r") as lang_file:
    lang_dict = json.load(lang_file)

# One row per candidate record for the selected card.
match_df = pd.DataFrame({"record": list(cards_to_show.loc[card_idx, "worldcat_matches_subtyped"])})
# Presumably adds the columns the app filters on (e.g. publication_date) - see st_utils.
match_df = st_utils.create_filter_columns(match_df, lang_dict, search_au)

In [None]:
match_df.query("publication_date > -9999")["publication_date"].sort_values().dropna().unique().astype(int)

In [None]:
match_df

In [None]:
def add_subfield_rpt(df, field, split_chr, split_idx, na_subfield):
    """
    Fill the "Subfield" and "Rpt" columns of df, in place, for the rows labelled field.

    "Rpt" receives "0", "1", ... in row order. "Subfield" is taken from the text in
    df's first column: the text is split on split_chr and piece number split_idx is kept.
    When the field occurs more than once and na_subfield is truthy, "Subfield" is set
    to None instead.
    NOTE(review): na_subfield is not consulted when the field occurs exactly once -
    confirm that is intended.
    @param df: pd.DataFrame indexed by field tag; expected to already have "Subfield" and "Rpt" columns
    @param field: index label of the rows to update
    @param split_chr: str delimiter used to split the first column's text
    @param split_idx: int position of the piece to keep after splitting
    @param na_subfield: bool, blank the subfield of a repeated field instead of deriving it
    @return: None
    """
    n_rows = len(df.loc[field:field])
    rpt_labels = [str(i) for i in range(n_rows)]
    value_col = df.columns[0]

    if n_rows == 1:
        # A single occurrence: .loc hands back the cell's text directly.
        df.loc[field, "Subfield"] = df.loc[field, value_col].split(split_chr)[split_idx]
    elif na_subfield:
        df.loc[field, "Subfield"] = None
    else:
        # Repeated field: split every occurrence and keep the requested piece of each.
        pieces = df.loc[field, value_col].str.split(split_chr)
        df.loc[field, "Subfield":"Subfield"] = pieces.transform(lambda parts: parts[split_idx])

    df.loc[field, "Rpt":"Rpt"] = rpt_labels

# Per-field arguments for add_subfield_rpt, keyed by field tag. gen_unique_idx unpacks
# each list as (split_chr, split_idx, na_subfield). Repeated fields that are not listed
# here only get a repeat number, not a subfield.
subfield_handler = { # [split_chr, split_idx, na_subfield]
    "500":["$", 0, True],
    "650":["$", 0, False],
    "880":["$", 1, False]
}
    
def gen_unique_idx(df: pd.DataFrame, subfield_handler: dict) -> pd.DataFrame:
    """
    Extend an index that contains repeated fields with "Subfield" and "Rpt" levels.

    Fields listed in subfield_handler are processed by add_subfield_rpt whenever they
    are present. Any other repeated field just has its occurrences numbered
    "0", "1", ... in "Rpt". Everything else keeps "" in both new levels.
    @param df: pd.DataFrame indexed by field tag; it is copied, not modified
    @param subfield_handler: dict mapping a field tag to [split_chr, split_idx, na_subfield]
    @return: pd.DataFrame copy of df with "Subfield" and "Rpt" appended to the index
    """
    indexed = df.copy()
    for level_name in ("Subfield", "Rpt"):
        indexed[level_name] = ""

    is_repeat = indexed.index.duplicated()
    repeated_tags = indexed.index[is_repeat].unique()

    for tag, (split_chr, split_idx, na_subfield) in subfield_handler.items():
        if tag in indexed.index:
            add_subfield_rpt(indexed, tag, split_chr, split_idx, na_subfield)

    for tag in repeated_tags:
        if tag in subfield_handler:
            # Already numbered by add_subfield_rpt above.
            continue
        n_repeats = len(indexed.loc[tag])
        indexed.loc[tag, "Rpt"] = [str(i) for i in range(n_repeats)]

    return indexed.set_index(["Subfield", "Rpt"], append=True)

In [None]:
filtered_df = match_df
matches_to_show = filtered_df

In [None]:
# NOTE(review): unedited_recs is only created by the later cell whose loop fills
# recs / unedited_recs, so that cell has to be run before this one.
unedited_recs[-1].index

In [None]:
unedited_recs[-1].sort_index().loc["500":]

In [None]:
# Turn every matched record into a one-column DataFrame indexed by field tag.
# unedited_recs keeps the raw per-field frames; recs holds the versions whose index
# has been extended by gen_unique_idx.
recs = []
unedited_recs = []
for row_pos, (match_label, marc_record) in enumerate(matches_to_show.iloc[:, 0].items()):
    print(row_pos)
    tags = ["LDR"]
    texts = [marc_record.leader]
    for marc_field in marc_record.get_fields():
        tags.append(marc_field.tag)
        # Drop the first six characters of the field's text form
        # (presumably the "=TAG  " prefix - confirm against pymarc).
        texts.append(str(marc_field)[6:])
    record_df = pd.DataFrame(
        data=texts,
        index=pd.Index(tags, name="Field"),
        columns=[match_label],
    )
    unedited_recs.append(record_df)
    recs.append(gen_unique_idx(record_df, subfield_handler))

In [None]:
recs[0]

In [None]:
def sort_fields_idx(index):
    """
    Sort key for DataFrame.sort_index(key=...), called once per index level.

    - "Field": "LDR" sorts as 0, every other tag by its integer value.
    - "Subfield": the text after the first "$" when there is one, otherwise the label
      itself. Missing labels (None / NaN, as written for repeated fields whose
      subfield is blanked) sort as "" instead of raising.
    - "Rpt" and any other level: returned unchanged, so the key always yields an Index.
    @param index: pd.Index for one level of the index being sorted
    @return: pd.Index of sort keys with the same length as index
    """
    if index.name == "Field":
        return pd.Index([0 if tag == "LDR" else int(tag) for tag in index])

    if index.name == "Subfield":
        keys = []
        for label in index:
            if not isinstance(label, str):
                # None / NaN cannot be searched for "$"; use an empty key.
                keys.append("")
            elif "$" in label:
                keys.append(label.split("$")[1])
            else:
                keys.append(label)
        return pd.Index(keys)

    return index

In [None]:
marc_df = pd.concat(recs, axis=1).sort_index()
# marc_df = marc_df.sort_index(key=sort_fields_idx)

In [None]:
marc_df.loc[("500")]

In [None]:
marc_df

In [None]:
def simplify_6xx(df):
    """
    Line up identical 650 values from different records on the same row.

    Arbitrarily assigning repeat_id vals to repeat fields in records means common field values are not matched to each other.
    Get round this by reindexing on an index of all unique values for a subfield.
    Set the values for the reindexed subfield to the newly reindexed one.
    If there has been reordering this leaves rows of NA at the end of the subfield that can be dropped.

    @param df: pd.DataFrame with one column per record and index levels Field, Subfield, Rpt
        (assumed to be the concatenation of gen_unique_idx outputs - TODO confirm)
    @return: df itself when it has a single column, otherwise a modified copy with all-NA rows dropped
    """
    # With a single record there is nothing to line up.
    if df.shape[1] == 1:
        return df
    tidy_df = df.copy()
    # NOTE(review): presumably raises KeyError when df has no "650" rows - confirm callers guard for this.
    subfields = tidy_df.loc["650"].index.get_level_values(0).unique()
    for sf in subfields:
        sf_orig = tidy_df.loc[("650", sf), :]
        # Every distinct non-NA value that any record holds for this subfield.
        sf_unique_vals = pd.Series(sf_orig.values.flatten()).dropna().unique()
        # More distinct values than rows available: leave this subfield untouched.
        if len(sf_orig) < len(sf_unique_vals):
            continue
        sf_unique_df = pd.DataFrame(data=sf_unique_vals, columns=pd.Index(["unique_vals"]))
        # Left-merge each record's column onto the unique values, so that a value sits
        # on the row of the unique value it equals.
        for x in sf_orig.columns:
            sf_unique_df = sf_unique_df.merge(sf_orig[x], how="left", left_on="unique_vals", right_on=x)
        # Reuse the first len(unique) original row labels, then reindex to the full original
        # index so the remaining rows become NA; the helper column is dropped.
        replacement_df = sf_unique_df.set_index(sf_orig.index[:len(sf_unique_df)]).reindex(sf_orig.index).drop(
            columns="unique_vals")
        # Rebuild the Field / Subfield levels around the remaining index level.
        replacement_df["Field"] = "650"
        replacement_df["Subfield"] = sf
        replacement_df = replacement_df.set_index(["Field", "Subfield"], append=True).reorder_levels([1, 2, 0])
        # Compare values only (indexes reset) to see whether the merge changed anything.
        if not replacement_df.reset_index(drop=True).equals(sf_orig.reset_index(drop=True)):
            tidy_df.loc[("650", sf), :] = replacement_df
        else: # No overlapping terms so flatten naively
            sf_orig_blank_idx = sf_orig.reset_index(drop=True)
            # Keep only the index-derived columns, then re-attach each record's non-NA
            # values packed from position 0 upwards.
            blank_idx_df = sf_orig.reset_index().drop(columns=sf_orig.columns)
            for x in sf_orig_blank_idx.columns:
                blank_idx_df = blank_idx_df.join(sf_orig_blank_idx[x].dropna().reset_index(drop=True))
            blank_idx_df["Field"] = "650"
            blank_idx_df["Subfield"] = sf
            replacement_df = blank_idx_df.set_index(["Field", "Subfield", "Rpt"])
            tidy_df.loc[("650", sf), :] = replacement_df
    # Rows left entirely NA by the realignment are removed.
    return tidy_df.dropna(how="all")

def simplify_8xx():
    """Placeholder: not implemented yet; does nothing and returns None."""
    return None

def tidy_concat():
    """Placeholder: not implemented yet; does nothing and returns None."""
    return None

In [None]:
marc_df

In [None]:
simplify_6xx(marc_df).sort_index(key=sort_fields_idx)

In [None]:
subfield_to_replace = marc_df.loc[("650", "\\7"), :]
subfield_unique_vals = pd.Series(subfield_to_replace.values.flatten()).dropna().unique()
unique_vals = pd.DataFrame(data=subfield_unique_vals, columns=pd.Index(["unique_vals"]))

In [None]:
subfield_to_replace[0].reset_index()

In [None]:
for x in marc_df.columns:
    unique_vals = unique_vals.merge(subfield_to_replace[x], how="left", left_on="unique_vals", right_on=x)

In [None]:
replacement_vals = unique_vals.set_index(subfield_to_replace.index[:len(unique_vals)]).reindex(subfield_to_replace.index).drop(columns="unique_vals")

In [None]:
replacement_vals["Field"] = "650"
replacement_vals["Subfield"] = "\\7"

In [None]:
replacement_vals = replacement_vals.set_index(["Field", "Subfield"], append=True).reorder_levels([1,2,0])

In [None]:
replacement_vals

In [None]:
subfield_to_replace

In [None]:
marc_df.loc[("650", "\\7"), :] = replacement_vals

In [None]:
marc_df.loc[("650", "\\7")]

In [None]:
marc_df.loc[("650", "\\7")]

In [None]:
# NOTE(review): .drop(..., inplace=True) acts on the object returned by .loc[...], so it
# looks like marc_df itself is left unchanged by this line - confirm.
marc_df.loc[("650", "\\7")].drop(index=[str(x) for x in range(len(unique_vals), len(marc_df.loc[("650", "\\7")]))], inplace=True)
# NOTE(review): unique_vals was built with a default integer index and still has its
# "unique_vals" helper column, so this assignment presumably does not align with
# marc_df's rows and columns - verify the result before relying on it.
marc_df.loc[("650", "\\7")] = unique_vals

In [None]:
marc_df.loc[("650", "\\7")]

In [None]:
unique_vals.drop(columns="unique_vals")

In [None]:
# Summing the column concatenates each card's matches into one flat sequence of records.
all_records = cards_df["worldcat_matches_subtyped"].dropna().sum()
# One list of 650 fields per record (empty when the record has none).
all_650_raw = [x.get_fields("650") for x in all_records]
# Flatten with a nested comprehension instead of calling extend() inside a throwaway
# list comprehension, which also made the cell display a list of None values.
all_650 = [field for fields in all_650_raw for field in fields]
# Characters 6-7 of each field's text form (presumably the two indicator characters
# that follow the "=650  " prefix - confirm against pymarc).
all_650_indicators = [str(field)[6:8] for field in all_650]

In [None]:
set(all_650_indicators)

In [None]:
all_650[4].__str__()[6:8]

In [None]:
# DataFrame.xs is a method, so it has to be called; subscripting it raises TypeError.
# Selects the rows whose first index level is "650".
marc_df.xs("650")

In [None]:
def gen_gmap(col):
    """
    Map each value of a Series to a number for use as a background-gradient map.

    Values that occur more than once are given evenly spaced numbers strictly between
    0 and 1, one per distinct value in value_counts order. Values that occur exactly
    once are mapped to -10. NA entries are left as NA.
    @param col: pd.Series of values to compare with each other
    @return: pd.Series of numbers with the same index as col
    """
    freq = col.value_counts()
    repeated = freq[freq > 1].index
    singles = freq[freq == 1].index

    # linspace includes both endpoints; slicing them off leaves only interior points.
    shades = np.linspace(0, 1, len(repeated) + 2)[1:-1]

    lookup = dict(zip(repeated, shades))
    lookup.update(dict.fromkeys(singles, -10))
    return col.map(lookup, na_action='ignore')

In [None]:
# Row by row: values shared by several records get a number in (0, 1), values that occur
# once get -10 (see gen_gmap).
gmap = marc_df.apply(gen_gmap, axis=1)
# Missing cells get the same out-of-range value as singletons.
gmap[gmap.isna()] = -10
# Offset every second and third row of each group of three - presumably so that equal
# shades on neighbouring rows can be told apart; TODO confirm.
gmap[1::3] += 0.05
gmap[2::3] += 0.1

In [None]:
marc_df.style.background_gradient(gmap=gmap, vmin=0, vmax=1, axis=None)

In [None]:
type(marc_df.style)

In [None]:
def filter_on_generic_fields(df, fields, terms, include_recs_without_field):
    """
    Keep only the record columns whose fields contain the given search terms.

    Input the marc_table_df with all records.
    The rows of each top-level field are summed and the table transposed, giving one
    row per record and one column per field. Each column named in fields is then
    searched with Series.str.contains for the term in the same position of terms.
    A record is kept only when every searched field matches.
    NOTE(review): str.contains treats each term as a regular expression by default,
    and zip() silently ignores surplus fields or terms - confirm both are intended.
    @param df: pd.DataFrame with one column per record and the field tag as index level 0
    @param fields: sequence of field tags to search
    @param terms: str of search terms separated by ";"; surrounding whitespace is stripped
    @param include_recs_without_field: when falsy, NA search results are replaced with False
        before combining; when truthy they are left for DataFrame.all to handle
        (presumably this keeps records that lack a searched field - confirm)
    @return: pd.DataFrame holding only the columns of df that passed the search
    """
    per_record = df.groupby(level=0).sum().T
    term_list = [part.strip() for part in terms.split(";")]
    hits = [per_record[field].str.contains(term) for field, term in zip(fields, term_list)]
    filter_df = pd.concat(hits, axis=1)

    if not include_recs_without_field:
        filter_df = filter_df.where(lambda x: ~x.isna(), False)
    keep_mask = filter_df.all(axis=1)

    # Records are columns of df, so select them as rows of the transpose.
    return df.T[keep_mask].T

In [None]:
filter_on_generic_fields(marc_df, ["001", "029"], "ocn; CHBIS", False)

In [None]:
filter_df = pd.concat([marc_df.groupby(level=0).sum().T[field].str.contains(term) for field, term in zip(["001", "029"], ["ocn", "CHBIS"])], axis=1)

In [None]:
filter_df

In [None]:
marc_df.T[filter_df.where(lambda x: ~x.isna(), False).all(axis=1)].T

In [None]:
filter_df.where(lambda x: ~x.isna(), False).all(axis=1)

In [None]:
def blank_nan_format(s):
    """
    Formatter that shows missing values as an empty string.

    pd.isna is used instead of np.isnan because np.isnan raises TypeError for strings
    and None, which are exactly the values this formatter has to pass through or blank.
    @param s: a single cell value
    @return: "" when s is missing (NaN, None, NaT, pd.NA), otherwise s unchanged
    """
    if pd.isna(s):
        return ""
    return s

In [None]:
marc_df.style.format(na_rep="")

In [None]:
# NOTE(review): cards_df was re-indexed to start at 1 when it was loaded, so label 0 does
# not exist; assigning through .loc to a missing label presumably adds a new row instead
# of clearing an existing card - confirm which card was meant.
cards_df.loc[0, ["selected_match", "match_needs_editing", "selected_match_ocn"]] = None

In [None]:
cards_df