# Extract information from Chinese catalogue card xmls

In [None]:
from __future__ import annotations
import sys
if '../' not in sys.path:
    sys.path.append('../')
import os
import glob
import re 
import io
import xml.etree.ElementTree as ET
from IPython.display import Image
from urllib.parse import quote
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from pymarc import marcxml, Record
from tqdm import tqdm
from dotenv import load_dotenv
import bookops_worldcat as bw

In [None]:
# Read the sidebar documentation text that sits one directory above the notebook.
# The path is assembled with os.path.join so it also resolves on systems whose
# separator is not a backslash.
with open(os.path.join("..", "sidebar_docs.txt"), encoding="utf-8") as f:
    docs_txt = f.read()

In [None]:
# Register tqdm with pandas; the `progress_apply` calls further down rely on it.
tqdm.pandas()

## Parsing xmls

In [None]:
# Local root of the raw Chinese card data, relative to the notebook.
# Joined component by component so the separator matches the host OS.
p5_local = os.path.join("..", "data", "raw", "chinese")

In [None]:
# page_xml_loc = os.path.join(p5_root, "page")
page_xml_loc = os.path.join(p5_local, "1016992")
pxml_pattern = os.path.join(page_xml_loc, "*.pxml")

# List the .pxml files, trying up to three times before giving up; an empty
# listing is treated as a failed attempt.
for attempts in range(3):
    xmls = glob.glob(pxml_pattern)
    if xmls:
        break
else:
    raise IOError(f"Failed to connect to {page_xml_loc}")

xmls

In [None]:
# Parse every listed file and keep its root element, in listing order.
xmlroots = []
print(f"\nGetting xml roots from {page_xml_loc}")
for file in tqdm(xmls):
    fileName = os.fsdecode(file)
    # Up to three parse attempts per file; only FileNotFoundError is retried.
    for attempts in range(3):
        try:
            tree = ET.parse(fileName)
        except FileNotFoundError:
            continue
        break
    else:
        raise FileNotFoundError(f"Failed to connect to: {fileName}")
    root = tree.getroot()
    xmlroots.append(root)

In [None]:
# Number of parsed XML roots collected in `xmlroots`.
len(xmlroots)

In [None]:
def extractLines(root: ET.Element):
    """Collect the text of every line found in one parsed page XML root.

    The document is navigated purely by child position, so it assumes the
    layout the index expressions below imply (TODO confirm against the
    .pxml schema): ``root[1]`` holds the text regions, each region starts
    with a coordinates child and ends with a child that is not a line, and
    the last child of a line wraps the element carrying the line's text.

    Args:
        root: Root element of a parsed page XML file.

    Returns:
        The ``.text`` of each line in document order. An entry is ``None``
        when the text element is empty.

    Raises:
        IndexError: If an element lacks the children this layout expects.
    """
    lines = []

    # Regions with at most two children contain no lines and are dropped.
    textRegions = [region for region in root[1] if len(region) > 2]

    for textRegion in textRegions:
        # Skip the coordinate data (first child) and the trailing child.
        for textLine in textRegion[1:-1]:
            lines.append(textLine[-1][0].text)  # Text equivalent for line
    return lines

def extractLinesForVol(vol: list[ET.Element]):
    """Extract the text lines of every page XML root in a volume.

    Args:
        vol: Parsed XML roots, one per file.

    Returns:
        A list with one entry per root, each being the list of lines that
        ``extractLines`` returns for it. Progress is shown through tqdm.
    """
    return [extractLines(root) for root in tqdm(vol)]

In [None]:
def extractLinesForVol(vol: list[xml.etree.ElementTree.Element]):
    """Return one list of extracted lines per XML root in `vol`.

    Same definition as the one in the preceding cell; running this cell
    rebinds the name. Iteration is wrapped in tqdm to show progress.
    """
    return list(map(extractLines, tqdm(vol)))

In [None]:
# Patterns used to pick fields out of the card text lines.
# All are raw strings so that backslash sequences such as \s and \. reach the
# regex engine verbatim instead of being treated as (invalid) string escapes.
caps_regex = re.compile(r"[A-Z][A-Z][A-Z]+")  # Run of three or more capitals
c_num_regex = re.compile(r"C\.[0-9]")  # C number title references
i_num_regex = re.compile(r"I[ABC]\.\s[0-9]")  # I number title references
date_regex = re.compile(r"1[45][0-9][0-9]")  # Date format regexes (specific to this volume)
# Shelfmark: digits, then a short word, then a short word/number, each
# separated by one or two spaces/full stops.
smark_regex = re.compile(r"[0-9]{1,5}[\s\.]{1,2}[\w]{1,3}[\s\.]{1,2}[\w0-9]{1,5}")
# Author: capitalised word(s) followed by a bracketed capitalised name, e.g. "WU (Xuan)".
author_regex = re.compile(r"[A-Z]+[\s]+\([A-Z][a-z]+\)")
# "ISBN", one whitespace character, then digits/hyphens/whitespace.
isbn_regex = re.compile(r"ISBN\s[0-9\-\s]+")

In [None]:
# One list of text lines per parsed XML root.
cards = extractLinesForVol(xmlroots)

In [None]:
# One row per XML file: its base name, its extracted lines, and a placeholder
# "dummy" column of None that is passed to find_author alongside the lines.
cards_df_v0 = pd.DataFrame(
    data={
        "xml": list(map(os.path.basename, xmls)),
        "lines": cards,
        "dummy": [None] * len(cards),
    }
)

In [None]:
def find_author(lines, dummy):
    """Pick the title and author out of a card's text lines.

    The author is the first line matching ``author_regex``. The title is
    then chosen relative to it, on the expectation that the title sits on
    the second line:

    * author on the third line or later: all lines between the first line
      and the author line, joined with spaces;
    * author on the second line: the third line;
    * author on the first line: no title;
    * no author found: the second line.

    Cards too short to have the required line yield ``None`` for the title
    instead of raising, and ``None`` entries in ``lines`` are skipped.

    Args:
        lines: Text lines of one card.
        dummy: Unused; kept so callers can pass two columns.

    Returns:
        ``(title, author)``, either of which may be ``None``.
    """
    author_idx = next(
        (
            idx
            for idx, line in enumerate(lines)
            if line is not None and author_regex.search(line)
        ),
        None,
    )

    if author_idx is None:
        # Default to the title being the second line, when there is one.
        title = lines[1] if len(lines) > 1 else None
        return title, None

    author = lines[author_idx]
    if author_idx >= 2:
        # Author is after the second line (where we expect the title).
        title = " ".join(line for line in lines[1:author_idx] if line is not None)
    elif author_idx == 1 and len(lines) > 2:
        # Author is the second line, so take the line after it.
        title = lines[2]
    else:
        title = None

    return title, author

In [None]:
def isbn_search(x):
    """Return the digits of the first ISBN found in a card's lines.

    Each line is searched on its own. Searching the concatenation of all
    lines would let a match at the end of one line absorb digits from the
    start of the next.

    Args:
        x: List of text lines; entries that are not strings are skipped.

    Returns:
        The ISBN as a string of digits with the "ISBN" label, hyphens and
        whitespace removed, or ``None`` when no line contains one.

    Raises:
        TypeError: If ``x`` is not a list.
    """
    if not isinstance(x, list):
        raise TypeError(f"List expected not {type(x)}")

    for line in x:
        if not isinstance(line, str):
            continue
        res = isbn_regex.search(line)
        if not res:
            continue
        # Keep digits only; this drops the label and every separator the
        # pattern allows.
        digits = "".join(ch for ch in res.group() if ch.isdigit())
        if digits:
            return digits
    return None

In [None]:
def shelfmark_search(x):
    """Return the shelfmark found in the first or second line of a card.

    Args:
        x: List of text lines. Only the first two are examined, the first
            taking priority; entries that are not strings are skipped.

    Returns:
        The first ``smark_regex`` match with spaces removed, or ``None``
        when neither line matches (including cards with no lines).
    """
    for line in x[:2]:
        if not isinstance(line, str):
            continue
        match = smark_regex.search(line)
        if match:
            return match.group().replace(" ", "")
    return None

In [None]:
# Shelfmark: taken from the first two lines of each card.
cards_df_v0["shelfmark"] = cards_df_v0["lines"].transform(shelfmark_search)
# find_author returns a (title, author) pair per row; the result still carries
# the input column labels "lines"/"dummy", hence the rename.
# NOTE(review): x[0]/x[1] index the row by position although its labels are
# strings; newer pandas releases may warn about this — confirm against the
# installed version.
t_a = cards_df_v0.loc[:,('lines', 'dummy')].transform(lambda x: find_author(x[0], x[1]), axis=1).rename(columns={"lines":"title", "dummy":"author"})
cards_df = cards_df_v0.drop(columns="dummy").join(t_a)
cards_df["isbn"] = cards_df["lines"].transform(lambda x:isbn_search(x))
# Titles are wrapped in double quotes when the search query is built, so any
# double quote inside a title is swapped for a single quote here.
cards_df["title"] = cards_df["title"].str.replace("\"", "\'")

In [None]:
# Summary of the extracted columns (dtypes and non-null counts).
cards_df.info()

In [None]:
# cards_df.to_csv("..\\data\\processed\\401_cards_no_oclc.csv", index=False)

## OCLC API Queries

In [None]:
# Reload the previously saved cards frame, replacing the one built above.
# NOTE: unpickling can execute arbitrary code — only load a file you produced yourself.
with open("401_cards.p", "rb") as f:
    cards_df = pickle.load(f)

In [None]:
cards_df

In [None]:
# Load variables from a .env file (if any), then read the API credentials
# from the environment; a missing variable raises KeyError.
load_dotenv()
client_id = os.environ["CLIENT_ID"]
client_secret = os.environ["CLIENT_SECRET"]

In [None]:
# Request an access token for the WorldCatMetadataAPI scope using the
# credentials read from the environment.
token = bw.WorldcatAccessToken(
    key=client_id,
    secret=client_secret,
    scopes="WorldCatMetadataAPI"
)

# Session object used for every WorldCat request in this notebook.
session = bw.MetadataSession(authorization=token)

In [None]:
def list_formats(recs):
    """List the (generalFormat, specificFormat) pair of each brief record.

    Args:
        recs: Search response as a dict; records are read from its
            "briefRecords" key.

    Returns:
        A list of 2-tuples, one per brief record (a missing key gives
        ``None`` in that position), or ``None`` when "briefRecords" is
        absent or empty.
    """
    brief_records = recs.get("briefRecords")
    if not brief_records:
        return None

    pairs = []
    for rec in brief_records:
        pairs.append((rec.get("generalFormat"), rec.get("specificFormat")))
    return pairs

# Format pairs per card, then the set of distinct pairs across all cards
# (summing a Series of lists concatenates them).
# NOTE(review): relies on a "brief_bibs" column already being present in the
# loaded frame; it is not created in the visible code.
formats = cards_df["brief_bibs"].apply(lambda x:list_formats(x))
set(formats.dropna().sum())

In [None]:
# item_subtypes = "artchap-artcl, book-mic, book-thsis, book-printbook, jrnl-print"

In [None]:
# itemSubType devised by looking at itemSubType for all returned records
# help.oclc.org/Librarian_Toolbox/Searching_WorldCat_Indexes/Bibliographic_records/Format_Document_Type_values_and_codes/WorldShare_and_WorldCat_Discovery
# Keyword arguments forwarded unchanged to each brief-bib search made through
# apply_search_brief_bib.
search_kwargs = {
    "inCatalogLanguage":None,
    "limit": 50,
    "orderBy": "bestMatch",
    "itemSubType": "artchap-artcl, book-mic, book-thsis, book-printbook, jrnl-print"
}

In [None]:
def _is_blank(value):
    """Return True for None, NaN, or a string that is empty/whitespace."""
    if value is None:
        return True
    if isinstance(value, float) and value != value:  # NaN from pandas
        return True
    return isinstance(value, str) and not value.strip()


def apply_search_brief_bib(ti=None, au=None, isbn=None, session=None, search_kwargs=None):
    """
    search_brief_bib applicable to df
    Known issue with specifying offset/limit
    So specify acceptable itemSubTypes and hope correct result is in first 50 records

    The ISBN is searched first when one is given. If that search is skipped
    or finds nothing, a title/author search is run using only the parts that
    are actually present, so a missing author no longer becomes the literal
    text "None" in the query.

    Args:
        ti: Title text, or None/NaN when unknown.
        au: Author text, or None/NaN when unknown.
        isbn: ISBN digits, or None/NaN when unknown.
        session: Object exposing ``brief_bibs_search(q=..., **kwargs)`` whose
            result has a ``json()`` method.
        search_kwargs: Extra keyword arguments for every search call;
            defaults to none.

    Returns:
        The decoded JSON response of the last search made. When there was
        nothing to search on, a zero-result dict with the keys the rest of
        the notebook reads ("numberOfRecords", "briefRecords").
    """
    # A fresh dict per call; a dict literal as the default would be shared.
    search_kwargs = {} if search_kwargs is None else search_kwargs

    result = None

    if not _is_blank(isbn):
        result = session.brief_bibs_search(q=f'bn:{isbn}', **search_kwargs).json()

    if result is None or result["numberOfRecords"] == 0:
        clauses = []
        if not _is_blank(ti):
            clauses.append(f'ti:"{ti}"')
        if not _is_blank(au):
            clauses.append(f'au:"{au}"')

        if clauses:
            query = " and ".join(clauses)
            result = session.brief_bibs_search(q=query, **search_kwargs).json()
        elif result is None:
            # Nothing to search on at all.
            result = {"numberOfRecords": 0, "briefRecords": []}

    return result

def apply_get_full_bib(brief_bibs, session):
    """Fetch and parse the full record behind every brief record.

    Args:
        brief_bibs: Search response dict with "numberOfRecords" and
            "briefRecords" (each record carrying an "oclcNumber").
        session: Object exposing ``bib_get(oclc_number)`` whose result has
            a ``text`` attribute containing MARCXML.

    Returns:
        ``None`` when the response reports zero records; otherwise a list
        holding the first record parsed from each fetched document, in
        brief-record order.
    """
    if brief_bibs["numberOfRecords"] == 0:
        return None

    # All documents are downloaded before any of them is parsed.
    oclc_numbers = (rec["oclcNumber"] for rec in brief_bibs["briefRecords"])
    matched_xml = [session.bib_get(number).text for number in oclc_numbers]
    return [marcxml.parse_xml_to_array(io.StringIO(doc))[0] for doc in matched_xml]

In [None]:
# Run the brief-bib search once per card (with the itemSubType filter in
# search_kwargs) and store the decoded response next to its record count.
cards_df["brief_bibs_subtyped"] = cards_df.progress_apply(lambda x: apply_search_brief_bib(x["title"], x["author"], x["isbn"], session=session, search_kwargs=search_kwargs), axis=1)
cards_df["num_records_subtyped"] = cards_df["brief_bibs_subtyped"].apply(lambda x: x["numberOfRecords"])

In [None]:
# Fetch and parse the full record for every brief-bib hit of each card
# (None for cards without hits).
cards_df["worldcat_matches_subtyped"] = cards_df["brief_bibs_subtyped"].progress_apply(apply_get_full_bib, session=session)
# Empty columns; presumably filled in once a match is chosen — not visible here.
cards_df["selected_match_ocn"], cards_df["selected_match"], cards_df["match_needs_editing"] = None, None, None 

In [None]:
def count_field_occurences(lst):
    """Return the tag of every field of every record in `lst`.

    Args:
        lst: Sequence of records exposing a ``fields`` iterable whose
            items have a ``tag``; may be empty or ``None``.

    Returns:
        A flat list of tags in record order, then field order (repeats
        kept); an empty list when `lst` is falsy.
    """
    if not lst:
        return []
    return [field.tag for record in lst for field in record.fields]

In [None]:
# Per card: the tags of every field in every matched record.
field_sum = cards_df["worldcat_matches_subtyped"].apply(count_field_occurences)

In [None]:
# Concatenate the per-card tag lists once and build a Series of ones indexed
# by tag, ready to be grouped and counted.
all_tags = field_sum.sum()
all_fields = pd.Series(index=all_tags, data=np.ones(len(all_tags)))
#all_fields.groupby().count()

In [None]:
# Occurrence count per tag; shows the first 50 rows of the grouped result.
all_fields.groupby(level=0).count().head(50)#.sort_values(ascending=False).head(50)

In [None]:
def check_field_contents(lst):
    """Gather the distinct 016 fields found across a list of records.

    Args:
        lst: Sequence of records exposing ``get_fields(tag)``; may be
            empty or ``None``.

    Returns:
        A list of the unique 016 fields (order not guaranteed, since it
        goes through a set), or ``None`` when `lst` is falsy or no record
        has a 016 field.
    """
    if not lst:
        return None

    nat_bib_ctrl = [field for record in lst for field in record.get_fields("016")]
    return list(set(nat_bib_ctrl)) if nat_bib_ctrl else None

In [None]:
# Per card: the distinct 016 fields of its matched records; cards with none are dropped.
nat_bib_agencies = cards_df["worldcat_matches_subtyped"].apply(check_field_contents).dropna()

In [None]:
def find_2_subfield(lst):
    """Return the "2" subfield values of each item in `lst`.

    Args:
        lst: Iterable of objects exposing ``get_subfields(code)``.

    Returns:
        A list with one entry per item: whatever ``get_subfields("2")``
        returned for it.
    """
    subfield_values = []
    for field in lst:
        subfield_values.append(field.get_subfields("2"))
    return subfield_values

In [None]:
# First "2" subfield value of every collected 016 field that has one.
subf_2 = pd.Series([x[0] for x in nat_bib_agencies.apply(find_2_subfield).sum() if x])

In [None]:
# Tally of how often each "2" subfield value occurs.
pd.Series(index=subf_2, data=np.ones(len(subf_2))).groupby(level=0).count()

In [None]:
# pickle.dump(cards_df, open("401_cards.p", "wb"))
# cards_df.to_csv("..\\data\\processed\\401_cards.csv")

### Explore bug with offset/limit
https://community.oclc.org/t5/oclc-apis-discussions/worldcat-metadata-search-brief-bibs-amp-limit-and-amp-offset/td-p/49236

In [None]:
# Repeat the search, without any extra search kwargs, for the first two cards
# that returned more than 50 records.
# NOTE(review): the "num_records" column is not created in the visible code
# (only "num_records_subtyped" is) — presumably it comes with the pickled
# frame; confirm.
big_ones = cards_df[cards_df["num_records"] > 50].iloc[:2].progress_apply(lambda x: apply_search_brief_bib(x["title"], x["author"], x["isbn"], session=session), axis=1)

In [None]:
big_ones

In [None]:
# Fixed title/author query used to probe the offset/limit behaviour.
query_test = 'ti:"FEI LONG QUAN ZHUAN" and au:"WU (Xuan)"'

In [None]:
# First request: up to 50 best-match brief records for the test query, and
# their OCLC numbers.
# NOTE(review): this cell calls session.search_brief_bibs, whereas
# apply_search_brief_bib calls session.brief_bibs_search — confirm which name
# the installed bookops_worldcat version provides.
first_50 = session.search_brief_bibs(q=query_test, inCatalogLanguage=None, limit=50, orderBy="bestMatch")
first_50_oclc = [x["oclcNumber"] for x in first_50.json()["briefRecords"]]

In [None]:
# Start the running list of OCLC numbers with a copy of the first request's numbers.
unique_oclcs = list(first_50_oclc)

In [None]:
# Second request: same query starting at offset 51. Note that it asks for 24
# records even though the variable is named second_20.
second_20 = session.search_brief_bibs(q=query_test, inCatalogLanguage=None, offset=51, limit=24, orderBy="bestMatch")

In [None]:
# Number of brief records each of the two requests actually returned.
len(first_50.json()["briefRecords"]), len(second_20.json()["briefRecords"])

In [None]:
# OCLC numbers returned by the second request.
second_20_oclc = [x["oclcNumber"] for x in second_20.json()["briefRecords"]]

In [None]:
# OCLC numbers from the second request that were already in the first one.
[x for x in second_20_oclc if x in first_50_oclc]

In [None]:
# Append the second request's numbers; nothing is de-duplicated at this point.
unique_oclcs.extend(second_20_oclc)

In [None]:
# found all the unique ones in one order by setting limit=24

# for x in range(1,25):
#     res = session.search_brief_bibs(q=query_test, inCatalogLanguage=None, offset=50, limit=x)
#     oclc = [y["oclcNumber"] for y in res.json()["briefRecords"]]
#     [unique_oclcs.append(z) for z in oclc if z not in unique_oclcs]

In [None]:
# Frame indexed by OCLC number; "idx" numbers the entries 1..N in the order
# they were collected.
offset_lim_df = pd.DataFrame(index=unique_oclcs)
offset_lim_df.insert(loc=0, column="idx", value=list(range(1, len(unique_oclcs) + 1)))

In [None]:
# Repeat the offset=51 request for every limit from 1 to 24 and record, per
# limit, the 1-based position at which each OCLC number came back.
for x in range(1,25):
    res = session.search_brief_bibs(q=query_test, inCatalogLanguage=None, offset=51, limit=x, orderBy="bestMatch")
    oclc = [y["oclcNumber"] for y in res.json()["briefRecords"]]
    # Column named after the limit; the join aligns it on OCLC number.
    oclc_series = pd.Series(index=oclc, data=[x for x in range(1, len(oclc) + 1)], name=str(x))
    offset_lim_df = offset_lim_df.join(oclc_series)

In [None]:
# Raise pandas' display limits so frames of up to 25 columns / 75 rows are
# shown without truncation.
pd.options.display.max_columns = 25
pd.options.display.max_rows = 75

In [None]:
# mwh_offset_lim_df = offset_lim_df.copy()

In [None]:
# Spot check: how many records come back for offset=51 with limit=20.
offset_test = session.search_brief_bibs(q=query_test, inCatalogLanguage=None, offset=51, limit=20, orderBy="bestMatch")
len(offset_test.json()["briefRecords"])

In [None]:
# Single-row frame of non-null counts per column, i.e. how many records each
# limit returned.
count_df = pd.DataFrame(offset_lim_df.count()).T.rename(index={0:"Count"})
# Companion row with the counts one would expect: np.arange(25) is spread over
# the 25 columns ("idx" plus "1".."24"), so each limit column holds its own
# limit; "idx" is then overwritten with 70 (presumably the number of distinct
# records expected — confirm).
exp_count_df = count_df.rename(index={"Count": "Expected Count"})
exp_count_df.loc["Expected Count"] = np.arange(25)
exp_count_df.loc["Expected Count", "idx"] = 70

In [None]:
# Stack the position table on top of the two count rows and label both axes
# for display.
display_df = pd.concat([offset_lim_df, count_df, exp_count_df]).rename(columns={"idx":"Record#"}).rename_axis(index="oclcNumber", columns="&limit=")
display_df

In [None]:
# Style the table for display: hide the rows between the two given OCLC
# numbers, drop decimals, shade the limit columns "1".."24" for the selected
# rows by value, and give missing cells a white highlight.
idx = pd.IndexSlice
slice_ = idx[idx["760523067":"300845905"], idx["1":"24"]]
display_df.style.hide(subset=slice("680615393", "300642591"), axis=0) \
            .format(precision=0) \
            .background_gradient(axis=None, vmin=0, vmax=40, cmap="BuGn", subset=slice_).highlight_null("white")

### Result selection

Once the results are back from WorldCat we need to pick which record to use if multiple have been returned. For Urdu this is less of an issue as fewer results are returned. For Chinese there are in some instances upwards of 500 records, so we do need to select.

Victoria Morris suggested:
- Leader position 17
    - Encoding level, avoid 3/5/7
- 016 where \$2 subfield contains Uk (existing BL holding)
    - 016 is the National Bibliographic Agency Control Number; \$2Uk indicates it's the BL
- 040 
    - Cataloguing source (if "OCLC" appears lots of times then that's good)
- 042
    - Authentication code (quality control process)
- 100/110 and 245 (Searching on these)
    - Author/Corporate Author and Title
- 264/260
    - New (264) / old (260) format publication information - 260 preferable to 264
- 300
    - Physical description
- 6XX 
    - Subject access
- 880 fields (linked to 100/245 or otherwise)
    - Linked to other fields, indicating original text formats

In [None]:
def n_str(record, string):
    """Count the tokens of a record's first 040 field that contain `string`.

    Args:
        record: A ``Record``; anything else yields ``None``.
        string: Substring to look for, e.g. "OCLC".

    Returns:
        The number of space-separated tokens of the first 040 field's text
        that contain `string`; 0 when the record has no 040 field; ``None``
        when `record` is not a ``Record``.
    """
    if not isinstance(record, Record):
        return None

    f040 = record.get_fields("040")
    if not f040:
        # No cataloguing-source field, so there is nothing to count.
        return 0

    return sum(string in token for token in f040[0].text().split(" "))

    
def len_record(record):
    """Return how many fields `record.get_fields()` yields."""
    all_fields = record.get_fields()
    return len(all_fields)


def apply_n_str(x, string):
    """Apply ``n_str`` to every value of a dict of records.

    Args:
        x: Dict whose values are records; any other type (exact ``dict``
            is required) yields ``None``.
        string: Substring passed through to ``n_str``.

    Returns:
        A list of ``n_str`` results in the dict's value order, or ``None``.
    """
    if type(x) is not dict:
        return None
    return [n_str(record, string) for record in x.values()]

    
def apply_len_record(x):
    """Apply ``len_record`` to every value of a dict of records.

    Returns a list of field counts in the dict's value order, or ``None``
    when `x` is not exactly a ``dict``.
    """
    if type(x) is not dict:
        return None

    counts = []
    for record in x.values():
        counts.append(len_record(record))
    return counts
    
    
def blx_holdings_exist(record):
    """Flag whether a record shows an existing BLX holding.

    A holding is reported only when both conditions are met:

    * the first 948 field (if any) does not say "NO HOLDINGS IN BLX";
    * at least one 016 field carries ``$2Uk`` in its string form.

    Records without a 016 field are reported as having no holding, and a
    missing 948 field is treated as "not marked as no holdings". Every 016
    field is checked, since the ``$2Uk`` one need not come first.

    Args:
        record: Object exposing ``get_fields(tag)``; 948 fields must offer
            ``text()`` and 016 fields a string representation.

    Returns:
        1 when a holding is indicated, otherwise 0.
    """
    f948 = record.get_fields("948")
    no_holdings = bool(f948) and "NO HOLDINGS IN BLX" in f948[0].text()

    has_bl_number = any("$2Uk" in str(field) for field in record.get_fields("016"))

    return 1 if has_bl_number and not no_holdings else 0

In [None]:
# NOTE(review): all_records is not defined in the visible part of the
# notebook — confirm where it is built.
len(all_records)

In [None]:
# 1/0 holdings flag for every record in all_records.
blx_holdings = [blx_holdings_exist(r) for r in all_records]

In [None]:
# Number of records flagged as having a BLX holding.
sum(blx_holdings)

In [None]:
# Per-entry lists: "OCLC" and "BLX" token counts in field 040, and the number
# of fields per record.
# NOTE(review): `res` must be a pandas object whose entries are dicts of
# records for these calls to work; the only visible assignment to `res` is a
# search response — confirm where it is defined.
oclc_count = res.apply(apply_n_str, string="OCLC")
bl_count = res.apply(apply_n_str, string="BLX")
record_len = res.apply(apply_len_record)

In [None]:
# Attach the per-card lists to the cards frame.
cards_df["040_oclc_count"] = oclc_count
cards_df["040_bl_count"] = bl_count
cards_df["record_len"] = record_len

In [None]:
# Total "BLX" occurrences: the inner .sum() concatenates the per-card lists,
# the outer sum adds the individual counts.
sum(cards_df['040_bl_count'].dropna().sum())

In [None]:
# Summary statistics of the number of records per card (length of each per-card list).
record_len.dropna().apply(lambda x: len(x)).describe()

In [None]:
# Histogram of the number of records per card.
record_len.dropna().apply(lambda x: len(x)).hist(bins=40)

In [None]:
# Cards for which an "OCLC" count list was computed.
cards_df.dropna(subset=["040_oclc_count"])

In [None]:
# One row per record: the per-card lists are concatenated (.sum() on a Series
# of lists) so the "OCLC" count and field count line up record by record.
oclc_count_df = pd.DataFrame(
    data={
        "040_oclc_count": cards_df['040_oclc_count'].dropna().sum(),
        "record_len": cards_df['record_len'].dropna().sum(),
    }
)

In [None]:
# (number of records, 2)
oclc_count_df.shape

In [None]:
# Scatter plot of fields per record (x) against "OCLC" occurrences in field 040 (y).
# NOTE(review): "Occurences"/"occurences" in the axis label and title is a
# misspelling of "occurrences"; left untouched here because it is rendered text.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter("record_len", "040_oclc_count", data=oclc_count_df)
ax.set_xlabel("Fields in record", fontsize='x-large')
ax.set_ylabel("Occurences of 'OCLC' in MARC field 040", fontsize='x-large')
ax.set_title("Relationship between number of occurences of 'OCLC'\nin MARC field 040 in a record and total fields in a record", fontsize="x-large")
ax.tick_params(labelsize="x-large")

In [None]:
# fig.savefig("..//reports//figures//OCLC_record_length.png", dpi=300, bbox_inches="tight")