# Extract information from Chinese catalogue card xmls

In [None]:
from __future__ import annotations
import sys
if '../' not in sys.path:
    sys.path.append('../')
import os
import glob
import re 
import xml.etree.ElementTree as ET
from IPython.display import Image
from numpy import object_
import pandas as pd
import requests
import pickle
import matplotlib.pyplot as plt
from z3950.PyZ3950 import zoom
from z3950.Marc.marc_tools import Record
from tqdm import tqdm

## Parsing xmls

In [None]:
# Root folder of the raw Chinese catalogue card data, relative to this notebook.
# Built from components so the separator matches the host OS; a hard-coded
# backslash path only resolves on Windows.
p5_local = os.path.join("..", "data", "raw", "chinese")

In [None]:
# Collect the page XML files for volume 1016992, trying the directory listing
# up to three times before giving up.
# page_xml_loc = os.path.join(p5_root, "page")
page_xml_loc = os.path.join(p5_local, "1016992")
xml_pattern = os.path.join(page_xml_loc, "*.pxml")
for attempts in range(3):
    xmls = glob.glob(xml_pattern)
    if xmls:
        break
else:
    # Every attempt returned an empty listing
    raise IOError(f"Failed to connect to {page_xml_loc}")

xmls

In [None]:
# Parse every page XML file and keep its root element. Each file is tried up
# to three times if it cannot be found before the whole run is aborted.
print(f"\nGetting xml roots from {page_xml_loc}")
xmlroots = []
for xml_file in tqdm(xmls):
    xml_path = os.fsdecode(xml_file)
    for attempts in range(3):
        try:
            parsed = ET.parse(xml_path)
        except FileNotFoundError:
            continue
        break
    else:
        raise FileNotFoundError(f"Failed to connect to: {xml_path}")
    xmlroots.append(parsed.getroot())

In [None]:
# Number of XML roots collected by the parsing loop (one per parsed file)
len(xmlroots)

In [None]:
def extractLines(root: xml.etree.ElementTree.Element):
    """Return the text of every line found under the second child of ``root``.

    Children of ``root[1]`` with two or fewer sub-elements are skipped (treated
    as empty text regions). In the remaining regions the first child
    (presumably coordinate data) and the last child are ignored; every element
    in between is treated as a text line whose text lives in the first child
    of its last child.

    Args:
        root: Root element of one parsed page XML document.

    Returns:
        A flat list with one entry per text line, in document order.
    """
    collected = []
    for region in root[1]:
        if len(region) <= 2:
            # Nothing between the leading and trailing child: no text lines
            continue
        collected.extend(line[-1][0].text for line in region[1:-1])
    return collected

def extractLinesForVol(vol: list[xml.etree.ElementTree.Element]):
    """Run ``extractLines`` over every page root of a volume.

    Args:
        vol: Root elements of the parsed page XML documents.

    Returns:
        A list with one list of text lines per root, in the order given.
        Progress is reported through ``tqdm``.
    """
    return [extractLines(page_root) for page_root in tqdm(vol)]

In [None]:
def extractLinesForVol(vol: list[xml.etree.ElementTree.Element]):
    """Extract the text lines of each page root in ``vol``.

    Args:
        vol: Root elements of the parsed page XML documents.

    Returns:
        One list of lines per root, in input order (progress shown via tqdm).
    """
    per_page_lines = []
    for page_root in tqdm(vol):
        per_page_lines.append(extractLines(page_root))
    return per_page_lines

In [None]:
# Patterns used to pick fields out of the card text lines.
# Raw strings keep regex escapes such as \. and \s out of Python's own
# string-escape handling, which warns about unknown escapes in normal literals.
caps_regex = re.compile(r"[A-Z][A-Z][A-Z]+")  # Three or more consecutive capital letters
c_num_regex = re.compile(r"C\.[0-9]")  # C number title references
i_num_regex = re.compile(r"I[ABC]\.\s[0-9]")  # I number title references
date_regex = re.compile(r"1[45][0-9][0-9]")  # Date format regexes (specific to this volume)
# Shelfmark: digits, a short alphanumeric part and a final part, separated by spaces and/or dots
smark_regex = re.compile(r"[0-9]{1,5}[\s\.]{1,2}[\w]{1,3}[\s\.]{1,2}[\w0-9]{1,5}")
# Author: capital letters followed by a capitalised word in parentheses, e.g. "WANG (Li)"
author_regex = re.compile(r"[A-Z]+[\s]+\([A-Z][a-z]+\)")
# ISBN: the literal "ISBN", one whitespace character, then digits, dashes and whitespace
isbn_regex = re.compile(r"ISBN\s[0-9\-\s]+")

In [None]:
# One list of text lines per parsed page XML (one entry per root in xmlroots)
cards = extractLinesForVol(xmlroots)

In [None]:
# One row per card: source file name, extracted text lines and an all-None
# placeholder column ("dummy") that is consumed later when title/author are derived.
cards_df_v0 = pd.DataFrame(
    data={
        "xml": list(map(os.path.basename, xmls)),
        "lines": cards,
        "dummy": [None] * len(cards),
    }
)

In [None]:
def find_author(lines, dummy):
    """Pick the title and author out of the text lines of one card.

    The first line matching ``author_regex`` is taken as the author. The title
    is then chosen relative to that line:

    * author on the third line or later: lines 2..author-1 joined by spaces;
    * author on the second line: the third line;
    * author on the first line: no title;
    * no author found: the second line.

    Cards that are too short for the chosen rule yield a ``None`` title rather
    than raising, and entries that are not strings (e.g. ``None`` from empty
    XML text nodes) are ignored.

    Args:
        lines: Text lines of the card, in reading order.
        dummy: Unused; present so the function can be fed two DataFrame columns.

    Returns:
        A ``(title, author)`` tuple; either element may be ``None``.
    """
    author, author_idx = None, None
    for i, line in enumerate(lines):
        if isinstance(line, str) and author_regex.search(line):
            author, author_idx = line, i
            break

    if author is None:
        title_parts = lines[1:2]  # default to the title being the second line
    elif author_idx >= 2:
        title_parts = lines[1:author_idx]  # everything between first line and author
    elif author_idx == 1:
        title_parts = lines[2:3]  # author is the second line, title follows it
    else:
        title_parts = []  # author on the first line: no title can be inferred

    # Slices never raise on short cards; drop non-string entries before joining
    title_parts = [part for part in title_parts if isinstance(part, str)]
    title = " ".join(title_parts) if title_parts else None

    return title, author

In [None]:
def isbn_search(x):
    """Return the first ISBN found in the lines of a card, or ``None``.

    The lines are concatenated without a separator and searched with
    ``isbn_regex``; dashes and spaces are removed from the match and the
    leading "ISBN" label is cut off.

    Args:
        x: List of text lines; entries that are not strings are skipped.

    Returns:
        The ISBN characters as a string, or ``None`` when nothing matches.

    Raises:
        TypeError: If ``x`` is not a list.
    """
    if not isinstance(x, list):
        raise TypeError(f"List expected not {type(x)}")

    text = "".join(line for line in x if isinstance(line, str))
    match = isbn_regex.search(text)
    if match is None:
        return None

    compact = match.group().replace("-", "").replace(" ", "")
    # Remove the label as a prefix; str.lstrip("ISBN") strips a *set* of characters
    if compact.startswith("ISBN"):
        compact = compact[len("ISBN"):]
    return compact

In [None]:
def shelfmark_search(x):
    """Look for a shelfmark in the first two lines of a card.

    Args:
        x: List of text lines for one card. Cards with fewer than two lines
            are handled; entries that are not strings are skipped.

    Returns:
        The first ``smark_regex`` match with spaces removed, taken from the
        first line if it matches, otherwise from the second; ``None`` when
        neither line matches.
    """
    for line in x[:2]:
        if not isinstance(line, str):
            continue
        match = smark_regex.search(line)
        if match:
            return match.group().replace(" ", "")
    return None

In [None]:
# Shelfmark: searched in the first two lines of each card
cards_df_v0["shelfmark"] = cards_df_v0["lines"].transform(shelfmark_search)
# Title/author: find_author is applied row-wise over the ("lines", "dummy") pair and
# returns (title, author); the two resulting columns are then renamed accordingly.
# NOTE(review): presumably "dummy" exists only so the row-wise transform has two
# columns in and two values out — confirm against the pandas version in use.
t_a = cards_df_v0.loc[:,('lines', 'dummy')].transform(lambda x: find_author(x[0], x[1]), axis=1).rename(columns={"lines":"title", "dummy":"author"})
# Replace the placeholder column with the derived title/author columns
cards_df = cards_df_v0.drop(columns="dummy").join(t_a)
# ISBN: searched over all lines of the card
cards_df["ISBN"] = cards_df["lines"].transform(lambda x:isbn_search(x))
# Swap double quotes for single quotes in titles (OCLC_query wraps the title in double quotes)
cards_df["title"] = cards_df["title"].str.replace("\"", "\'")

In [None]:
# Display the rows with no missing values; the result is not assigned, so cards_df is unchanged
cards_df.dropna()

In [None]:
# Print a summary of cards_df (columns, non-null counts, dtypes)
cards_df.info()

In [None]:
def OCLC_query(title="", author="", ISBN=None):
    """Search WorldCat over Z39.50 for one card.

    An ISBN query is tried first when ``ISBN`` is truthy; if it yields nothing
    (or no ISBN was given) a combined title/author query is run instead. Each
    query string is printed before it is sent.

    Args:
        title: Title to search for; it is placed inside double quotes in the query.
        author: Author to search for; ``None`` is treated as an empty string.
        ISBN: Optional ISBN to try before the title/author search.

    Returns:
        * a dict mapping result position to record when the search returns results;
        * the string ``"Bib1Err"`` or ``"ProtocolError"`` when reading the
          results raises the corresponding ``zoom`` exception;
        * ``None`` when there are no results or reading them raises ``TypeError``.

    The connection is closed on every path, including when an exception is raised.
    """
    # TODO Connection currently only handles 450 results at once before closing, extend or allow to reopen after 450
    # NOTE(review): credentials are hard-coded in source; consider loading them from configuration.
    conn = zoom.Connection(
        host='zcat.oclc.org',
        port=210,
        user='100270667',
        password='oclccat',
        databaseName='OLUCWorldCat',
        preferredRecordSyntax='USMARC',
        charset="UTF-8"
    )
    try:
        res = None

        if author is None:
            author = ""

        if ISBN:
            q = f'isbn="{ISBN}"'
            print(q)
            res = conn.search(zoom.Query(typ="CCL", query=q))

        # Fall back to title/author when the ISBN search was skipped or empty
        if not res:
            q = f'ti="{title}" and au="{author}"'
            print(q)
            res = conn.search(zoom.Query(typ="CCL", query=q))

        if not res:
            return None

        # Reading the records is what can fail, so only that step is guarded
        try:
            return dict(enumerate(res))
        except zoom.Bib1Err:
            print("Bib1Err")
            return "Bib1Err"
        except zoom.ProtocolError:
            print("ProtocolError - likely Diag")
            return "ProtocolError"
        except TypeError:
            print("Diag error")
            return None
    finally:
        # Previously the error branches returned without closing the connection
        conn.close()


In [None]:
# Query WorldCat for the cards labelled 245 to 255, passing each row's title, author and ISBN
res = cards_df.loc[245:255].apply(lambda x: OCLC_query(x['title'], x['author'], x['ISBN']),axis=1)

In [None]:
# Cache the query results on disk; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("401_res.p", 'wb') as res_file:
    pickle.dump(res, res_file)

In [None]:
# res = pickle.load(open("res.p", "rb"))  # pickled it because it takes about 10 mins to run the query

In [None]:
# Flatten every result dict in res into a single list of records.
# NOTE(review): presumably the .str.contains(...).isna() mask keeps only the
# non-string entries (dicts and missing values) and dropna() then removes the
# missing ones — confirm that the "Bib1Err" sentinel is excluded as intended.
# The final .sum() concatenates the per-card lists of dict values.
all_records = res[res.str.contains("Error").isna()].dropna().apply(lambda x:[v for v in x.values()]).sum()

In [None]:
# Store the query results next to the cards they belong to.
# NOTE(review): res only covers the queried rows; other rows presumably end up missing — confirm.
cards_df['worldcat_result'] = res

In [None]:
# Print the first record returned for card 0.
# NOTE(review): the query cell above only covers rows 245-255, so row 0 may hold
# no result unless res was loaded from an earlier, fuller run — confirm.
print(cards_df.loc[0, "worldcat_result"][0])

In [None]:
# Each entry is [row position among the cards that have a WorldCat result,
# followed by positions of records within that card's result dict]; this is how
# the print loop further down consumes it. Presumably hand-picked matches — TODO confirm.
sample_match_indices = [
    [0, 0, 1, 3],
    [1, 0, 1, 2],
    [6, 0, 1, 2]
]

In [None]:
# Display the seventh card (position 6) among those that have a WorldCat result
cards_df.dropna(subset="worldcat_result").iloc[6]

In [None]:
# NOTE(review): idx is only assigned by the for-loop in a later cell, so this
# raises NameError when the notebook is run top to bottom in a fresh kernel.
idx

In [None]:
# Display one cell of the second card that has a WorldCat result.
# NOTE(review): -4 is a positional column index, so the column shown depends on
# how many columns cards_df has when this cell runs — presumably meant to be
# "worldcat_result"; confirm.
cards_df.dropna(subset="worldcat_result").iloc[1, -4]

In [None]:
# For each hand-picked sample, print one card field followed by the MARC 001
# field of the selected WorldCat records.
# The result column is selected by label: a negative positional index points at
# a different column depending on which derived columns have been added so far.
matched_cards = cards_df.dropna(subset="worldcat_result")
for idx in sample_match_indices:
    # NOTE(review): positional column 4 is kept as-is — presumably "author"; confirm
    print(matched_cards.iloc[idx[0], 4])
    sample = matched_cards.iloc[idx[0]]["worldcat_result"]
    sample_records = list(sample.values())
    f001s = [sample_records[pos].get_fields("001")[0] for pos in idx[1:]]
    for f001 in f001s:
        print(f001)

In [None]:
# Print the MARC 001 field of every record returned for the seventh matched card.
# The result column is selected by label so the lookup does not depend on how
# many columns cards_df has when the cell runs.
sample1 = cards_df.dropna(subset="worldcat_result").iloc[6]["worldcat_result"]
for sample_record in sample1.values():
    print(sample_record.get_fields("001")[0])

### Result selection

Once the results are back from WorldCat, we need to pick which record to use when multiple have been returned. For Urdu this is less of an issue, as fewer results are returned. For Chinese there are in some instances upwards of 500 records, so we do need to select.

Victoria Morris suggested:
- Leader position 17
    - Encoding level, avoid 3/5/7
- 016 where \$2 subfield contains Uk (existing BL holding)
    - 016 is the National Bibliographic Agency Control Number; $2Uk indicates that the agency is the BL
- 040 
    - Cataloguing source (if "OCLC" appears lots of times then that's good)
- 042
    - Authentication code (quality control process)
- 100/110 and 245 (Searching on these)
    - Author/Corporate Author and Title
- 264/260
    - New/old format publication information (264 is the newer field, 260 the older one) - 260 preferable to 264
- 300
    - Physical description
- 6XX 
    - Subject access
- 880 fields (linked to 100/245 or otherwise)
    - Linked to other fields, indicating original text formats

In [None]:
def n_str(record, string):
    """Count the space-separated tokens of the first 040 field that contain ``string``.

    Args:
        record: A MARC ``Record``; anything else yields ``None``.
        string: Substring to look for in each token of the 040 field text.

    Returns:
        The number of matching tokens, ``0`` when the record has no 040 field,
        or ``None`` when ``record`` is not a ``Record``.
    """
    if not isinstance(record, Record):
        return None

    fields_040 = record.get_fields("040")
    if not fields_040:
        # No cataloguing-source field at all: nothing to count
        return 0

    tokens = fields_040[0].text().split(" ")
    return sum(string in token for token in tokens)

    
def len_record(record):
    """Return how many fields ``record.get_fields()`` reports for the record."""
    all_fields = record.get_fields()
    return len(all_fields)


def apply_n_str(x, string):
    """Apply ``n_str`` to every record of one query result.

    Args:
        x: A query result; only plain ``dict`` results are processed.
        string: Substring passed through to ``n_str``.

    Returns:
        A list with one ``n_str`` value per dict value, or ``None`` when ``x``
        is not exactly a ``dict`` (e.g. an error sentinel or a missing value).
    """
    if type(x) is not dict:
        return None

    counts = []
    for record in x.values():
        counts.append(n_str(record, string))
    return counts

    
def apply_len_record(x):
    """Apply ``len_record`` to every record of one query result.

    Args:
        x: A query result; only plain ``dict`` results are processed.

    Returns:
        A list with one field count per dict value, or ``None`` when ``x`` is
        not exactly a ``dict``.
    """
    return [len_record(record) for record in x.values()] if type(x) is dict else None
    
    
def blx_holdings_exist(record):
    """Flag whether a record looks like an existing BL holding.

    A record counts as held (``1``) only when its first 948 field does not
    state "NO HOLDINGS IN BLX" and its first 016 field contains "$2Uk".
    A missing 948 field is treated as making no such statement; a missing 016
    field is treated as lacking the BL marker.

    Args:
        record: Object exposing ``get_fields(tag)``.

    Returns:
        ``1`` when holdings appear to exist, otherwise ``0``.
    """
    fields_948 = record.get_fields("948")
    no_holdings = bool(fields_948) and "NO HOLDINGS IN BLX" in fields_948[0].text()

    # Evaluated unconditionally so the flag is always bound, even without a 016 field
    fields_016 = record.get_fields("016")
    has_bl_number = bool(fields_016) and "$2Uk" in str(fields_016[0])

    if no_holdings or not has_bl_number:
        return 0
    return 1

In [None]:
# Total number of records in the flattened all_records list
len(all_records)

In [None]:
# 0/1 holdings flag for every retrieved record, in the order of all_records
blx_holdings = list(map(blx_holdings_exist, all_records))

In [None]:
# Number of records flagged 1 by blx_holdings_exist
sum(blx_holdings)

In [None]:
# Per-card lists, one value per returned record (None for entries that are not dicts):
# tokens of field 040 containing "OCLC", tokens containing "BLX", and the field count
oclc_count = res.apply(apply_n_str, string="OCLC")
bl_count = res.apply(apply_n_str, string="BLX")
record_len = res.apply(apply_len_record)

In [None]:
# Attach the per-record statistics to the cards table
cards_df["040_oclc_count"] = oclc_count
cards_df["040_bl_count"] = bl_count
cards_df["record_len"] = record_len

In [None]:
# Total "BLX" token count over all records: the inner .sum() concatenates the
# per-card lists, the outer sum adds the individual counts.
# NOTE(review): assumes every list entry is a number (n_str returns None for non-Record values) — confirm.
sum(cards_df['040_bl_count'].dropna().sum())

In [None]:
# Summary statistics of how many records were returned per card
record_len.dropna().apply(len).describe()

In [None]:
# Histogram (40 bins) of how many records were returned per card
record_len.dropna().apply(len).hist(bins=40)

In [None]:
# Display the cards that have a value in the "040_oclc_count" column
cards_df.dropna(subset=["040_oclc_count"])

In [None]:
# One row per retrieved record: each .sum() concatenates the per-card lists into
# one flat list, so the two columns line up record by record.
# NOTE(review): assumes both columns hold lists of equal length for the same cards — confirm.
oclc_count_df = pd.DataFrame(
    data={
        "040_oclc_count": cards_df['040_oclc_count'].dropna().sum(),
        "record_len": cards_df['record_len'].dropna().sum(),
    }
)

In [None]:
# (rows, columns) of the per-record table
oclc_count_df.shape

In [None]:
# Scatter plot of the number of fields in a record (x) against the number of
# "OCLC" tokens in its 040 field (y), one point per retrieved record
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter("record_len", "040_oclc_count", data=oclc_count_df)
ax.set_xlabel("Fields in record", fontsize='x-large')
ax.set_ylabel("Occurences of 'OCLC' in MARC field 040", fontsize='x-large')
ax.set_title("Relationship between number of occurences of 'OCLC'\nin MARC field 040 in a record and total fields in a record", fontsize="x-large")
ax.tick_params(labelsize="x-large")

In [None]:
# fig.savefig("..//reports//figures//OCLC_record_length.png", dpi=300, bbox_inches="tight")