# Process Transkribus xml to Worldcat matched record

This notebook is part of the convert-a-card project, and was originally developed by Giorgia Tolfo, with some refactoring and extension by Harry Lloyd. The notebook is designed to:
- Parse author/title/shelfmark from Page xml data exported from automatic transcription of Urdu and Pinyin catalogue cards in Transkribus.
- Query OCLC worldcat for matches.
- Assign a confidence score to these matches.
- If no match found, set up a minimal record using data extracted from the card.
- (Possibly) allow curators to select the best match if multiple matches are returned.

In [None]:
import glob
import re 
from urllib.parse import quote
import sys
import xml.etree.ElementTree as ET
import pandas as pd
# import lxml
# from lxml import etree
from bs4 import BeautifulSoup
import requests
from z3950.PyZ3950 import zoom
import filecmp
import os

In [None]:
# XML namespace of the PAGE schema, already wrapped in braces so it can be
# prefixed directly onto a tag name (e.g. ns + 'TextRegion').
ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"  # xml namespace

# PAGE XML files for Urdu drawer 2 exported from Transkribus.
# os.path.join builds the pattern with the platform's own separator (the
# hard-coded backslashes only matched on Windows), and sorted() makes the
# order -- and therefore cards[0] -- reproducible, since glob returns matches
# in arbitrary order.
cards = sorted(
    glob.glob(os.path.join('.', 'urdu_cards_pages', '1032971', 'U2_for_Transkribus', 'page', '*.xml'))
)

In [None]:
# Show the first path returned by the glob (raises IndexError if nothing matched).
cards[0]

In [None]:
# Parse the first card and keep its root element for the exploratory cells below.
tree = ET.parse(cards[0])
root = tree.getroot()

In [None]:
# Number of direct child elements under the document root.
len(root)

In [None]:
# Namespace-qualified tag of the root's first child.
root[0].tag

In [None]:
# Namespace-qualified tag of the root's second child.
root[1].tag

In [None]:
# Attribute dictionary of the root's second child.
root[1].attrib

In [None]:
# Attribute dictionary of the root element itself.
root.attrib

In [None]:
# Extract shelfmark / title / author text from every card's PAGE XML.
# Produces `records`: one dict per card with the source path and three lists
# of strings (a card may have zero, one, or several regions of each kind).

# XPath from a TextRegion to the Unicode text of its first TextLine, built
# from `ns` so the namespace is spelled out in one place only.
unicode_xpath = '/'.join(['.', ns + 'TextLine', ns + 'TextEquiv', ns + 'Unicode'])

records = []

for f in cards:
    print(f + '\n')
    tree = ET.parse(f)
    root = tree.getroot()

    shelfmarks = []
    titles = []
    authors = []
    # Keyword looked for in a region's `custom` attribute -> list collecting its text.
    fields = {'shelfmark': shelfmarks, 'title': titles, 'author': authors}

    for TextRegion in root.iter(ns + 'TextRegion'):
        # A region without a `custom` attribute carries none of the keywords;
        # .get() lets it be skipped instead of raising KeyError.
        custom = TextRegion.attrib.get('custom', '')
        el_text = TextRegion.find(unicode_xpath)
        if el_text is None:
            continue
        for keyword, values in fields.items():
            # Substring match, so a region tagged with several keywords
            # contributes to each matching list.
            if keyword in custom:
                # .text is None when the Unicode element is empty; kept as-is.
                values.append(el_text.text)

    print('Shelfmark: ', shelfmarks)
    print('Title: ', titles)
    print('Author: ', authors)

    record = {'card_xml': f, 'title': titles, 'author': authors, 'shelfmark': shelfmarks}
    records.append(record)
    print(record, '\n\n')

In [None]:
# One row per parsed card, ordered by the path of the XML file it came from.
xml_df = (
    pd.DataFrame(records)
    .sort_values(by=['card_xml'])
)
xml_df

In [None]:
# xml_df.to_csv('test_with_lxml_new.csv')

In [None]:
def test_OCLC_query(title, author=None):
    """Run a CCL search against OCLC WorldCat over Z39.50 and print every hit.

    This replaces two same-named definitions (title-only and title+author),
    of which only the later one was ever reachable.

    Parameters
    ----------
    title : str
        Value searched in the ``ti`` index.
    author : str, optional
        Value searched in the ``au`` index. When omitted, only the title is
        searched.

    Returns
    -------
    None
        Each record is printed, preceded by a ``Record <i>`` line.
    """
    ccl = f'ti="{title}"'
    if author is not None:
        ccl += f' and au="{author}"'

    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment before sharing this file.
    conn = zoom.Connection('zcat.oclc.org', 210, user='100270667', password='oclccat')
    try:
        conn.databaseName = 'OLUCWorldCat'
        conn.preferredRecordSyntax = 'USMARC'

        res = conn.search(zoom.Query(ccl))
        for i, r in enumerate(res):
            print(f"Record {i}")
            print(str(r))
    finally:
        # Close the connection even if the search or printing fails.
        conn.close()
    
    
def OCLC_query(title, author):
    """Search OCLC WorldCat by title and author and return the matching records.

    Parameters
    ----------
    title : str
        Value searched in the ``ti`` index.
    author : str
        Value searched in the ``au`` index.

    Returns
    -------
    dict or None
        ``{position: record}`` for every record in the result set, or
        ``None`` when the search returns nothing.
    """
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment before sharing this file.
    conn = zoom.Connection('zcat.oclc.org', 210, user='100270667', password='oclccat')
    try:
        conn.databaseName = 'OLUCWorldCat'
        conn.preferredRecordSyntax = 'USMARC'

        query = zoom.Query(f'ti="{title}" and au="{author}"')
        res = conn.search(query)
        # Read the records while the connection is still open: upstream
        # PyZ3950 result sets fetch records through their connection.
        # NOTE(review): confirm the cut-down zoom module behaves the same way.
        res_dict = dict(enumerate(res))
    finally:
        # Close the connection on success and on error alike.
        conn.close()

    return res_dict or None
    
    
def test_BL_query(blid='018948571'):
    """Look up one record in the British Library Z39.50 catalogue and print it.

    Parameters
    ----------
    blid : str, optional
        Identifier searched in the ``id`` index.

    Returns
    -------
    None
        Every record in the result set is printed.
    """
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment before sharing this file.
    conn = zoom.Connection('z3950cat.bl.uk', 9909, user='COLMET2912', password='2m5v2Qyv')
    try:
        conn.databaseName = 'ZBLACU'
        conn.preferredRecordSyntax = 'USMARC'
        res = conn.search(zoom.Query(f'id="{blid}"'))
        for r in res:
            print(str(r))
    finally:
        # Close the connection even if the search or printing fails.
        conn.close()

    
def remove_duplicates(x):
    """Return the items of ``x`` as a list with repeats dropped, keeping first-seen order.

    Items must be hashable.
    """
    seen = set()
    unique = []
    for item in x:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

In [None]:
# Connectivity check: search WorldCat for a fixed title/author pair and print the hits.
test_OCLC_query("Madame Bovary", 'Flaubert')

In [None]:
# The 'title' and 'author' cells hold lists of strings (one entry per matching
# region on the card), so pass the first entry of each; passing the list itself
# would put its repr, brackets and quotes included, into the CCL query.
OCLC_query(xml_df.loc[0, 'title'][0], xml_df.loc[0, 'author'][0])

In [None]:
# Preview the first rows of the parsed-card table.
xml_df.head()

In [None]:
# First row by position (after the sort on 'card_xml'), shown as a Series.
xml_df.iloc[0]

In [None]:
# Query WorldCat once per card using the first title and first author found.
# A card with no title or no author yields None (the same value OCLC_query
# returns for "no match") instead of raising IndexError and aborting the run.
xml_df.apply(
    lambda card: (
        OCLC_query(card['title'][0], card['author'][0])
        if card['title'] and card['author']
        else None
    ),
    axis=1,
)

In [None]:
'''This includes a very cut-down version of https://github.com/asl2/PyZ3950 and some code for parsing MARC records which is loosely based on https://pypi.org/project/pymarc/
 
Hopefully you will be able to run the two example queries in the file test_queries.py and work from there. Both of the queries use authentication credentials – you should probably check whether LibCrowds has ever had its own credentials, especially for querying OCLC as I think we do get charged something annually for permission to derive records from them.
 
My cut-down version of the scripts will only do z3950 searches (no other kinds of transactions, e.g. sorting, writing, deleting), and only uses CCL query language. See https://help.oclc.org/Resource_Sharing/Relais_ILL/DiscoverItem/DiscoverItem_Search/020Constructing_CCL_query for information on CCL queries, and note that there are other query languages with the same initials! There is probably still a lot of redundant code and dead ends in there – I haven’t had time yet to work out what everything does.'''

In [None]:
# %%capture cap --no-stderr
# print 'stuff'
# with open('output.txt', 'w') as f:
#     f.write(cap.stdout)

In [None]:
#testing 

In [None]:
# tree = ET.parse('./urdu_cards_pages/1032971/U2_for_Transkribus/page/0001_24567971014.xml')
# root = tree.getroot()
    
# shelfmarks = []
# titles = []
# authors = []
# record = []

# for TextRegion in root.iter(ns+'TextRegion'):
#     if re.search('shelfmark', TextRegion.attrib['custom']):
#         y = TextRegion.find('./{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine/{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextEquiv/{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Unicode')
#         print(y.text)

In [None]:
# `ns` already includes the surrounding braces, so it is prepended as-is;
# wrapping it in a second pair produced a tag that no element could match.
# The explicit '/*' spells out what the trailing slash meant to ElementTree:
# the first child element of a TextLine anywhere below the root.
root.find('.//' + ns + 'TextLine/*')

In [None]:
# Single-card run of the shelfmark / title / author extraction, for checking
# one Transkribus export by hand.
tree = ET.parse('./urdu_cards_pages/1032971/U2_for_Transkribus/page/0001_24567971014.xml')
root = tree.getroot()

shelfmarks = []
titles = []
authors = []
# Keyword looked for in a region's `custom` attribute -> list collecting its text.
fields = {'shelfmark': shelfmarks, 'title': titles, 'author': authors}

# XPath from a TextRegion to the Unicode text of its first TextLine, built
# from `ns` instead of repeating the namespace literal for every field.
unicode_xpath = '/'.join(['.', ns + 'TextLine', ns + 'TextEquiv', ns + 'Unicode'])

for TextRegion in root.iter(ns + 'TextRegion'):
    # A region without a `custom` attribute carries none of the keywords;
    # .get() lets it be skipped instead of raising KeyError.
    custom = TextRegion.attrib.get('custom', '')
    el_text = TextRegion.find(unicode_xpath)
    if el_text is None:
        continue
    for keyword, values in fields.items():
        if keyword in custom:
            # .text is None when the Unicode element is empty; kept as-is.
            values.append(el_text.text)

print('Shelfmark: ', shelfmarks)
print('Title: ', titles)
print('Author: ', authors)