# Process Transkribus xml to Worldcat matched record

This notebook is part of the convert-a-card project, and was originally developed by Giorgia Tolfo, with some refactoring and extension by Harry Lloyd. The nb is designed to:
- Parse author/title/shelfmark from Page xml data exported from automatic transcription of Urdu and Pinyin catalogue cards in Transkribus.
- Query OCLC worldcat for matches.
- Assign a confidence score to these matches.
- If no match found, set up a minimal record using data extracted from the card.
- (Possibly) allow curators to select the best match if multiple matches are returned.

In [2]:
import sys
if '../' not in sys.path:
    sys.path.append('../')
import os
import glob
import re 
import xml.etree.ElementTree as ET
import pandas as pd
import requests
from z3950.PyZ3950 import zoom

In [3]:
# XML namespace of the PAGE schema, already wrapped in braces so it can be
# prefixed directly onto tag names (ElementTree's "{uri}tag" notation).
ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"  # xml namespace

# Urdu drawer 2 from Transkribus.
# The pattern is built with os.path.join so it also matches on non-Windows
# systems (on Windows it yields the same backslash-separated paths as before),
# and the result is sorted because glob returns files in arbitrary order.
cards = sorted(glob.glob(os.path.join('..', 'data', 'raw', 'urdu_cards_pages', 'page', '*.xml')))

In [4]:
# Peek at the first path returned by the glob to confirm the card files were found.
cards[0]

'..\\data\\raw\\urdu_cards_pages\\page\\0001_24567971014.xml'

In [5]:
def extract_labelled_xml(xml: os.PathLike, namespace):
    """Pull the title, author and shelfmark out of one PAGE xml card.

    Each ``TextRegion`` is classified by searching its ``custom`` attribute
    for the words ``shelfmark``, ``title`` and ``author`` (checked in that
    order; the first hit wins). Only the first ``TextLine`` of a region is read.
    Double quotes in the title and author are replaced with single quotes.

    Parameters
    ----------
    xml : path to the PAGE xml file.
    namespace : the xml namespace in ElementTree form, i.e. ``"{uri}"``.

    Returns
    -------
    dict with keys ``card_xml``, ``title``, ``author`` and ``shelfmark``.
    A field is ``None`` when the card has no region carrying that label, or
    when the labelled region holds no transcribed text.
    """
    root = ET.parse(xml).getroot()

    text_line = f'./{namespace}TextLine/{namespace}TextEquiv/{namespace}Unicode'

    # Start from None so a card missing one of the regions still yields a
    # complete record instead of raising UnboundLocalError.
    found = {'shelfmark': None, 'title': None, 'author': None}

    for tr in root.iter(namespace + 'TextRegion'):
        # Regions without a `custom` attribute carry no label; skip them.
        label = tr.attrib.get('custom', '')

        for field in ('shelfmark', 'title', 'author'):
            if not re.search(field, label):
                continue

            node = tr.find(text_line)
            if node is not None and node.text is not None:
                text = node.text
                if field != 'shelfmark':
                    text = text.replace("\"", "\'")
                found[field] = text
            # A region contributes to at most one field.
            break

    record = {
        'card_xml': xml,
        'title': found['title'],
        'author': found['author'],
        'shelfmark': found['shelfmark'],
    }

    return record

In [6]:
# Parse every card into a dict of card_xml / title / author / shelfmark.
records = [extract_labelled_xml(c, namespace=ns) for c in cards]

# Preview a handful of records rather than dumping the whole list into the output.
records[:5]

[{'card_xml': '..\\data\\raw\\urdu_cards_pages\\page\\0001_24567971014.xml',
  'title': 'KAUL-I-TAIYIB',
  'author': 'BARNI (Muhammad Ilyas), Maulana, M.A., LL.B.',
  'shelfmark': '14115. e. 72'},
 {'card_xml': '..\\data\\raw\\urdu_cards_pages\\page\\0002_24567971594.xml',
  'title': 'AL-KAUKABAT AL-SHAHHABIYAH FI KUFARIYAT-I-ABI',
  'author': 'BARELAWI (Muhammad Abmad Riza Khan),',
  'shelfmark': '14105. c. 33'},
 {'card_xml': '..\\data\\raw\\urdu_cards_pages\\page\\0003_24567971784.xml',
  'title': "KASRAT-I-ABKDI KA 'ILAJ",
  'author': "SIDDIEI (Na'Im)",
  'shelfmark': '14106. ggg. 17'},
 {'card_xml': '..\\data\\raw\\urdu_cards_pages\\page\\0004_24567971924.xml',
  'title': 'KASHMIR SAZISH CASE',
  'author': 'SRINAGAR.',
  'shelfmark': '14119. a. 104'},
 {'card_xml': '..\\data\\raw\\urdu_cards_pages\\page\\0005_24567972014.xml',
  'title': 'KASHMIR HAMARA HAI',
  'author': 'KASHFI (Ghulam Ahmad), Mir',
  'shelfmark': '14109. b. 101.'},
 {'card_xml': '..\\data\\raw\\urdu_cards_pages\

In [7]:
# Tabulate the parsed cards, ordered by source file path.
# sort_values keeps the original row labels, so .loc[...] lookups in later
# cells refer to positions in `records`, not to positions in the sorted frame.
cards_df = pd.DataFrame(records).sort_values(by='card_xml')
cards_df

Unnamed: 0,card_xml,title,author,shelfmark
0,..\data\raw\urdu_cards_pages\page\0001_2456797...,KAUL-I-TAIYIB,"BARNI (Muhammad Ilyas), Maulana, M.A., LL.B.",14115. e. 72
1,..\data\raw\urdu_cards_pages\page\0002_2456797...,AL-KAUKABAT AL-SHAHHABIYAH FI KUFARIYAT-I-ABI,"BARELAWI (Muhammad Abmad Riza Khan),",14105. c. 33
2,..\data\raw\urdu_cards_pages\page\0003_2456797...,KASRAT-I-ABKDI KA 'ILAJ,SIDDIEI (Na'Im),14106. ggg. 17
3,..\data\raw\urdu_cards_pages\page\0004_2456797...,KASHMIR SAZISH CASE,SRINAGAR.,14119. a. 104
4,..\data\raw\urdu_cards_pages\page\0005_2456797...,KASHMIR HAMARA HAI,"KASHFI (Ghulam Ahmad), Mir",14109. b. 101.
5,..\data\raw\urdu_cards_pages\page\0006_2456797...,"KASHMIR, ADAB-O-SIKAFAT",GUMMI (Salim Khan),14110. e. 28
6,..\data\raw\urdu_cards_pages\page\0007_2456797...,KASHIF AL-HAKA'IK,"ASAR (Imdad Imam), Saiyid.",14117. a. 166.
7,..\data\raw\urdu_cards_pages\page\0008_2456797...,KASHF al-MAHJUB,"ALI, Hujwiri, surnamed DATA GANJ BAKHSH",14104. f. 103
8,..\data\raw\urdu_cards_pages\page\0009_2456797...,KASA'ID-I- ZAUK,"MUHAMMAD IBRAHIM, called ZAUK",14114. g. 87
9,..\data\raw\urdu_cards_pages\page\0010_2456797...,KANTON KI SEJ,NADAWI (Rashid Akhtar),14112. b. 292.


In [12]:
# df.to_csv('test_with_lxml_new.csv')

In [8]:
# can use print(r) to visually print out a whole MARC record retrieved from worldcat

def OCLC_query(title="", author=""):
    """Search OCLC WorldCat over Z39.50 for records matching a title and author.

    The search string has the form ``ti="<title>" and au="<author>"`` and is
    printed before it is sent. A new connection is opened for every call and is
    always closed again, even if the search raises.

    Parameters
    ----------
    title : title search term; ``None`` is treated as an empty string.
    author : author search term; ``None`` is treated as an empty string.

    Returns
    -------
    dict mapping 0-based position to the returned record, or ``None`` when the
    result set is empty.
    """
    # TODO Connection currently only handles 450 results at once before closing, extend or allow to reopen after 450

    # Double quotes delimit the search terms, so any that appear inside a term
    # are swapped for single quotes (the same substitution the card parser
    # applies to titles and authors).
    title = (title or "").replace("\"", "\'")
    author = (author or "").replace("\"", "\'")

    # NOTE(review): credentials should not live in the notebook. They can now be
    # supplied through the OCLC_USER / OCLC_PASSWORD environment variables; the
    # previous hard-coded values remain as fallbacks so existing runs still work.
    conn = zoom.Connection(
        host='zcat.oclc.org',
        port=210,
        user=os.environ.get('OCLC_USER', '100270667'),
        password=os.environ.get('OCLC_PASSWORD', 'oclccat'),
        databaseName='OLUCWorldCat',
        preferredRecordSyntax='USMARC'
    )
    try:
        q = f'ti="{title}" and au="{author}"'
        print(q)
        query = zoom.Query(q)
        res = conn.search(query)

        if res:
            # Copy the records into a dict while the connection is still open.
            return {i: r for i, r in enumerate(res)}
        return None
    finally:
        conn.close()

In [None]:
# Smoke test of the WorldCat connection: author-only search, so the title term
# is sent as an empty string (OCLC_query's default).
normal_people = OCLC_query(author='Sally Rooney')

In [193]:
# First returned record; OCLC_query keys its result dict by 0-based position.
normal_people[0]

<z3950.Marc.marc_tools.Record at 0x27a901cc970>

In [None]:
# This query returns > 450 results which at the moment is more than one zoom connection allows
# OCLC_query("Madame Bovary", 'Flaubert')

In [187]:
# Preview the first rows of the parsed cards before querying WorldCat.
cards_df.head()

Unnamed: 0,card_xml,title,author,shelfmark
0,..\data\raw\urdu_cards_pages\page\0001_2456797...,KAUL-I-TAIYIB,"BARNI (Muhammad Ilyas), Maulana, M.A., LL.B.",14115. e. 72
1,..\data\raw\urdu_cards_pages\page\0002_2456797...,AL-KAUKABAT AL-SHAHHABIYAH FI KUFARIYAT-I-ABI,"BARELAWI (Muhammad Abmad Riza Khan),",14105. c. 33
2,..\data\raw\urdu_cards_pages\page\0003_2456797...,KASRAT-I-ABKDI KA 'ILAJ,SIDDIEI (Na'Im),14106. ggg. 17
3,..\data\raw\urdu_cards_pages\page\0004_2456797...,KASHMIR SAZISH CASE,SRINAGAR.,14119. a. 104
4,..\data\raw\urdu_cards_pages\page\0005_2456797...,KASHMIR HAMARA HAI,"KASHFI (Ghulam Ahmad), Mir",14109. b. 101.


In [180]:
# Query WorldCat for a single card; .loc[32, ...] selects by row label, not by position.
OCLC_query(cards_df.loc[32,'title'], cards_df.loc[32,'author'])

ti="KHUIBAT-I-KA'ID-I-A'ZAM" and au="JINNAH (Muhammad 'AlI)"


In [11]:
# Run one WorldCat query per card (row-wise apply), then store the outcome in a
# new column: a dict of records when there are matches, None when there are none.
# Each query string is printed by OCLC_query as it is sent.
res = cards_df.apply(
    lambda card: OCLC_query(card['title'], card['author']),
    axis=1,
)
cards_df['worldcat_result'] = res

ti="KAUL-I-TAIYIB" and au="BARNI (Muhammad Ilyas), Maulana, M.A., LL.B."
ti="AL-KAUKABAT AL-SHAHHABIYAH FI KUFARIYAT-I-ABI" and au="BARELAWI (Muhammad Abmad Riza Khan),"
ti="KASRAT-I-ABKDI KA 'ILAJ" and au="SIDDIEI (Na'Im)"
ti="KASHMIR SAZISH CASE" and au="SRINAGAR."
ti="KASHMIR HAMARA HAI" and au="KASHFI (Ghulam Ahmad), Mir"
ti="KASHMIR, ADAB-O-SIKAFAT" and au="GUMMI (Salim Khan)"
ti="KASHIF AL-HAKA'IK" and au="ASAR (Imdad Imam), Saiyid."
ti="KASHF al-MAHJUB" and au="ALI, Hujwiri, surnamed DATA GANJ BAKHSH"
ti="KASA'ID-I- ZAUK" and au="MUHAMMAD IBRAHIM, called ZAUK"
ti="KANTON KI SEJ" and au="NADAWI (Rashid Akhtar)"
ti="KANDID" and au="VOLTAIRE (Frangois Marie Aronet de)"
ti="KAMYAB ZINDAGI KA TASAWWAR" and au="DIL (Anwar), Professor."
ti="KAMAN" and au="MALIUABADI (Na'il)"
ti="KALIM" and au="KAISI, Rampuri."
ti="KALI TASWIR" and au="IBN SAFI"
ti="KALI KALI TABASSUM" and au="SHAHIN (Firozah)"
Extracting 2 starting at 0
ti="KALASIKI ADAB" and au="FARUKI (Abmad), Khwajah."
ti="KAKUL-I-S

In [24]:
# Inspect the row labelled 10 together with its WorldCat result.
cards_df.loc[10]

card_xml           ..\data\raw\urdu_cards_pages\page\0011_2456797...
title                                                         KANDID
author                           VOLTAIRE (Frangois Marie Aronet de)
shelfmark                                              14112. a. 198
worldcat_result                                                 None
Name: 10, dtype: object

In [18]:
# Show only rows with no missing values, which hides cards whose worldcat_result is None.
# The result is not assigned, so cards_df itself is left unchanged.
cards_df.dropna()

Unnamed: 0,card_xml,title,author,shelfmark,worldcat_result
15,..\data\raw\urdu_cards_pages\page\0016_2456797...,KALI KALI TABASSUM,SHAHIN (Firozah),14112. h. 79,{0: =LDR 00890cam 2200253 4500 =001 ocm46...
21,..\data\raw\urdu_cards_pages\page\0022_2456799...,KUCHLI HU'I LASH,IBN SAFI,14112. a. 629,{0: =LDR 01410cam 2200421Ma 4500 =001 ocn81...
31,..\data\raw\urdu_cards_pages\page\0032_2456799...,KHUTUT-I-GHALIB,"GHALIB (Asad Allah), Khan.",14110. c. 53.,{0: =LDR 00719cam 2200241Ma 4500 =001 ocn65...
40,..\data\raw\urdu_cards_pages\page\0041_2456800...,KHETI,MUJIB (Muhammad),14112. aa. 58.,{0: =LDR 01342cam 22003377a 4500 =001 ocm46...
46,..\data\raw\urdu_cards_pages\page\0047_2456800...,KHANDAN,BAT (Raziyah),14112. h. 39,{0: =LDR 00871cam 2200301 a 4500 =001 ocm20...


In [19]:
# Inspect the row labelled 46, one of the cards that returned WorldCat matches.
cards_df.loc[46]

card_xml           ..\data\raw\urdu_cards_pages\page\0047_2456800...
title                                                        KHANDAN
author                                                 BAT (Raziyah)
shelfmark                                               14112. h. 39
worldcat_result    {0: =LDR  00871cam  2200301 a 4500
=001  ocm20...
Name: 46, dtype: object

In [20]:
# print() renders the whole MARC record as text; key 0 is the first returned match.
print(cards_df.loc[46, "worldcat_result"][0])

=LDR  00871cam  2200301 a 4500
=001  ocm20990610#
=003  OCoLC
=005  20230509081726.0
=008  900202s1971####pk############000#1#urd#d
=010  ## $a   72930428 
=029  1# $aAU@$b000028373298
=035  ## $a(OCoLC)20990610
=040  ## $aKQM$beng$cKQM$dCGU$dCUY$dOCLCQ$dOCLCF$dOCLCQ$dOCLCO$dOCLCQ
=050  04 $aPK2200.R4$bK53
=082  04 $a891.4393$bRAZ, Urdu
=100  0# $aRaz̤iyah Baṭ.
=245  10 $aK̲h̲āndān.
=260  ## $aLāhaur :$bĪsṭarn Pablisharz,$c[1971]
=300  ## $a400 pages ;$c18 cm
=336  ## $atext$btxt$2rdacontent
=337  ## $aunmediated$bn$2rdamedia
=338  ## $avolume$bnc$2rdacarrier
=500  ## $aA novel.
=546  ## $aIn Urdu.
=650  #0 $aUrdu fiction.
=650  #7 $aUrdu fiction.$2fast$0(OCoLC)fst01162764
=948  ## $hNO HOLDINGS IN BLX - 13 OTHER HOLDINGS
=994  ## $aZ0$bBLX



In [21]:
# Second returned match (key 1) for the same card, printed in full for comparison.
print(cards_df.loc[46, "worldcat_result"][1])

=LDR  01035cam  2200313 a 4500
=001  ocm45943867#
=003  OCoLC
=005  20230509081726.0
=008  010215s1999####ii############000#f#urdod
=035  ## $a(OCoLC)45943867
=040  ## $aIGA$beng$cIGA$dZQP$dOCLCQ$dOJ4$dOCLCF$dGK8$dOCLCO$dOCLCA$dOCLCQ
=066  ## $c(3$c(4
=082  04 $aFic$bRAZIYAH BAT
=100  0# $6880-01$aRaẓiyah Baṭ.
=245  10 $6880-02$aK̲h̲āndān /$cRaz̤iyah Baṭ.
=260  ## $6880-03$aNaʼī Dihlī :$bAhlūvāliyah Bukḍipo,$c1999.
=300  ## $a336 pages ;$c19 cm
=336  ## $atext$btxt$2rdacontent
=337  ## $aunmediated$bn$2rdamedia
=338  ## $avolume$bnc$2rdacarrier
=500  ## $aNovel.
=546  ## $aIn Urdu.
=650  #0 $aUrdu fiction.
=650  #7 $aUrdu fiction.$2fast$0(OCoLC)fst01162764
=880  0# $6100-01/(3/r$a(3QVjg HJ.(B
=880  10 $6245-02/(3/r$a(3NGfOGf /(B$c(3QVjg HJ.(B
=880  ## $6260-03/(3/r$a(3fFi Ogdi :(B$b(3GgdhhGdjg HcO(4)(3h,(B,$c1999.
=948  ## $hNO HOLDINGS IN BLX - 6 OTHER HOLDINGS
=994  ## $aZ0$bBLX



In [23]:
# Fifth returned match (key 4) for the same card, printed in full for comparison.
print(cards_df.loc[46, "worldcat_result"][4])

=LDR  00699cam  2200253M  4500
=001  ocn976620288
=003  OCoLC
=005  20230509081726.0
=008  740710s1971####pk############000#0#urd#d
=019  ## $a225397538
=035  ## $a(OCoLC)976620288$z(OCoLC)225397538
=040  ## $aCNUTO$beng$cCNUTO$dOCLCQ
=050  #4 $aPK2200 R383$bK4
=050  #4 $aPK2200 .R383K4$bROBA
=082  04 $a891.4393$qOCoLC
=100  1# $aRaz̤iyah Baṭ
=245  10 $aKhāndān.
=260  ## $aLahore$bEastern Publishers$c[1971]
=300  ## $a400 pages
=336  ## $atext$btxt$2rdacontent
=337  ## $aunmediated$bn$2rdamedia
=338  ## $avolume$bnc$2rdacarrier
=948  ## $hNO HOLDINGS IN BLX - 2 OTHER HOLDINGS
=994  ## $aZ0$bBLX



In [None]:
'''This includes a very cut-down version of https://github.com/asl2/PyZ3950 and some code for parsing MARC records which is loosely based on https://pypi.org/project/pymarc/
 
Hopefully you will be able to run the two example queries in the file test_queries.py and work from there. Both of the queries use authentication credentials – you should probably check whether LibCrowds has ever had its own credentials, especially for querying OCLC as I think we do get charged something annually for permission to derive records from them.
 
My cut-down version of the scripts will only do z3950 searches (no other kinds of transactions, e.g. sorting, writing, deleting), and only uses CCL query language. See https://help.oclc.org/Resource_Sharing/Relais_ILL/DiscoverItem/DiscoverItem_Search/020Constructing_CCL_query for information on CCL queries, and note that there are other query languages with the same initials! There is probably still a lot of redundant code and dead ends in there – I haven’t had time yet to work out what everything does.'''

In [None]:
# %%capture cap --no-stderr
# print 'stuff'
# with open('output.txt', 'w') as f:
#     f.write(cap.stdout)

In [None]:
#testing 

In [None]:
# tree = ET.parse('./urdu_cards_pages/1032971/U2_for_Transkribus/page/0001_24567971014.xml')
# root = tree.getroot()
    
# shelfmarks = []
# titles = []
# authors = []
# record = []

# for TextRegion in root.iter(ns+'TextRegion'):
#     if re.search('shelfmark', TextRegion.attrib['custom']):
#         y = TextRegion.find('./{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine/{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextEquiv/{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Unicode')
#         print(y.text)

In [None]:
# Locate the first TextLine anywhere below the document root.
# `ns` already includes its surrounding braces, so it is prefixed as-is;
# wrapping it in a second pair builds a tag name that can never match.
# `root` is created by the ET.parse(...).getroot() cell below, which must be run first.
root.find(f'.//{ns}TextLine')

In [None]:
# Scratch check: parse one card by hand and collect every labelled text it contains.
# NOTE(review): this path differs from the directory globbed into `cards`; confirm it still exists.
tree = ET.parse('./urdu_cards_pages/1032971/U2_for_Transkribus/page/0001_24567971014.xml')
root = tree.getroot()

# Path from a TextRegion to the transcription of its first TextLine.
# `ns` already holds the braces-wrapped PAGE namespace, so this expands to the
# same fully-qualified path that was previously written out three times.
unicode_path = f'./{ns}TextLine/{ns}TextEquiv/{ns}Unicode'

shelfmarks = []
titles = []
authors = []
record = []  # not used in this cell; kept in case another cell relies on the name

for TextRegion in root.iter(ns+'TextRegion'):
    # Regions without a `custom` attribute carry no label, so nothing matches them.
    label = TextRegion.attrib.get('custom', '')

    # The three checks are independent: a region whose label mentions more
    # than one keyword is added to every matching list.
    for keyword, collected in (('shelfmark', shelfmarks), ('title', titles), ('author', authors)):
        if re.search(keyword, label):
            el = TextRegion.find(unicode_path)
            if el is not None:
                collected.append(el.text)

print('Shelfmark: ', shelfmarks)
print('Title: ', titles)
print('Author: ', authors)