In [1]:
import pandas as pd
import lxml
import glob
from lxml import etree
from bs4 import BeautifulSoup
import re 
import requests
from urllib.parse import quote
import sys


# Namespace URI of the PAGE (2013-07-15) page-content schema, wrapped in braces
# (the "{uri}tag" form lxml/ElementTree use for qualified tag names).
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"
# Glob pattern selecting the per-card XML files to process; expanded with glob.glob().
cards = './urdu_cards_pages/1032971/U2_for_Transkribus/page/*.xml'

In [2]:
# Select the sample of card files to process (currently just the first one).
# glob.glob() returns matches in arbitrary, file-system dependent order, so the
# paths are sorted before slicing; otherwise "the first card" could be a
# different file on another machine or after the directory changes.
cards_samples = sorted(glob.glob(cards))[:1]

cards_samples

['./urdu_cards_pages/1032971/U2_for_Transkribus/page/0024_24567998184.xml']

In [4]:
# Accumulators consumed by the export cell further down: one entry per card,
# all five lists are kept the same length so they can become DataFrame columns.
metadata = pd.DataFrame()
shelfmarks = []
authors = []
titles = []
query = []
query_results = []

RESULTS_PER_PAGE = 10  # step used by the '&start=' offsets of the result pages
MAX_RESULT_PAGES = 10  # crawl at most 10 pages, i.e. 100 results per card
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests.get() can block forever


def _region_text(card_soup, region):
    """Return the text of the first line of a transcribed card region.

    Walks TextRegion -> TextLine -> TextEquiv -> Unicode (direct children
    only) below the first TextRegion whose ``custom`` attribute contains
    ``region``.

    Raises:
        IndexError: if the region, or any level below it, is missing.
    """
    node = card_soup.select('TextRegion[custom*=' + region + ']')[0]
    for tag_name in ('TextLine', 'TextEquiv', 'Unicode'):
        node = node.find_all(tag_name, recursive=False)[0]
    return node.get_text()


def _parse_result_count(search_soup):
    """Read the total number of hits from a search results page.

    Returns:
        A ``(display_text, count)`` tuple. ``display_text`` is the number as
        shown on the page (e.g. ``'48,339'``). When the results banner is
        absent or not numeric, ``('NO RESULTS', 0)`` is returned instead of
        raising, so a card without hits does not abort the whole run.
    """
    try:
        raw = (search_soup.select('div[class=resultsinfo]')[0]
               .find_all('td')[0]
               .find_all('strong')[1]
               .get_text())
        return raw, int(float(raw.replace(',', '')))
    except (IndexError, ValueError):
        return 'NO RESULTS', 0


def _result_page_urls(base_url, n_results):
    """Build the list of result-page URLs to crawl for one query.

    The first page is ``base_url`` itself; following pages use 1-based
    ``start`` offsets (11, 21, ...). The page count is rounded up so the last
    partial page is included, and capped at ``MAX_RESULT_PAGES``.
    """
    n_pages = min(-(-n_results // RESULTS_PER_PAGE), MAX_RESULT_PAGES)  # ceil division
    following = [
        base_url + '&start=' + str(i * RESULTS_PER_PAGE + 1) + '&qt=page_number_link'
        for i in range(1, n_pages)
    ]
    return [base_url] + following


for card in cards_samples:
    print(card)

    # Read as bytes so the XML parser applies the encoding declared in the
    # file instead of the platform's default text encoding.
    with open(card, 'rb') as file:
        card_soup = BeautifulSoup(file, 'xml')

    shelfmark = _region_text(card_soup, 'shelfmark')
    author = _region_text(card_soup, 'author')
    title = _region_text(card_soup, 'title')

    # The surname is everything before the first comma of the author heading.
    surname = author.split(',')[0]
    surname_encoded = quote(surname)

    #INITIAL QUERY
    url = 'https://www.worldcat.org/search?q=au%3A'+surname_encoded+'&qt=advanced&dblist=638'
    print(url)
    first_page = requests.get(url, timeout=REQUEST_TIMEOUT)
    first_soup = BeautifulSoup(first_page.content, 'html.parser')

    #CHECK NUMBER OF RESULTS
    results_info, n_results = _parse_result_count(first_soup)

    #SAVE METADATA
    # Appended together, after everything that can fail for this card, so the
    # five lists never end up with different lengths.
    shelfmarks.append(shelfmark)
    authors.append(author)
    titles.append(title)
    query.append(surname)
    query_results.append(n_results)

    #CONSTRUCT URLS
    urls = _result_page_urls(url, n_results)

    print('===========')
    print("CARD TRANSCRIPTION")
    print('Shelfmark = '+ shelfmark)
    print('Author = '+ author)
    print('Title = '+ title)
    print()
    print("**PARAMETERS USED ON WORLDCAT**")
    print('Surname: ' + surname)
    print('N. results: ' + str(results_info))
    print('===========')
    print()
    print("RESULTS (limited to max 100):")

    n = 1

    for page_index, page_url in enumerate(urls):
        if page_index == 0:
            # The first page was already downloaded for the result count.
            results_soup = first_soup
        else:
            results_page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            results_soup = BeautifulSoup(results_page.content, 'html.parser')

        for result in results_soup.select('a[id*=result]'):
            href = str(result.get('href', ''))
            title_tag = result.find('strong')
            oclc_match = re.search(r'[0-9]{6,}', href)
            if title_tag is None or oclc_match is None:
                # Anchor matched the selector but is not a record link
                # (no title or no OCLC number in its href): skip it.
                continue

            oclc_id = oclc_match.group(0)
            print(n)
            print('Title: '+ title_tag.text)
            print('ocloc_id: '+ oclc_id)
            print('URL: '+ 'http://www.worldcat.org/oclc/'+oclc_id)
            print('===========')
            print()
            n +=1
                
                




./urdu_cards_pages/1032971/U2_for_Transkribus/page/0024_24567998184.xml
https://www.worldcat.org/search?q=au%3A%27ABD%20al-RAHMAN&qt=advanced&dblist=638
CARD TRANSCRIPTION
Shelfmark = 14117. bbb. 24.
Author = 'ABD al-RAHMAN, Hafiz, Amritsari.
Title = KITAB AL-SARF

**PARAMETERS USED ON WORLDCAT**
Surname: 'ABD al-RAHMAN
N. results: 48,339

RESULTS (limited to max 100):
1
Title: Una crónica anónima de Abd al-Rahman III al Nasir
ocloc_id: 318446008
URL: http://www.worldcat.org/oclc/318446008

2
Title: مختصر سيدي عبد الرحمن الأخضري في العبادات على مذهب الإمام مالك. الجزء الأول /
Mukhtaṣar Sayyidī 'Abd al-Raḥmān al-Akhḍarī fī al-ʻibādāt ʻalà madhhab al-Imām Mālik. al-juz' al-awwal
ocloc_id: 1143686163
URL: http://www.worldcat.org/oclc/1143686163

3
Title: عمل اليوم والليلة. الجزء الأول /
'Amal al-yawm wa-al-laylah. al-juzʻ al-awwal
ocloc_id: 1143689551
URL: http://www.worldcat.org/oclc/1143689551

4
Title: تلبيس ابليس /
Talbīs Iblīs
ocloc_id: 1011502168
URL: http://www.worldcat.

KeyboardInterrupt: 

In [12]:
# Attach the per-card lists collected above as columns of `metadata`
# (in this order), then write the table to disk without the index column.
column_values = (
    ('shelfmark', shelfmarks),
    ('author', authors),
    ('title', titles),
    ('query', query),
    ('query_results', query_results),
)
for column_name, values in column_values:
    metadata[column_name] = values

metadata.to_csv('metadata.csv', index=False)