In [25]:
import sqlite3
import pandas as pd
import requests
from tqdm import tqdm

In [3]:
# The connection is only needed while the query runs, so close it as soon as
# the result is in memory instead of leaving the database file open for the
# lifetime of the kernel.
conn = sqlite3.connect('../database/ceneton.db')
try:
    # Fetch the two number columns plus the `http` column (aliased to `url`)
    # for every row whose `http` value is not null.
    df = pd.read_sql_query(
        "SELECT Nummer, Nummering, http as url FROM ceneton2025_fmp12 WHERE http is not null",
        conn,
    )
finally:
    conn.close()
print(df)

             nummer nummering                              url
0     Ceneton000080    000080            ../Renaissance/ALFAPS
1     Ceneton000100    000100               A_Tr_Welstandt1660
2     Ceneton000110    000110                    AVHAsotus1650
3     Ceneton000150    000150                AcketClarinde1700
4     Ceneton000250    000250   AelhuisenAranEtTitusJanVos1658
...             ...       ...                              ...
1051  Ceneton119971    119971             CruythofkenDelft1600
1052  Ceneton119971    119971           TafelspelenOK061217UBA
1053  Ceneton120151    120151                   JandePlug1730a
1054  Ceneton120460    120460  VierNieuweTafelSpeelen1608@Trou
1055  Ceneton120671    120671                    Wijnkanne1647

[1056 rows x 3 columns]


In [23]:
root_url = "https://www.let.leidenuniv.nl/Dutch/Ceneton/"

def download_transcript(ceneton_id, timeout=10):
    """Probe a transcript page with an HTTP HEAD request and describe the response.

    Despite its name, this function does not fetch the page body: it only
    issues a HEAD request and collects metadata from the response.

    Parameters
    ----------
    ceneton_id : str
        Page identifier appended to ``root_url``. Surrounding whitespace is
        stripped and ``.html`` is appended unless the URL already ends with it.
    timeout : float or tuple, optional
        Forwarded to ``requests.head`` so that an unresponsive server cannot
        block the caller indefinitely. Defaults to 10 seconds.

    Returns
    -------
    dict
        Always contains ``status`` and ``url``.

        * When the status code is 200 the dict also contains ``etag``,
          ``content_length``, ``content_type`` and ``last_modified`` taken
          from the response headers (``None`` for any header that is absent).
        * When the request itself fails (timeout, connection error, ...),
          ``status`` is ``None``, ``url`` is the URL that was requested and
          ``error`` holds the exception message, so a batch run can record
          the failure and carry on.
    """
    ceneton_id = ceneton_id.strip()
    url = f"{root_url}{ceneton_id}"
    if not url.endswith(".html"):
        url = f"{url}.html"

    try:
        response = requests.head(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure as data instead of raising, so one bad host or
        # a dropped connection does not discard an entire crawl.
        return dict(status=None, url=url, error=str(exc))

    status_code = response.status_code
    # response.url is the URL as requests actually sent it, which may differ
    # from `url` once relative segments such as "../" have been resolved.
    data = dict(status=status_code, url=response.url)
    if status_code == 200:
        data['etag'] = response.headers.get('ETag')
        data['content_length'] = response.headers.get('Content-Length')
        data['content_type'] = response.headers.get('Content-Type')
        data['last_modified'] = response.headers.get('Last-Modified')
    return data

# Try the helper on a single id. This id starts with "../", so the resulting
# URL points outside the Ceneton/ directory that root_url ends with.
download_transcript("../Renaissance/ALFAPS")

{'status': 200,
 'url': 'https://www.let.leidenuniv.nl/Dutch/Renaissance/ALFAPS.html',
 'etag': '"3265d-5288148279880-gzip"',
 'content_length': None,
 'content_type': 'text/html',
 'last_modified': 'Mon, 04 Jan 2016 12:30:10 GMT'}

In [26]:
# Probe each distinct url value once, even when several rows share it.
urls = list(df["url"].unique())
print(f"Indexing {len(urls)} URLs")

records = []
progress = tqdm(urls)
for transcript_id in progress:
    # Show the id currently being checked next to the progress bar.
    progress.set_description(transcript_id)
    # Keep the original id alongside the response metadata it produced.
    records.append({**download_transcript(transcript_id), "id": transcript_id})

# One row per probed id.
all_data = pd.DataFrame(records)
all_data


Indexing 901 URLs


AugustijnenBalthassar1728%|▎         | 30/901 [00:02<01:19, 10.89it/s]         
AugustijnenBalthassar1728 [00:02<01:19, 10.89it/s]
Wijnkanne1647: 100%|██████████| 901/901 [01:23<00:00, 10.75it/s]                                    


Unnamed: 0,status,url,etag,content_length,content_type,last_modified,id
0,200,https://www.let.leidenuniv.nl/Dutch/Renaissanc...,"""3265d-5288148279880-gzip""",,text/html,"Mon, 04 Jan 2016 12:30:10 GMT",../Renaissance/ALFAPS
1,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/A_...,"""f8aa-5b92c73706a80-gzip""",18212,text/html,"Mon, 18 Jan 2021 13:10:50 GMT",A_Tr_Welstandt1660
2,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/AV...,"""d662-5b92c73336180-gzip""",19808,text/html,"Mon, 18 Jan 2021 13:10:46 GMT",AVHAsotus1650
3,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/Ac...,"""27c82-5b92c73706a80-gzip""",50849,text/html,"Mon, 18 Jan 2021 13:10:50 GMT",AcketClarinde1700
4,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/Ae...,"""3d30e-5cd81b2ee7300-gzip""",54523,text/html,"Mon, 04 Oct 2021 07:17:00 GMT",AelhuisenAranEtTitusJanVos1658
...,...,...,...,...,...,...,...
896,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/Vo...,"""8f4d-5b92d5ff34a80-gzip""",11441,text/html,"Mon, 18 Jan 2021 14:16:58 GMT",VoorburghsSnoepreysje1690
897,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/Vo...,"""9149-5c9b3a25ebd00-gzip""",11491,text/html,"Mon, 16 Aug 2021 21:10:44 GMT",VoorburghsSnoepreysje1690ca
898,200,https://www.let.leidenuniv.nl/Dutch/Ceneton/Ja...,"""921b-5b92c89412200-gzip""",9303,text/html,"Mon, 18 Jan 2021 13:16:56 GMT",JandePlug1730a
899,404,https://www.let.leidenuniv.nl/Dutch/Ceneton/Vi...,,,,,VierNieuweTafelSpeelen1608@Trou


In [27]:
# Persist the collected response metadata as CSV; index=False leaves the
# DataFrame's row index out of the file.
all_data.to_csv("../database/transcripts.csv", index=False)