In [421]:
import glob
import re
import json
import os
import xml.etree.ElementTree as ET 

from pathlib import Path
import uuid
import ftfy


In [419]:
!pip install ftfy

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1


In [167]:
corpus_names_dict = {    
    "disco2_1": "Disco V2.1",
    "disco3": "Disco V3",
    "adso": "Sonetos Siglo de Oro",
    "adso100": "ADSO 100 poems corpus",
    "plc": "Poesía Lírica Castellana Siglo de Oro",
    "gongo": "Gongocorpus",
    "ecpa": "Eighteenth Century Poetry Archive",
    "4b4v": "For Better For Verse",
    "mel": "Métrique en Ligne",
    "bibit": "Biblioteca Italiana",
    "czverse": "Corpus of Czech Verse",
    "stichopt": "Stichotheque Portuguese",
}

In [405]:
uuid.uuid4().hex

'3fb352cb2ee342baa9e6db1284be33f5'

In [390]:
corpora_list = list(corpus_names_dict.keys())

In [350]:
for corpus in corpora_list:
    p = Path('corpora') / corpus / "averell" / "parser"
    poem_path_list = p.glob("**/*.json")

In [422]:
ftfy.fix_text(author)

'Gonçalves Dias'

In [383]:
filename_set

{'caca'}

In [384]:
filename_list = ["caca", "culo", "caca"]

In [386]:
filename_list.count("sdasdas")

0

In [417]:
filename_list = []
for corpus in corpora_list:
    p = Path('corpora') / corpus / "averell" / "parser"
    poem_path_list = p.glob("**/*.json")
    for poem_path in poem_path_list:
        
        with open(poem_path, "r") as poem_file:
            poem = json.load(poem_file)
        file_name = poem_path.stem
        filename_list.append(file_name)
        poem_title = poem["poem_title"]
        author = poem["author"]
        corpus_name = corpus_names_dict[poem["corpus"]]
        manually_checked = poem["manually_checked"]

        poem_id = f"{author}_{file_name}"
        
        root = ET.Element("TEI")
        header = ET.SubElement(root, "teiHeader")

        file_desc = ET.SubElement(header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        title_stmt_desc = ET.SubElement(title_stmt, "title")
        author_stmt_desc = ET.SubElement(title_stmt, "author")
        title_stmt_desc.text = title
        author_stmt_desc.text = author

        extent = ET.SubElement(file_desc, "extent")


        pub_stmt = ET.SubElement(file_desc, "publicationStmt")
        publisher = ET.SubElement(pub_stmt, "publisher")
        publisher.text = "UNED University"
        idno = ET.SubElement(pub_stmt, "idno")
        idno.text = poem_id
        availability = ET.SubElement(pub_stmt, "availability")
        availability.attrib["status"] = "free"
        p = ET.SubElement(availability, "p")
        p.text = "The text is freely available."

        series_stmt = ET.SubElement(file_desc, "seriesStmt")
        title_series = ET.SubElement(series_stmt, "title")
        title_series.text = corpus_name

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        bibl_source = ET.SubElement(source_desc, "bibl")
        bibl_title = ET.SubElement(bibl_source, "title")
        bibl_title.text = title
        bibl_author = ET.SubElement(bibl_source, "author")
        bibl_author.text = author

        lg_main = ET.SubElement(root, "lg")
        lg_main.attrib["xmlns"] = "http://www.tei-c.org/ns/1.0"
        lg_main.attrib["type"] = "poem"

        measure_st = ET.SubElement(extent, "measure")
        measure_st.attrib["unit"] = "stanza"
        measure_st.text = str(len(poem["stanzas"]))
        n_lines = 0

        for stanza in poem["stanzas"]:

            n_lines += len(stanza["lines"])

            stanza_number = stanza["stanza_number"]
            stanza_type = stanza.get("stanza_type")
            lg = ET.SubElement(lg_main, "lg")
            lg.attrib["n"] = str(stanza_number)
            if stanza_type:
                lg.attrib["stanza_type"] = stanza_type
            for line in stanza["lines"]:
                l = ET.SubElement(lg, "l")
                l.text = line["line_text"]
                l.attrib["n"] = str(line["line_number"])

                metrical_pattern = line.get("metrical_pattern")
                rhyme = line.get("rhyme")
                line_length = line.get("line_length")

                if metrical_pattern:
                    l.attrib["met"] = str(metrical_pattern)
                if rhyme:
                    l.attrib["rhyme"] = str(rhyme)
                if line_length:
                    l.attrib["line_length"] = str(line_length)

        measure_l = ET.SubElement(extent, "measure")
        measure_l.attrib["unit"] = "line"
        measure_l.text = str(n_lines)
        tree = ET.ElementTree(root)

        #output_path = Path('corpora') / f'{poem["corpus"]}' / 'averell' / 'TEI'
        output_base_path = Path('corpora') / f'{poem["corpus"]}'
        output_path = output_base_path / 'TEI'
        

        #prefix = '{:05d}'.format(filename_list.count(file_name))
        output_file = f"{poem_id}.xml"
            
        if not os.path.exists(output_base_path):
            Path.mkdir(output_base_path)
        if not os.path.exists(output_path):
            Path.mkdir(output_path)
        ET.indent(tree,space=" ", level=0)

        tree.write(f"{Path(output_path) / output_file}", encoding = "UTF-8", xml_declaration = True) 

In [418]:
poem_path

PosixPath('corpora/stichopt/averell/parser/goncalves-dias/canto-primeiro.json')

In [326]:
ET.indent(tree,space=" ", level=0)
ET.dump(tree)

<TEI>
 <teiHeader>
  <fileDesc>
   <titleStmt>
    <title>A la dina dana dina, la dina dana,</title>
    <author>San Juan de la Cruz</author>
   </titleStmt>
   <extent>
    <measure unit="stanza">1</measure>
    <measure unit="line">16</measure>
   </extent>
   <publicationStmt>
    <publisher>UNED University</publisher>
    <idno>plc-no-llora-por-haberle-amor</idno>
    <availability status="free">
     <p>The text is freely available.</p>
    </availability>
   </publicationStmt>
   <seriesStmt>
    <title>Poesía Lírica Castellana Siglo de Oro</title>
   </seriesStmt>
   <sourceDesc>
    <bibl>
     <title>A la dina dana dina, la dina dana,</title>
     <author>San Juan de la Cruz</author>
    </bibl>
   </sourceDesc>
  </fileDesc>
 </teiHeader>
 <lg xmlns="http://www.tei-c.org/ns/1.0" type="poem">
  <lg n="1" stanza_type="...">
   <l n="1" met="+---+-+-+-">No llora por haberle amor llagado,</l>
   <l n="2" met="-+-+-+-+-+-">que no se pena en verse así afligido,</l>
   <l n="3" met=

In [8]:
def get_whole_poem(poem_title):
    poem_items = []
    for poem_item in corpus_json:
        if poem_item["poem_title"] == poem_title:
            poem_items.append(poem_item)
    return poem_items

In [9]:
poem_corpus_json = {}
for idx, poem_title in enumerate(titles):
    poem_items = get_whole_poem(poem_title)
    poem_corpus_json[idx] = poem_items

In [10]:
poem = poem_corpus_json[0]

In [11]:
len(poem)

2

In [12]:
<teiHeader>
 <fileDesc>
  <titleStmt>
   <title>
<!-- title of the resource -->
   </title>
  </titleStmt>
  <publicationStmt>
   <p>
<!-- Information about distribution of the resource -->
   </p>
  </publicationStmt>
  <sourceDesc>
   <p>
<!-- Information about source from which the resource derives -->
   </p>
  </sourceDesc>
 </fileDesc>
</teiHeader>
bibliography

SyntaxError: invalid syntax (2695188288.py, line 1)

In [13]:
stanza_number = 1
root = ET.Element("TEI")
#header = ET.SubElement(root, "teiHeader")
header = ET.SubElement(root, "teiHeader")
title_stmt = ET.SubElement(header, "titleStmt")
title_stmt_desc = ET.SubElement(title_stmt, "title")


lg_main = ET.SubElement(root, "lg")
lg_main.attrib["xmlns"] = "http://www.tei-c.org/ns/1.0"
lg_main.attrib["type"] = "poem"
lg = ET.SubElement(lg_main, "lg")
lg.attrib["n"] = "1"

for line in poem:
    if int(line["stanza_number"]) > stanza_number:
        stanza_number+=1
        lg = ET.SubElement(lg_main, "lg")
        lg.attrib["n"] = str(stanza_number)
    l = ET.SubElement(lg, "l")
    l.text = line["line_text"]
    l.attrib["n"] = str(line["line_number"])
    tree = ET.ElementTree(root)
    #print(line["line_number"], line["line_text"], line["metrical_pattern"])

In [17]:
ET.indent(tree,space=" ", level=0)
ET.dump(tree)

<TEI>
 <teiHeader>
  <titleStmt>
   <title />
  </titleStmt>
 </teiHeader>
 <lg xmlns="http://www.tei-c.org/ns/1.0" type="poem">
  <lg n="1">
   <l n="1">A este que admiramos en luciente,</l>
   <l n="2">émulo del diamante, limpio acero,</l>
  </lg>
 </lg>
</TEI>


In [25]:
tree.write("output.xml", encoding = "UTF-8", xml_declaration = True) 

In [21]:
for key, value in poem_corpus_dict.items():
    title = value["poem"]
    location = value["location"]
    lines = value["poem"].strip().split("\n")
    
    
    root = ET.Element("TEI")

    header = ET.SubElement(root, "teiHeader")

    filedesc = ET.SubElement(header, "fileDesc")
    titlestmt = ET.SubElement(filedesc, "titleStmt")
    tei_title = ET.SubElement(titlestmt, "title")
    tei_title.text = title

    sourcedesc = ET.SubElement(filedesc, "sourceDesc")
    bibl = ET.SubElement(sourcedesc, "bibl")
    col_author = ET.SubElement(bibl, "author")
    col_author.text = COLLECTION_AUTHOR
    
    col_title = ET.SubElement(bibl, "title")
    col_title.text = COLLECTION_TITLE
    
    col_place = ET.SubElement(bibl, "pubPlace")
    col_place.text = COLLECTION_PLACE
    
    col_date = ET.SubElement(bibl, "date")
    col_date.text = COLLECTION_DATE
    
    num = ET.SubElement(bibl, "num")
    num.text = str(key)

    notes_stmt = ET.SubElement(header, "notesStmt")
    note = ET.SubElement(notes_stmt, "note")
    note.set("n", "subcollection")
    note.text = subcollection
    
    if isinstance(location, str):
        setting_desc = ET.SubElement(header, "settingDesc")
        name = ET.SubElement(header, "name")
        name.text = location   
        
    text = ET.SubElement(root, "text")
    
    for line in lines:
        if line != "":
            l = ET.SubElement(text, "l")
            l.text = line.strip()
    
    
    tree = ET.ElementTree(root)
    ET.indent(tree,space=" ", level=0)
    
    # Cambiar debugging a True para NO escribir los archivos, y en su lugar mostrarlo en pantalla.
    debugging = True
    
    if debugging:
        ET.dump(tree)
    else:
        tree.write(f"tei/{key}_{title}.xml",encoding="utf-8")

NameError: name 'poem_corpus_dict' is not defined