In [111]:
# Glob pattern whose matches are passed, as repository paths, to the CTS local
# resolver created further down in this notebook.
CORPORA_PATH = "../../data/raw/corpora/**/*"


In [112]:
import lxml.etree as ET
def get_xslt(citation_scheme):
    """Compile an XSLT stylesheet that flattens a text into its citable references.

    One template is generated per citation level. Every node matched by a
    level emits ``¬<level name>]][[<reference>``, where the reference is the
    node's ``@n`` prefixed, below the first level, with the references of the
    enclosing levels joined by dots. Text nodes are dropped, so the transform
    output can be split on ``¬`` and then on ``]][[``.

    :param citation_scheme: Sized, integer-indexable sequence of citation
        levels; each level exposes ``scope``, ``xpath`` and ``name``.
    :return: The compiled ``ET.XSLT`` transformer.
    """
    # Local stdlib import so the function carries its own dependency.
    from xml.sax.saxutils import escape

    templates = []
    for level in range(len(citation_scheme)):
        citation = citation_scheme[level]
        # Remove the "?" placeholders so the pattern only tests that the
        # attribute exists, and switch to single quotes so the pattern can sit
        # inside a double-quoted match="..." attribute.
        raw_xpath = (
            f"{citation.scope}{citation.xpath}"
            .replace("='?'", "")
            .replace("=\"?\"", "")
            .replace('"', "'")
        )
        # Both values are pasted into XML source: escape "&", "<" and ">" so an
        # unusual level name or XPath cannot make the stylesheet unparsable.
        xpath = escape(raw_xpath)
        name = escape(citation.name or "unknown")

        if level == 0:
            # Top level: the reference is the node's own @n.
            param_decl = ""
            reference = '<xsl:value-of select="@n"/>'
        else:
            # Deeper levels: prefix with the reference received from the parent.
            param_decl = '\n            <xsl:param name="previous" />'
            reference = (
                '<xsl:value-of select="$previous" /><xsl:text>.</xsl:text>'
                '<xsl:value-of select="@n"/>'
            )

        templates.append(f"""
        <xsl:template match="{xpath}">{param_decl}
            <xsl:text>¬</xsl:text>
            <xsl:text>{name}]][[</xsl:text>{reference}
            <xsl:apply-templates select="node()">
                <xsl:with-param name="previous">{reference}</xsl:with-param>
            </xsl:apply-templates>
        </xsl:template>""")

    body = "\n".join(templates)
    # The generic node() template forwards "previous" through elements that are
    # not citation levels; the empty text() template suppresses all text.
    stylesheet = f"""<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        exclude-result-prefixes="xs" xmlns:tei="http://www.tei-c.org/ns/1.0"
        version="1.0">
        <xsl:output method="text" encoding="UTF-8"/>
        <xsl:template match="node()">
            <xsl:param name="previous" />
            <xsl:apply-templates select="node()"><xsl:with-param name="previous" select="$previous" /></xsl:apply-templates>
        </xsl:template>
        <xsl:template match="text()" />
        {body}

    </xsl:stylesheet>"""
    return ET.XSLT(ET.fromstring(stylesheet))

In [113]:
import glob
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver


# recursive=False makes the "**" segment behave like a plain "*", so this lists
# the entries sitting exactly two levels below the corpora directory.
repositories = glob.glob(CORPORA_PATH, recursive=False)
# The resolver indexes every repository path it is given.
resolver = CtsCapitainsLocalResolver(repositories)

../../data/raw/corpora/lascivaroma_additional-texts/lascivaroma_additional-texts/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng4.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi001/phi0448.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi002/phi0448.phi002.perseus-eng2.xml is 

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0690/phi003/phi0690.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1017/phi011/phi1017.phi011.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0660/phi003/phi0660.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1002/phi001/phi1002.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0914/phi001/phi0914.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0917/phi001/phi0917.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0119/phi004/phi0119.phi

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi004/phi1351.phi004.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi002/phi1351.phi002.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0550/phi001/phi0550.phi001.perseus-eng1.xml is not present


In [114]:
import time

# Benchmark: build the stylesheet, run it, and split its output into
# [level name, reference] pairs for every Latin text (five full passes).
start = time.perf_counter()  # monotonic clock intended for measuring durations
texts = 0
for epoch in range(5):
    for doc in resolver.texts:
        if doc.lang == "lat":
            xsl = get_xslt(doc.citation)
            flat = str(xsl(resolver.getTextualNode(doc.id).xml)).strip()
            # Records are introduced by "¬"; the leading empty chunk is skipped.
            out = [reff.split("]][[") for reff in flat.split("¬") if reff]
            texts += 1

# Read the clock once so both figures describe the same interval.
elapsed = time.perf_counter() - start
print(f"Total time: {elapsed:.3f} seconds")
if texts:
    print(f"Time per text: {elapsed / texts:.3f} seconds")
else:
    # Avoid a ZeroDivisionError when the resolver holds no Latin text.
    print("Time per text: n/a (no Latin text found)")
print(f"Docs: {texts}")

Total time: 113.100 seconds
Time per text: 0.030 seconds
Docs: 3790


In [115]:
# Benchmark: collect the references of every citation level through the
# resolver's own getReffs for every Latin text (five full passes).
start = time.perf_counter()  # monotonic clock intended for measuring durations
texts = 0
for epoch in range(5):
    for doc in resolver.texts:
        if doc.lang == "lat":
            # Levels are requested 1-based, hence level + 1.
            citations = [
                reff
                for level in range(doc.citation.depth)
                for reff in resolver.getReffs(textId=doc.id, level=level + 1)
            ]
            texts += 1

# Read the clock once so both figures describe the same interval.
elapsed = time.perf_counter() - start
print(f"Total time: {elapsed:.3f} seconds")
if texts:
    print(f"Time per text: {elapsed / texts:.3f} seconds")
else:
    # Avoid a ZeroDivisionError when the resolver holds no Latin text.
    print("Time per text: n/a (no Latin text found)")
print(f"Docs: {texts}")

Total time: 73.923 seconds
Time per text: 0.020 seconds
Docs: 3790


In [116]:
import time

# Benchmark: build the stylesheet and run it for every Latin text, without
# post-processing the transform output (five full passes).
start = time.perf_counter()  # monotonic clock intended for measuring durations
texts = 0
for epoch in range(5):
    for doc in resolver.texts:
        if doc.lang == "lat":
            xsl = get_xslt(doc.citation)
            data = str(xsl(resolver.getTextualNode(doc.id).xml))
            texts += 1

# Read the clock once so both figures describe the same interval.
elapsed = time.perf_counter() - start
print(f"Total time: {elapsed:.3f} seconds")
if texts:
    print(f"Time per text: {elapsed / texts:.3f} seconds")
else:
    # Avoid a ZeroDivisionError when the resolver holds no Latin text.
    print("Time per text: n/a (no Latin text found)")
print(f"Docs: {texts}")

Total time: 110.086 seconds
Time per text: 0.029 seconds
Docs: 3790


In [117]:
# Retrieve only the nodes that match the citation XPath


def get_xpath(citation):
    """Return the XSLT match pattern for one citation level.

    The level's ``scope`` and ``xpath`` are concatenated, the ``'?'`` / ``"?"``
    value placeholders are removed together with their ``=`` sign, and any
    remaining double quote is turned into a single quote.

    :param citation: Object exposing ``scope`` and ``xpath`` attributes.
    :return: The resulting pattern as a string.
    """
    pattern = "{}{}".format(citation.scope, citation.xpath)
    # Order matters: placeholders are removed before the quotes are normalised.
    substitutions = (
        ("='?'", ""),
        ("=\"?\"", ""),
        ('"', "'"),
    )
    for needle, replacement in substitutions:
        pattern = pattern.replace(needle, replacement)
    return pattern

def get_xslt2(citation_scheme):
    """Compile an XSLT stylesheet that serialises the citation tree of a text.

    One template is generated per citation level. Every matched node emits
    ``{'ref':'<reference>', 'type': '<level name>', 'children':[...]},`` where
    the children are produced by the templates of the nodes nested inside it
    and the reference is the node's ``@n`` prefixed, below the first level,
    with the reference handed down by the enclosing level and a dot.

    :param citation_scheme: Sized, integer-indexable sequence of citation
        levels; each level exposes ``name`` and is accepted by ``get_xpath``.
    :return: The compiled ``ET.XSLT`` transformer.
    """
    # Local stdlib import so the function carries its own dependency.
    from xml.sax.saxutils import escape

    templates = []
    for level in range(len(citation_scheme)):
        citation = citation_scheme[level]
        # Both values are pasted into XML source: escape "&", "<" and ">" so an
        # unusual level name or XPath cannot make the stylesheet unparsable.
        xpath = escape(get_xpath(citation))
        name = escape(citation.name or "unknown")

        if level > 0:
            reference = (
                '<xsl:value-of select="$previous" /><xsl:text>.</xsl:text>'
                '<xsl:value-of select="@n"/>'
            )
        else:
            reference = '<xsl:value-of select="@n"/>'

        # NOTE(review): the emitted text relies on single-quoted literals, so a
        # quote inside a level name or an @n value corrupts it. The output
        # format is deliberately left unchanged here.
        templates.append("".join([
            f'<xsl:template match="{xpath}"><xsl:param name="previous" />',
            "<xsl:text>{'ref':'</xsl:text>",
            reference,
            f"<xsl:text>', 'type': '{name}', 'children':[</xsl:text>",
            '<xsl:apply-templates select="node()"><xsl:with-param name="previous">',
            reference,
            "</xsl:with-param></xsl:apply-templates><xsl:text>]},</xsl:text>",
            "</xsl:template>",
        ]))

    body = "\n        ".join(templates)
    # The generic node() template forwards "previous" through elements that are
    # not citation levels; the empty text() template suppresses all text.
    stylesheet = """<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        exclude-result-prefixes="xs" xmlns:tei="http://www.tei-c.org/ns/1.0"
        version="1.0">
        <xsl:output method="text" encoding="UTF-8"/>
        <xsl:template match="node()">
            <xsl:param name="previous" />
            <xsl:apply-templates select="node()"><xsl:with-param name="previous" select="$previous" /></xsl:apply-templates>
        </xsl:template>
        <xsl:template match="text()" />
        {previous}

    </xsl:stylesheet>""".format(previous=body)
    return ET.XSLT(ET.fromstring(stylesheet))

In [119]:
import time

# Benchmark: run the tree-producing stylesheet and parse its output as a
# Python literal for every Latin text (five full passes).
start = time.perf_counter()  # monotonic clock intended for measuring durations
texts = 0
for epoch in range(5):
    for doc in resolver.texts:
        if doc.lang == "lat":
            xsl = get_xslt2(doc.citation)
            # Drop the last character (the trailing comma of the final record)
            # and wrap the records in a list.
            data = "["+str(xsl(resolver.getTextualNode(doc.id).xml)).strip()[:-1]+"]"
            texts += 1
            # SECURITY NOTE(review): eval() executes whatever expression the
            # corpus-derived text contains. Kept because this cell measures
            # eval itself; outside a benchmark prefer ast.literal_eval.
            elements = eval(data)

# Read the clock once so both figures describe the same interval.
elapsed = time.perf_counter() - start
print(f"Total time: {elapsed:.3f} seconds")
if texts:
    print(f"Time per text: {elapsed / texts:.3f} seconds")
else:
    # Avoid a ZeroDivisionError when the resolver holds no Latin text.
    print("Time per text: n/a (no Latin text found)")
print(f"Docs: {texts}")

Total time: 259.493 seconds
Time per text: 0.068 seconds
Docs: 3790


In [121]:
import time
import json

# Benchmark: run the tree-producing stylesheet and parse its output with
# json.loads for every Latin text (five full passes).
start = time.perf_counter()  # monotonic clock intended for measuring durations
texts = 0
for epoch in range(5):
    for doc in resolver.texts:
        if doc.lang == "lat":
            xsl = get_xslt2(doc.citation)
            # Drop the last character (the trailing comma of the final record)
            # and wrap the records in a list.
            data = "["+str(xsl(resolver.getTextualNode(doc.id).xml)).strip()[:-1]+"]"
            texts += 1
            # Turn the single-quoted output into JSON: swap the quote style and
            # remove the comma left before each closing bracket.
            # NOTE(review): this rewrites every quote, so an apostrophe or a
            # double quote inside a reference makes json.loads fail.
            elements = json.loads(data.replace("'", '"').replace(",]", "]"))

# Read the clock once so both figures describe the same interval.
elapsed = time.perf_counter() - start
print(f"Total time: {elapsed:.3f} seconds")
if texts:
    print(f"Time per text: {elapsed / texts:.3f} seconds")
else:
    # Avoid a ZeroDivisionError when the resolver holds no Latin text.
    print("Time per text: n/a (no Latin text found)")
print(f"Docs: {texts}")

Total time: 246.328 seconds
Time per text: 0.065 seconds
Docs: 3790
