# Main functions

In [19]:
import requests
import zipfile
import csv
import os
import shutil
from io import BytesIO
import glob

TASK_SEPARATOR = "    "
SUBTASK_SEPARATOR = TASK_SEPARATOR * 2
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver


repositories = list(glob.glob("../../data/raw/corpora/**/*", recursive=False))


def download_corpus(tgt, corpus_name, corpus_version):
    """ Download a corpus

    :param tgt: Directory where to download
    :param corpus_name: Corpus Name
    :param corpus_version: Corpus version
    :return: Status
    :rtype: bool
    """
    target_dir = tgt+"/"+corpus_name.replace("/", "_")
    if os.path.isdir(target_dir):
        shutil.rmtree(target_dir)
    print(TASK_SEPARATOR+"Starting download")
    webfile = requests.get("https://github.com/{name}/archive/{version}.zip".format(
        name=corpus_name, version=corpus_version
    ))
    print(TASK_SEPARATOR+"Starting Unzipping")
    with zipfile.ZipFile(BytesIO(webfile.content)) as z:
        z.extractall(target_dir)
    print(TASK_SEPARATOR+"Done")
    return True, target_dir


def download_corpora(src="data/raw/corpora.csv", tgt="data/raw/corpora/", force=False, is_capitains=True):
    with open(src) as src_file:
        corpora = [corpus for corpus in csv.DictReader(src_file, delimiter=";")]
        new_corpora = []
        for corpus in corpora:
            if corpus["Current"] == corpus["Version"] and force is not True:
                print("{} stays on version {}".format(corpus["Name"], corpus["Current"]))
            else:
                print("{}'s version is {}. Downloading {}".format(corpus["Name"], corpus["Current"], corpus["Version"]))
                status, path = download_corpus(tgt, corpus["Name"], corpus["Version"])
                if status is True:
                    corpus["Current"] = corpus["Version"]
                    print(TASK_SEPARATOR+"Cleaning up the corpus")
                    if is_capitains:
                        clean_up_corpora(path)
            new_corpora.append({k: v for k, v in corpus.items()})

    # Update the corpus
    with open(src, "w") as src_file:
        writer = csv.DictWriter(src_file, delimiter=";", fieldnames=["Name", "Version", "Current"])
        writer.writeheader()
        writer.writerows(new_corpora)


def clean_up_corpora(src):
    resolver = CtsCapitainsLocalResolver(glob.glob(src+"/**"))
    translations = [x.path for x in resolver.getMetadata().readableDescendants if x.lang != "lat"]
    for trans in translations:
        try:
            os.remove(trans)
        except Exception as E:
            print(E)
    print(SUBTASK_SEPARATOR+"Removed {} text(s) not in Latin".format(len(translations)))
    print(SUBTASK_SEPARATOR+"Kept {} text(s) in Latin".format(
        len([x for x in resolver.getMetadata().readableDescendants if x.lang == "lat"]))
    )


# Read the versions

In [20]:
repos = {
    
}
with open("../../data/raw/corpora.csv") as f:
    for lineno, line in enumerate(f):
        line = line.strip().split(";")
        if lineno == 0:
            headers = line
        else:
            line = dict(zip(headers, line))
            repos[line["Name"]] = line["Version"]

# Check latest version

In [21]:
import requests

need_update = {
    
}
for repo in repos:
    req = requests.get(f"https://api.github.com/repos/{repo}/releases/latest")
    data = req.json()
    latest = data["tag_name"]
    when = data["created_at"]
    if latest != repos[repo]:
        print(f"{repo} is {latest}, current version on disk {repos[repo]}. Published on {when}")
        need_update[repo] = latest

# Update required versions

In [22]:
repos = []
with open("../../data/raw/corpora.csv") as f:
    for lineno, line in enumerate(f):
        line = line.strip().split(";")
        if lineno == 0:
            headers = line
        else:
            line = dict(zip(headers, line))
            repos.append(line)
            
with open("../../data/raw/corpora.csv", "w") as f:
    f.write(";".join(headers)+"\n")
    for repo in repos:
        if repo["Name"] in need_update:
            repo["Version"] = need_update[repo["Name"]]
        f.write(
            ";".join(
                [repo[key] for key in headers]
            ) + "\n"
        )


with open("../../data/raw/corpora.csv") as f:
    print(f.read())

Name;Version;Current
PerseusDL/canonical-latinLit;0.0.557;0.0.508
OpenGreekAndLatin/csel-dev;1.0.67;1.0.63
lascivaroma/priapeia;1.1.18;1.1.12
lascivaroma/additional-texts;1.0.111;1.0.98
ponteineptique/digiliblt;0.0.32;0.0.32



# Download and clean

In [23]:
download_corpora(
    src="../../data/raw/corpora.csv",
    tgt="../../data/raw/corpora/",
    force=False,
    is_capitains=True
)

PerseusDL/canonical-latinLit's version is 0.0.508. Downloading 0.0.557
    Starting download
    Starting Unzipping


../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0448/phi002/phi0448.phi002.perseus-eng2.xml is not present


    Done
    Cleaning up the corpus


../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi051/phi0474.phi051.perseus-eng1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi056/phi0474.phi056.perseus-eng1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi059/phi0474.phi059.perseus-eng1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi053/phi0474.phi053.perseus-eng1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi055/phi0474.phi055.perseus-eng1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi035/phi0474.phi035.perseus-lat1.xml is not present
../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi058/phi0474.phi058.per

[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0448/phi002/phi0448.phi002.perseus-eng2.xml'
[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi051/phi0474.phi051.perseus-eng1.xml'
[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi056/phi0474.phi056.perseus-eng1.xml'
[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi059/phi0474.phi059.perseus-eng1.xml'
[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi053/phi0474.phi053.perseus-eng1.xml'
[Errno 2] No such file or directory: '../../data/raw/corpora//PerseusDL_canonical-latinLit/canonical-latinLit-0.0.557/data/phi0474/phi055/phi0474

../../data/raw/corpora//lascivaroma_additional-texts/additional-texts-1.0.111/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml is not present


    Starting Unzipping
    Done
    Cleaning up the corpus
[Errno 2] No such file or directory: '../../data/raw/corpora//lascivaroma_additional-texts/additional-texts-1.0.111/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml'
        Removed 1 text(s) not in Latin
        Kept 23 text(s) in Latin
ponteineptique/digiliblt stays on version 0.0.32
