# Main functions

In [1]:
import requests
import zipfile
import csv
import os
import shutil
from io import BytesIO
import glob

TASK_SEPARATOR = "    "
SUBTASK_SEPARATOR = TASK_SEPARATOR * 2
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver


repositories = list(glob.glob("../../data/raw/corpora/**/*", recursive=False))


def download_corpus(tgt, corpus_name, corpus_version):
    """ Download a corpus

    :param tgt: Directory where to download
    :param corpus_name: Corpus Name
    :param corpus_version: Corpus version
    :return: Status
    :rtype: bool
    """
    target_dir = tgt+"/"+corpus_name.replace("/", "_")
    if os.path.isdir(target_dir):
        shutil.rmtree(target_dir)
    print(TASK_SEPARATOR+"Starting download")
    webfile = requests.get("https://github.com/{name}/archive/{version}.zip".format(
        name=corpus_name, version=corpus_version
    ))
    print(TASK_SEPARATOR+"Starting Unzipping")
    with zipfile.ZipFile(BytesIO(webfile.content)) as z:
        z.extractall(target_dir)
    directory_with_version_in_name = os.listdir(target_dir)[0]
    os.rename(
        os.path.join(target_dir, directory_with_version_in_name),
        os.path.join(target_dir, corpus_name.replace("/", "_"))
    )
    with open(
        os.path.join(target_dir, "current_version.txt".format(directory_with_version_in_name)),
        "w"
    ) as f:
        f.write(directory_with_version_in_name)

    print(TASK_SEPARATOR+"Done")
    return True, target_dir


def download_corpora(src="data/raw/corpora.csv", tgt="data/raw/corpora/", force=False, is_capitains=True):
    with open(src) as src_file:
        corpora = [corpus for corpus in csv.DictReader(src_file, delimiter=";")]
        new_corpora = []
        for corpus in corpora:
            if corpus["Current"] == corpus["Version"] and force is not True:
                print("{} stays on version {}".format(corpus["Name"], corpus["Current"]))
            else:
                print("{}'s version is {}. Downloading {}".format(corpus["Name"], corpus["Current"], corpus["Version"]))
                status, path = download_corpus(tgt, corpus["Name"], corpus["Version"])
                if status is True:
                    corpus["Current"] = corpus["Version"]
                    print(TASK_SEPARATOR+"Cleaning up the corpus")
                    if is_capitains:
                        clean_up_corpora(path)
            new_corpora.append({k: v for k, v in corpus.items()})

    # Update the corpus
    with open(src, "w") as src_file:
        writer = csv.DictWriter(src_file, delimiter=";", fieldnames=["Name", "Version", "Current"])
        writer.writeheader()
        writer.writerows(new_corpora)


def clean_up_corpora(src):
    resolver = CtsCapitainsLocalResolver(glob.glob(src+"/**"))
    translations = [x.path for x in resolver.getMetadata().readableDescendants if x.lang != "lat"]
    for trans in translations:
        try:
            os.remove(trans)
        except Exception as E:
            print(E)
    print(SUBTASK_SEPARATOR+"Removed {} text(s) not in Latin".format(len(translations)))
    print(SUBTASK_SEPARATOR+"Kept {} text(s) in Latin".format(
        len([x for x in resolver.getMetadata().readableDescendants if x.lang == "lat"]))
    )


# Read the versions

In [2]:
repos = {
    
}
with open("../../data/raw/corpora.csv") as f:
    for lineno, line in enumerate(f):
        line = line.strip().split(";")
        if lineno == 0:
            headers = line
        else:
            line = dict(zip(headers, line))
            repos[line["Name"]] = line["Version"]

# Check latest version

In [3]:
import requests

need_update = {
    
}
for repo in repos:
    req = requests.get(f"https://api.github.com/repos/{repo}/releases/latest")
    data = req.json()
    latest = data["tag_name"]
    when = data["created_at"]
    if latest != repos[repo]:
        print(f"{repo} is {latest}, current version on disk {repos[repo]}. Published on {when}")
        need_update[repo] = latest

lascivaroma/additional-texts is 1.0.153, current version on disk 1.0.151. Published on 2021-02-18T18:06:34Z


# Update required versions

In [4]:
repos = []
with open("../../data/raw/corpora.csv") as f:
    for lineno, line in enumerate(f):
        line = line.strip().split(";")
        if lineno == 0:
            headers = line
        else:
            line = dict(zip(headers, line))
            repos.append(line)
            
with open("../../data/raw/corpora.csv", "w") as f:
    f.write(";".join(headers)+"\n")
    for repo in repos:
        if repo["Name"] in need_update:
            repo["Version"] = need_update[repo["Name"]]
        f.write(
            ";".join(
                [repo[key] for key in headers]
            ) + "\n"
        )


with open("../../data/raw/corpora.csv") as f:
    print(f.read())

Name;Version;Current
PerseusDL/canonical-latinLit;0.0.752;0.0.752
OpenGreekAndLatin/csel-dev;1.0.128;1.0.128
lascivaroma/priapeia;1.1.18;1.1.18
lascivaroma/additional-texts;1.0.153;1.0.151
ponteineptique/digiliblt;0.0.39;0.0.39
OpenGreekAndLatin/Latin;v1.10.0;v1.10.0



# Download and clean

In [5]:
download_corpora(
    src="../../data/raw/corpora.csv",
    tgt="../../data/raw/corpora/",
    force=False,
    is_capitains=True
)

PerseusDL/canonical-latinLit stays on version 0.0.752
OpenGreekAndLatin/csel-dev stays on version 1.0.128
lascivaroma/priapeia stays on version 1.1.18
lascivaroma/additional-texts's version is 1.0.151. Downloading 1.0.153
    Starting download
    Starting Unzipping
    Done
    Cleaning up the corpus


../../data/raw/corpora//lascivaroma_additional-texts/lascivaroma_additional-texts/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml is not present


[Errno 2] No such file or directory: '../../data/raw/corpora//lascivaroma_additional-texts/lascivaroma_additional-texts/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml'
        Removed 1 text(s) not in Latin
        Kept 100 text(s) in Latin
ponteineptique/digiliblt stays on version 0.0.39
OpenGreekAndLatin/Latin stays on version v1.10.0
