In [1]:
import opustools

In [2]:
import inspect

In [3]:
# Set up an OPUS query for French ('fr') monolingual ('mono') files.
# list_resources=True asks OpusGet to list resources instead of downloading them.
og = opustools.opus_get.OpusGet(source='fr', preprocess='mono', list_resources=True)

In [4]:
# Show the files matching the query; the recorded output lists a size and a URL per file.
og.get_files()

   1 MB https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/fr.tok.gz
   1 MB https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/fr.txt.gz
   6 MB https://object.pouta.csc.fi/OPUS-Books/v1/mono/fr.tok.gz
   7 MB https://object.pouta.csc.fi/OPUS-Books/v1/mono/fr.txt.gz
 157 MB https://object.pouta.csc.fi/OPUS-DGT/v2019/mono/fr.tok.gz
  32 KB https://object.pouta.csc.fi/OPUS-DGT/v2019/mono/fr.txt.gz
  89 MB https://object.pouta.csc.fi/OPUS-ECB/v1/mono/fr.tok.gz
  89 MB https://object.pouta.csc.fi/OPUS-ECB/v1/mono/fr.txt.gz
  22 MB https://object.pouta.csc.fi/OPUS-EhuHac/v1/mono/fr.tok.gz
  22 MB https://object.pouta.csc.fi/OPUS-EhuHac/v1/mono/fr.txt.gz
  22 MB https://object.pouta.csc.fi/OPUS-EMEA/v3/mono/fr.tok.gz
  22 MB https://object.pouta.csc.fi/OPUS-EMEA/v3/mono/fr.txt.gz
 708 MB https://object.pouta.csc.fi/OPUS-EUbookshop/v2/mono/fr.tok.gz
 708 MB https://object.pouta.csc.fi/OPUS-EUbookshop/v2/mono/fr.txt.gz
 248 KB https://object.pouta.csc.fi/OPUS-EUconst/v1/mono/fr.tok.gz
 24

In [5]:
# Fetch the per-file metadata records for the query.
# NOTE(review): file_n and total_size are presumably the file count and summed size
# for the unfiltered result; both names are reassigned by the filtering cell further down.
corpora, file_n, total_size = og.get_corpora_data()

In [6]:
# Inspect the returned records: one dict per file, with keys such as 'corpus', 'size' and 'url'.
corpora

[{'alignment_pairs': 39693,
  'corpus': 'bible-uedin',
  'documents': 1,
  'id': 5168,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 1276,
  'source': 'fr',
  'source_tokens': 857843,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/fr.tok.gz',
  'version': 'v1'},
 {'alignment_pairs': 39693,
  'corpus': 'bible-uedin',
  'documents': 1,
  'id': 5169,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 1268,
  'source': 'fr',
  'source_tokens': 857843,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/fr.txt.gz',
  'version': 'v1'},
 {'alignment_pairs': 164916,
  'corpus': 'Books',
  'documents': 29,
  'id': 31362,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 6336,
  'source': 'fr',
  'source_tokens': 3635088,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-Books/v1/mono/fr.tok.gz',
  'version': 'v1'},
 {'alignment_pa

In [7]:
# Keep only the untokenized files. The 'tok' marker is looked for in the file
# name (the last URL segment) rather than anywhere in the URL, so a corpus or
# directory whose name happens to contain "tok" is not dropped by accident.
no_tok = [
    entry for entry in corpora
    if 'tok' not in entry['url'].rsplit('/', 1)[-1]
]

# Recompute the count and the summed size for the filtered selection only.
file_n = len(no_tok)
total_size = sum(entry['size'] for entry in no_tok)

In [8]:
# Inspect the records that survived the filter (only the .txt.gz URLs should remain).
no_tok

[{'alignment_pairs': 39693,
  'corpus': 'bible-uedin',
  'documents': 1,
  'id': 5169,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 1268,
  'source': 'fr',
  'source_tokens': 857843,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/fr.txt.gz',
  'version': 'v1'},
 {'alignment_pairs': 164916,
  'corpus': 'Books',
  'documents': 29,
  'id': 31363,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 6524,
  'source': 'fr',
  'source_tokens': 3635088,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-Books/v1/mono/fr.txt.gz',
  'version': 'v1'},
 {'alignment_pairs': 5048744,
  'corpus': 'DGT',
  'documents': 38630,
  'id': 32564,
  'latest': 'True',
  'preprocessing': 'mono',
  'size': 32,
  'source': 'fr',
  'source_tokens': 115621800,
  'target': '',
  'target_tokens': None,
  'url': 'https://object.pouta.csc.fi/OPUS-DGT/v2019/mono/fr.txt.gz',
  'version': 'v2019'},
 {'alignment_pair

In [9]:
# Summarise the filtered selection: number of files, then the summed 'size' fields
# printed with thousands separators.
# NOTE(review): the sizes look like kilobytes (a 'size' of 1276 is listed as "1 MB",
# and 32 as "32 KB", in the file listing) — confirm before quoting this number.
print(file_n, 'files')
print(f'total size: {total_size:,}')

39 files
total size: 12,441,516


In [10]:
# Download the filtered files. The size string is echoed in the confirmation
# prompt, so give it a unit instead of a bare number: the 'size' fields appear
# to be kilobytes (1276 is listed as "1 MB", 32 as "32 KB" in the file listing).
# NOTE(review): this call asks for interactive (y/n) confirmation.
og.download(no_tok, file_n, f'{total_size:,} KB')

Downloading 39 file(s) with the total size of 12441516. Continue? (y/n)  n


In [11]:
# Dump the source of the opus_get module for reference.
# inspect.getsource already returns a single string, so no join is needed.
print(inspect.getsource(opustools.opus_get))

import urllib.request
import json
import argparse
import sys
import os.path

class OpusGet:

    def __init__(self, source=None, target=None, directory=None,
            release='latest', preprocess='xml', list_resources=False,
            download_dir='.', suppress_prompts=False):
        """Download files from OPUS.

        Keyword arguments:
        source -- Source language
        target -- Target language
        directory -- Corpus directory name
        release -- Corpus release version (default latest)
        preprocess -- Corpus preprocessing type (default xml)
        list_resource -- List resources instead of downloading
        download_dir -- Directory where files will be downloaded (default .)
        suppress_prompts -- Download files without prompting "(y/n)"
        """

        if target != None:
            self.fromto = [source, target]
            self.fromto.sort()

        self.url = 'http://opus.nlpl.eu/opusapi/?'

        urlparts = [(source, 'source'), (tar