In [1]:
import sys
import pandas as pd
from beakerx.object import beakerx
from beakerx import *
from IPython.display import display #, set_matplotlib_formats

# Prepend the current and parent directories to the import search path,
# skipping any that are already present (presumably so the project-local
# `domain` package imported below can be found -- confirm against the repo layout).
# A tuple is walked in order instead of iterating a set difference, so '.' is
# always placed before '..' rather than in hash-dependent order.
sys.path = [folder for folder in ('.', '..') if folder not in sys.path] + sys.path

import domain.tCoIR.treaty_state

In [2]:
import zipfile
import typing
import re
import fnmatch
import gensim


# Matches a word broken across lines by a trailing hyphen:
# group 1 is the part before the hyphen, group 2 the part after the line break.
HYPHEN_REGEXP = re.compile(r'\b(\w+)-\s*\r?\n\s*(\w+)\b', re.UNICODE)

def dehyphen(text):
    """Re-join words that were hyphenated across a line break.

    Every ``head-<line break>tail`` occurrence is replaced by ``headtail``
    followed by a single newline; whitespace around the line break is dropped.
    Hyphens that are not followed by a line break are left untouched.

    Args:
        text: The string to clean up.

    Returns:
        The text with line-break hyphenations merged.
    """
    return HYPHEN_REGEXP.sub(r"\1\2\n", text)

def get_lang_filenames(archive_name, lang='fr'):
    """Return the names of the archive members written in language `lang`.

    A member is selected when its name matches the glob ``*_<lang>*.txt``,
    the same pattern `read_content` uses to pick files.

    Args:
        archive_name: Path of the zip archive to inspect.
        lang: Language code expected after an underscore in the member name.

    Returns:
        List of matching member names, in the order the archive lists them.
    """
    # The previous body tested names against an undefined `pattern` variable
    # (via the deprecated `typing.re` namespace) and ignored `lang` entirely;
    # the glob is now derived from `lang`.
    lang_pattern = '*_{}*.txt'.format(lang)
    with zipfile.ZipFile(archive_name) as zf:
        return [name for name in zf.namelist() if fnmatch.fnmatch(name, lang_pattern)]

def read_content(archive_name, lang='fr'):
    """Lazily yield the cleaned text of every `lang` file in a zip archive.

    Members are selected with the glob ``*_<lang>*.txt``. Each one is read as
    bytes, decoded as UTF-8 (undecodable bytes are ignored) and passed through
    `dehyphen`.

    Args:
        archive_name: Path of the zip archive to read.
        lang: Language code used to build the member-name glob.

    Yields:
        ``(treaty_id, filename, content)`` tuples, where ``treaty_id`` is the
        part of the member name before its first underscore.
    """
    wanted = '*_{}*.txt'.format(lang)
    with zipfile.ZipFile(archive_name) as archive:
        for member in archive.namelist():
            if not fnmatch.fnmatch(member, wanted):
                continue
            # The glob guarantees at least one underscore in the name, so the
            # first split piece is always the leading id.
            treaty_id = member.split('_')[0]
            with archive.open(member, 'r') as handle:
                raw = handle.read()
            text = gensim.utils.to_unicode(raw, 'utf8', errors='ignore')
            yield treaty_id, member, dehyphen(text)
    


def create_zip_for_lang(treaties, lang, source_name, target_name):
    """Write a new zip holding only the `lang` texts of indexed treaties.

    Texts come from `read_content(source_name, lang=lang)`. A text whose
    treaty id is missing from ``treaties.index`` is reported on stdout and
    skipped; every other text is stored deflated under the name
    ``<signed_year>_<treaty_id>_<lang>.txt``.

    Args:
        treaties: Table indexed by treaty id with a ``signed_year`` column
            (looks like a pandas DataFrame -- confirm against the caller).
        lang: Language code selecting the source files and used in the
            stored names.
        source_name: Path of the zip archive to read texts from.
        target_name: Path of the zip archive to create (opened in "w" mode).
    """
    with zipfile.ZipFile(target_name, "w") as archive:
        for treaty_id, _member, text in read_content(source_name, lang=lang):
            if treaty_id in treaties.index:
                signed_year = treaties.loc[treaty_id]['signed_year']
                entry_name = '{}_{}_{}.txt'.format(signed_year, treaty_id, lang)
                archive.writestr(entry_name, text, zipfile.ZIP_DEFLATED)
            else:
                print('Skipping {} not in WTI-index...'.format(treaty_id))
    print('Done!')
    

# Build the per-language archive: select the `lang` texts from the combined
# corpus and store them, prefixed with their signing year, in a date-stamped zip.

# `time` was used below without an explicit import (it presumably leaked in
# through `from beakerx import *`); import it here so the cell does not depend
# on that side effect.
import time

lang = 'fr'
source_name = '../../data/tCoIR/tCoIR_all.txt.zip'
# The target name carries today's date (YYYYMMDD) so reruns on different days
# do not overwrite earlier archives.
target_name = '../../data/tCoIR/tCoIR_{}_{}.txt.zip'.format(lang, time.strftime("%Y%m%d"))

treaty_repository = domain.tCoIR.treaty_state.TreatyState(data_folder='../../data/tCoIR/wti_index')

# Treaty table used to look up each treaty's signing year and to skip texts
# that are not part of the index.
treaties = treaty_repository.get_treaties(language=lang)
create_zip_for_lang(treaties, lang, source_name, target_name)


Skipping XXX022 not in WTI-index...
Skipping XXX010 not in WTI-index...
Skipping 418117 not in WTI-index...
Skipping XXX027 not in WTI-index...
Skipping 418678 not in WTI-index...
Skipping 415293 not in WTI-index...
Skipping XXX024 not in WTI-index...
Skipping 476047 not in WTI-index...
Skipping 304676 not in WTI-index...
Skipping 200785 not in WTI-index...
Skipping XXX028 not in WTI-index...
Skipping XXX026 not in WTI-index...
Skipping XXX011 not in WTI-index...
Skipping XXX023 not in WTI-index...
Skipping XXX020 not in WTI-index...
Skipping 304572 not in WTI-index...
Done!
