## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [None]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import ipywidgets as widgets

sys.path = list(set(['..', '../3_text_analysis']) - set(sys.path)) + sys.path

import matplotlib.pyplot as plt
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import common.config as config
import treaty_corpus
import textacy_corpus_utility as textacy_utility
#from beakerx.object import beakerx
#from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Notebook-wide logger, registered under the name 'corpus_text_analysis'.
logger = utility.getLogger('corpus_text_analysis')

# Load the WTI index from the configured data folder. Later cells read the
# loaded index back through treaty_repository.current_wti_index().
# NOTE(review): the "_with_gui" suffix suggests a progress widget is shown - confirm in common.treaty_state.
treaty_repository.load_wti_index_with_gui(data_folder=config.DATA_FOLDER)

%matplotlib inline

### Create "LTS+UNTS GCAs, 1935-1972" corpus
See https://github.com/humlab/the_culture_of_international_relations/issues/28

In [None]:
import zipfile

def get_treaty_subset(column_name, signed_years=range(1935, 1973), language='en'):
    """Select the treaties flagged 'yes' in a given column, limited by language and signing year.

    Parameters
    ----------
    column_name : str
        Name of a column in the WTI treaties table whose value must equal 'yes',
        e.g. 'is_cultural_yesno_org'.
    signed_years : iterable of int, optional
        Accepted values of the `signed_year` column. The default covers
        1935 through 1972 inclusive (the upper bound of the range is exclusive).
    language : str, optional
        Required value of the `english` column. Defaults to 'en'.

    Returns
    -------
    pandas.DataFrame
        The rows of the current WTI treaties table that satisfy all three conditions.
    """
    treaties = treaty_repository.current_wti_index().treaties

    # All conditions are evaluated row-wise on the same frame, so a single
    # combined mask selects exactly the rows that successive filtering would.
    selected = (
        (treaties[column_name] == 'yes')
        & (treaties.english == language)
        & treaties.signed_year.isin(list(signed_years))
    )

    return treaties.loc[selected]
    
def _is_treaty_file(filename, treaty_id):
    """Return True if `filename` starts with `treaty_id` as a complete token.

    A bare prefix test would let id '100' also claim '1001_en.txt'. The
    character right after the id must therefore be a separator (anything
    non-alphanumeric) or the end of the name.
    """
    if not filename.startswith(treaty_id):
        return False
    following = filename[len(treaty_id):len(treaty_id) + 1]
    return not following.isalnum()


def extract_corpus_treaties(source_filename, treaties):
    """Yield the text of each treaty in `treaties` that has exactly one file in a zip archive.

    Parameters
    ----------
    source_filename : str
        Path to the zip archive holding the treaty text files.
    treaties : object with an `index` attribute
        Typically a pandas DataFrame indexed by treaty id; only `treaties.index`
        is read. Ids are matched against the start of the archive member names.

    Yields
    ------
    tuple of (treaty_id, filename, content)
        The treaty id, the matching archive member name, and the member's
        content decoded as UTF-8.

    Notes
    -----
    Treaties with zero or several matching files are skipped and reported on
    stdout. The archive stays open until the generator is exhausted or closed.
    """
    with zipfile.ZipFile(source_filename, "r") as zf_in:

        filenames = zf_in.namelist()

        for treaty_id in treaties.index:

            treaty_filenames = [
                filename for filename in filenames if _is_treaty_file(filename, treaty_id)
            ]

            if len(treaty_filenames) == 1:
                content = zf_in.read(treaty_filenames[0]).decode('utf-8')
                yield treaty_id, treaty_filenames[0], content
            else:
                print("NOT FOUND OR DUPLICATE: {}".format(treaty_id))

def store_sub_corpus(source_filename, treaties, target_filename):
    """Write the texts of the given treaties from one zip archive into a new one.

    Parameters
    ----------
    source_filename : str
        Path to the zip archive to read treaty texts from.
    treaties : object with an `index` attribute
        Selection of treaties, passed on unchanged to `extract_corpus_treaties`.
    target_filename : str
        Path of the zip archive to create; an existing file is overwritten.

    Notes
    -----
    Members keep their original names. The archive is written deflated:
    zipfile's default is to store members uncompressed.
    """
    with zipfile.ZipFile(target_filename, "w", compression=zipfile.ZIP_DEFLATED) as zf_out:
        for _, filename, content in extract_corpus_treaties(source_filename, treaties):
            zf_out.writestr(filename, content)

# Build one sub-corpus archive per selection column from the full treaty text archive.
source_filename = '../data/treaty_texts.zip'
for column_name in [ 'is_cultural_yesno_org' ]:
    treaties = get_treaty_subset(column_name)
    # Archive name: date stamp plus the upper-cased last part of the column name, e.g. "20200701_ORG".
    # NOTE(review): the name carries no ".zip" extension - confirm that is intended.
    target_filename = "20200701_{}".format(column_name.split('_')[-1].upper())
    store_sub_corpus(source_filename, treaties, target_filename)


In [None]:
import sys
import os
if '..' not in sys.path: sys.path.append('..')

#from common.treaty_state import load_wti_index
#wti_index = load_wti_index('../data/')

import treaty_corpus

# Location of the zipped treaty texts.
source_folder = '../data'
#source_file = 'treaty_text_corpora_en_201908.zip'
source_file = 'Treaty texts (.txt) (finished).zip'
source_path = os.path.join(source_folder, source_file)

# Keep treaties flagged as cultural in at least one of the three yes/no columns,
# marked 'en' in the `english` column, and signed in 1935-1972 inclusive.
treaties = treaty_repository.current_wti_index().treaties
treaties = treaties.loc[
    (
        treaties.is_cultural_yesno_org.eq('yes')
        | treaties.is_cultural_yesno_gen.eq('yes')
        | treaties.is_cultural_yesno_plus.eq('yes')
    )
    & treaties.english.eq('en')
    & treaties.signed_year.isin(range(1935, 1973))
]

# corpus_stream = treaty_corpus.TreatyCompressedFileReader(source_path, 'en', list(treaties.index))
# corpus = treaty_corpus.TreatyCorpus(corpus_stream)


### Compile "LTS+UNTS GCAs, 1935-1972" corpus

In [None]:
import glob
import shutil
# Move the text file of every selected treaty into the "compiled" sub-folder of
# the corpus folder, reporting treaties with no file or with several candidates.
folder = "20200630_LTS+UNTS_GCAs_1935-1972"
corpus_folder = os.path.join(source_folder, folder)
compiled_folder = os.path.join(corpus_folder, "compiled")

for treaty_id in treaties.index:
    filepattern = os.path.join(corpus_folder, "{}_en*.txt".format(treaty_id))
    filenames = glob.glob(filepattern)
    if len(filenames) == 1:
        # shutil.move raises if the destination folder is missing, so create it
        # on demand. Doing it here avoids creating folders when nothing is moved.
        os.makedirs(compiled_folder, exist_ok=True)
        shutil.move(filenames[0], os.path.join(compiled_folder, os.path.basename(filenames[0])))
    elif len(filenames) == 0:
        print("missing file: {}".format(treaty_id))
    else:
        print("duplicate: {}".format(treaty_id))

        

## Rename files in archive
Prefix every filename in the zip archive with the treaty's signed year, giving `signed_year` + `_` + original filename.

In [None]:
import zipfile
# Copy every member of the source archive into a new archive, prefixing each
# member name with the signed year of its treaty.
# The year lookup is loop-invariant, so it is fetched once, before the target
# archive is opened for writing.
signed_years = treaty_repository.current_wti_index().treaties['signed_year']

with zipfile.ZipFile('../data/tCoIR_en_45-72_renamed.txt.zip', 'w', compression=zipfile.ZIP_DEFLATED) as new_zipfile:
    with zipfile.ZipFile('../data/tCoIR_en_45-72.txt.zip') as old_zipfile:
        for filename in old_zipfile.namelist():
            # Member names are expected to start with "<treaty_id>_".
            treaty_id = filename.split('_')[0]
            # Raises KeyError when the id is not in the treaties index.
            signed_year = signed_years.loc[treaty_id]
            new_filename = '{}_{}'.format(signed_year, filename)
            with old_zipfile.open(filename) as textfile:
                content = textfile.read()
            new_zipfile.writestr(new_filename, content)