# Create regional sub-corpora


## Load WTI index

In [None]:
# https://github.com/humlab/the_culture_of_international_relations/issues/25

import glob
import os
import sys

def project_root():
    """Return the closest ancestor of the working directory that holds ``common``.

    The search starts at the current working directory and walks upwards one
    directory at a time until a folder containing an entry named ``common``
    is found.

    Returns:
        The path of the first folder (starting from ``os.getcwd()``) that
        contains ``common``.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            a ``common`` entry.
    """
    folder = os.getcwd()
    while not os.path.exists(os.path.join(folder, "common")):
        parent = os.path.dirname(folder)
        # At the filesystem root the parent equals the folder itself; without
        # this guard the loop would spin forever.
        if parent == folder:
            raise FileNotFoundError(
                "No 'common' folder found in {!r} or any of its parent folders".format(os.getcwd())
            )
        folder = parent
    return folder

# Make the project root importable so that the `common` package resolves
# regardless of which sub-folder the notebook is started from. The membership
# check keeps repeated runs of this cell from piling up duplicate entries in
# sys.path.
root_folder = project_root()
if root_folder not in sys.path:
    sys.path.append(root_folder)

import common.config as config
import common.treaty_state as treaty_repository

# Load the WTI index from the configured data folder; the cells below read it
# back through treaty_repository.current_wti_index().
treaty_repository.load_wti_index_with_gui(data_folder=config.DATA_FOLDER)

## List Countries in Regions

In [None]:
import pandas as pd

# Side-by-side overview of the parties in each region: one column per region.
# The regions differ in size, so the shorter columns are padded with empty
# strings instead of NaN.
region_columns = {}
for region_no in (1, 2, 3):
    region_columns[f'Region {region_no}'] = pd.Series(config.get_region_parties(region_no))

pd.DataFrame(region_columns).fillna('')

## Create bash-script

In [None]:
# Note: Requires Python >= 3.7
#
# Generates `create_regional_sub_corpora.sh`. The script copies every treaty
# text file into one folder per region and zips each folder into
# `sub_corpus_region<N>.zip`. A treaty is copied into every region that at
# least one of its two parties belongs to, so one file can end up in several
# sub-corpora.

import shlex

# Relative to the notebook's working directory; the generated script uses the
# same relative paths and must therefore be run from that directory as well.
folder = 'text'
groups = {x: set(config.get_region_parties(x)) for x in [1, 2, 3]}

# The shebang has to be the very first line of the file to have any effect,
# hence no newline between the opening quotes and `#!`.
bash_script = f'''#!/bin/bash

rm -rf {folder}/region1
rm -rf {folder}/region2
rm -rf {folder}/region3

mkdir {folder}/region1
mkdir {folder}/region2
mkdir {folder}/region3
'''

# Look the treaty table up once instead of once per file.
treaties = treaty_repository.current_wti_index().treaties

copy_commands = []
# Sorted so that the generated script is identical from run to run.
for path in sorted(glob.glob(os.path.join(folder, '*.txt'))):

    filename = os.path.basename(path)
    # The treaty id is the part of the file name before the first underscore.
    treaty_id = filename.split('_')[0]
    try:
        treaty = treaties.loc[treaty_id]
    except KeyError as err:
        raise KeyError(
            f'{filename}: treaty id {treaty_id!r} not found in the WTI index'
        ) from err

    parties = {treaty['party1'], treaty['party2']}
    # shlex.quote leaves ordinary file names untouched and protects the shell
    # from names containing spaces or other special characters.
    source = shlex.quote(f'{folder}/{filename}')

    for region_id, region_parties in groups.items():
        if not parties.isdisjoint(region_parties):
            target = shlex.quote(f'{folder}/region{region_id}/')
            copy_commands.append(f'cp {source} {target}\n')

bash_script += ''.join(copy_commands)

bash_script += f'''
zip -j sub_corpus_region1.zip {folder}/region1/*txt
zip -j sub_corpus_region2.zip {folder}/region2/*txt
zip -j sub_corpus_region3.zip {folder}/region3/*txt

rm -rf {folder}/region1
rm -rf {folder}/region2
rm -rf {folder}/region3
'''

# Force LF line endings and UTF-8 so the script is valid for bash no matter
# which platform the notebook runs on.
with open('./create_regional_sub_corpora.sh', 'w', encoding='utf-8', newline='\n') as f:
    f.write(bash_script)
