In [88]:
# Data splitter
import json
import numpy as np
import pandas as pd
from pathlib import Path
from shutil import copy2
import math
import itertools

from sklearn.model_selection import train_test_split

In [89]:
# Setup variables
DATA_DIR = '/nas/home/joel/src/metis/projectmetis/projectmetis/experiments/run_fedmodels/ner/tf_ner/data/conll2003'
OUTPUT_DIR = '/lfs1/shared/nlp/conll2003data/conll2003_partitions/federated_non_uniform_random/32-way'

# Options shared by every read below (each input file holds one record per line):
#   sep='|'               -> parse each line as a single column; presumably '|' never
#                            occurs in the data (TODO confirm)
#   quoting=3             -> csv.QUOTE_NONE, quote characters in the text are kept literally
#   keep_default_na=False -> tokens such as "NA", "null" or "None" stay strings; with the
#                            pandas default they would be parsed as NaN and later be
#                            written out as "nan"
READ_OPTIONS = dict(sep='|', header=None, quoting=3, keep_default_na=False)

# Read Training data
train_sentences = pd.read_csv(f'{DATA_DIR}/train.words.txt', names=['sentence'], **READ_OPTIONS).sentence.to_numpy()
train_tags = pd.read_csv(f'{DATA_DIR}/train.tags.txt', names=['tags'], converters={'tags': str.split}, **READ_OPTIONS).tags.to_numpy()

# Read Validation data
valid_sentences = pd.read_csv(f'{DATA_DIR}/valid.words.txt', names=['sentence'], **READ_OPTIONS).sentence.to_numpy()
valid_tags = pd.read_csv(f'{DATA_DIR}/valid.tags.txt', names=['tags'], converters={'tags': str.split}, **READ_OPTIONS).tags.to_numpy()

# Concatenate training and validation set
train_valid_sentences_concat = np.concatenate((train_sentences, valid_sentences))
train_valid_tags_concat = np.concatenate((train_tags, valid_tags))

# Sentences and tag sequences are indexed in parallel further down, so they must line up.
assert len(train_valid_sentences_concat) == len(train_valid_tags_concat), \
    'Number of sentences and tag sequences do not match'


In [90]:
# Sanity check: number of examples in the concatenated train + validation set
len(train_valid_tags_concat)

17291

In [91]:
# Partition plan: shard sizes grow linearly from one partition to the next.
N = 32      # total number of partitions
X = 200     # size of the first (smallest) partition
y = 21.95   # growth per partition; the fractional product is rounded up

# Only the first N-1 sizes are computed here. The last partition is not sized
# explicitly: it receives whatever remains when the shuffled indices are split.
split_sizes = [X + math.ceil(split_number * y) for split_number in range(N - 1)]

In [92]:
# Number of examples covered by the explicitly sized partitions
sum(split_sizes)

16419

In [93]:
# Number of explicitly sized partitions
len(split_sizes)

31

In [94]:
# Inspect the individual partition sizes
split_sizes

[200,
 222,
 244,
 266,
 288,
 310,
 332,
 354,
 376,
 398,
 420,
 442,
 464,
 486,
 508,
 530,
 552,
 574,
 596,
 618,
 639,
 661,
 683,
 705,
 727,
 749,
 771,
 793,
 815,
 837,
 859]

In [95]:
# Turn the partition sizes into cumulative cut positions (running totals).
split_points = list(itertools.accumulate(split_sizes))

# Shuffle all example indices with a fixed seed so the partitioning is reproducible.
rng = np.random.default_rng(12345)
indices = np.arange(len(train_valid_tags_concat))
rng.shuffle(indices)

# Cutting at len(split_points) positions yields one more chunk than there are
# cut positions; the final chunk holds the remaining indices.
split_indices = np.split(indices, split_points)

In [96]:
# Sanity check: the partition sizes must add up to the size of the full dataset.
total = sum(len(part) for part in split_indices)
assert len(train_valid_tags_concat) == total, "Something wrong with the splits. Does not add."
print(total)

17291


In [101]:
# Create train-dev split
# Every partition is split into train/dev (test_size=0.05, fixed random_state),
# written to its own `split<k>` directory, and accompanied by an unmodified copy
# of the shared test set.
for idx, split in enumerate(split_indices):
    print(f'Creating split {idx+1}')
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_valid_sentences_concat[split],
        train_valid_tags_concat[split],
        test_size=0.05,
        random_state=42)

    print(f'Number of train examples = {len(X_train)}')
    print(f'Number of dev examples = {len(X_valid)}')
    print(f'Total = {len(X_train)+len(X_valid)}')

    # Setup paths
    # parents=True: on a fresh run OUTPUT_DIR itself may not exist yet; without it
    # mkdir raises FileNotFoundError for the very first split.
    split_dir_path = Path(f'{OUTPUT_DIR}/split{idx+1}')
    split_dir_path.mkdir(parents=True, exist_ok=True)

    # Save training and validation data.
    # np.savetxt defaults to latin1; write UTF-8 explicitly so non-ASCII tokens are
    # written without error and the files can be read back with pandas' default
    # UTF-8 decoding.
    for subset, sentences, tag_lists in (('train', X_train, y_train),
                                         ('valid', X_valid, y_valid)):
        np.savetxt(str(split_dir_path / f'{subset}.words.txt'), sentences, fmt='%s', encoding='utf-8')
        # Each tag sequence is a list of tags; store it as one space-separated line.
        tag_lines = np.array([' '.join(tags) for tags in tag_lists])
        np.savetxt(str(split_dir_path / f'{subset}.tags.txt'), tag_lines, fmt='%s', encoding='utf-8')

    # Save test set as-is
    copy2(f'{DATA_DIR}/test.words.txt', f'{split_dir_path}/test.words.txt')
    copy2(f'{DATA_DIR}/test.tags.txt', f'{split_dir_path}/test.tags.txt')

    print('-'*5)
    print()

Creating split 1
Number of train examples = 190
Number of dev examples = 10
Total = 200
-----

Creating split 2
Number of train examples = 210
Number of dev examples = 12
Total = 222
-----

Creating split 3
Number of train examples = 231
Number of dev examples = 13
Total = 244
-----

Creating split 4
Number of train examples = 252
Number of dev examples = 14
Total = 266
-----

Creating split 5
Number of train examples = 273
Number of dev examples = 15
Total = 288
-----

Creating split 6
Number of train examples = 294
Number of dev examples = 16
Total = 310
-----

Creating split 7
Number of train examples = 315
Number of dev examples = 17
Total = 332
-----

Creating split 8
Number of train examples = 336
Number of dev examples = 18
Total = 354
-----

Creating split 9
Number of train examples = 357
Number of dev examples = 19
Total = 376
-----

Creating split 10
Number of train examples = 378
Number of dev examples = 20
Total = 398
-----

Creating split 11
Number of train examples = 399


In [102]:
# Stitch sentences and tags for model ingestion
def stitch_files(words_file, tags_file, output_filepath='/tmp/concat_data.txt', sep='|||'):
    '''
    Join a words file and a tags file line by line into a single file.

    Each output line is `<sentence><sep><tags>`, built from the line at the
    same position in both input files.

    Parameters
    ----------
    words_file : path of a text file with one tokenized sentence per line.
    tags_file : path of a text file with all tags of one sentence per line.
    output_filepath : path of the concatenated file to create.
    sep : separator pattern placed between sentence and tags; it must not
        already occur in the data.

    Raises
    ------
    AssertionError
        If the two files do not yield the same number of lines.
    '''
    # quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept literally.
    # dtype=str + keep_default_na=False keep tokens such as "NA", "null" or "None"
    # as text; with the pandas defaults they are parsed as NaN, which turns the
    # whole stitched record into a missing value.
    read_options = dict(sep='|', header=None, quoting=3, dtype=str, keep_default_na=False)
    words = pd.read_csv(words_file, names=['sentence'], **read_options)
    tags = pd.read_csv(tags_file, names=['tags'], **read_options)

    assert words.shape[0] == tags.shape[0], 'Number of sentences and tags do not match'

    # Element-wise string concatenation, row i of words with row i of tags.
    concat_data = words['sentence'] + sep + tags['tags']
    concat_data.to_csv(output_filepath, sep='\t', index=False, quoting=3, header=False)


# Create `<subset>.words_tags.txt` for every subset of every split directory.
# Paths and split count come from OUTPUT_DIR and N (set in the cells above) so this
# cell always processes exactly the directories the partitioning step wrote.
for data_split in ['train', 'valid', 'test']:
    for split in range(1, N + 1):
        split_dir = f'{OUTPUT_DIR}/split{split}'
        words_file = f'{split_dir}/{data_split}.words.txt'
        tags_file = f'{split_dir}/{data_split}.tags.txt'
        output_file = f'{split_dir}/{data_split}.words_tags.txt'

        stitch_files(words_file, tags_file, output_file)