In [1]:
import os, re, json, html
import numpy as np
import random as ra
from tqdm import tqdm
# from src.bpe import GreedyBPE
from src.bpe import HRBPE
from bs4 import BeautifulSoup

def read_training_data(train_sets, data_dir, language, seed, n):
    """Load gold tokenizations from JSON files and optionally subsample them.

    Each file ``{data_dir}/{language}/{train_set}.json`` is expected to hold a
    JSON object; only its values are used, and every value is treated as a
    sequence of string tokens that concatenate back into one document.

    Args:
        train_sets: names of the data sets to load (file stems, no extension).
        data_dir: root directory holding one sub-directory per language.
        language: sub-directory of ``data_dir`` to read from.
        seed: seed applied to the global ``random`` state before sampling
            (only used when a strict subsample is drawn).
        n: number of documents to keep. ``-1`` or any value ``>=`` the number
            of loaded documents keeps everything; ``0 < n < len`` draws a
            random sample of size ``n``.

    Returns:
        A 3-tuple ``(docs, train_handle, gold)`` where ``docs`` are the
        concatenated document strings, ``train_handle`` is the data-set names
        joined by ``'-'``, and ``gold`` is the list of token sequences that
        ``docs`` was built from (same order).

    Raises:
        ValueError: if ``n`` is neither ``-1`` nor a usable sample size
            (i.e. ``n == 0`` with data present, or ``n < -1``).
    """
    train_handle = '-'.join(train_sets)
    gold = []
    for train_set in train_sets:
        # Use a context manager so the handle is closed deterministically, and
        # decode explicitly as UTF-8 rather than relying on the locale default.
        with open(f"{data_dir}/{language}/{train_set}.json", encoding="utf-8") as data_file:
            gold.extend(json.load(data_file).values())
    if 0 < n < len(gold):
        # Re-seeding the global generator keeps the subsample reproducible.
        ra.seed(seed)
        gold = ra.sample(gold, n)
    elif not (n == -1 or n >= len(gold)):
        raise ValueError(
            f"n must be -1 (keep everything) or a positive sample size; got {n!r}"
        )
    return ["".join(ts) for ts in gold], train_handle, gold

def get_spans(tokens):
    """Return the ``(start, end)`` character offsets of each token.

    Offsets are measured in the string obtained by concatenating ``tokens``;
    ``end`` is exclusive, so consecutive spans share a boundary. An empty
    token sequence yields an empty list.
    """
    token_lengths = [len(token) for token in tokens]
    # Running total of the lengths gives every token's end offset; prefixing 0
    # supplies the start of the first token.
    boundaries = [0]
    boundaries.extend(np.cumsum(token_lengths))
    starts = boundaries[:-1]
    ends = boundaries[1:]
    return list(zip(starts, ends))

def eval_segmentation(ts, ts_hat):
    """Score a predicted tokenization against a gold one by exact span match.

    Both arguments are token sequences; each is converted to a set of
    character spans with ``get_spans`` and a predicted span counts as correct
    only when the identical span occurs in the gold set.

    Returns:
        ``(precision, recall, f1)``; any ratio whose denominator is zero is
        reported as 0.
    """
    gold_spans = set(get_spans(ts))
    predicted_spans = set(get_spans(ts_hat))

    true_pos = len(predicted_spans & gold_spans)
    false_pos = len(predicted_spans - gold_spans)
    false_neg = len(gold_spans - predicted_spans)

    n_predicted = true_pos + false_pos
    n_gold = true_pos + false_neg

    precision = true_pos / n_predicted if n_predicted else 0
    recall = true_pos / n_gold if n_gold else 0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum else 0
    return precision, recall, f1

# --- reproducibility ---------------------------------------------------------
seed = 691
ra.seed(seed)  # seeds the global `random` generator

# --- model / fitting configuration ------------------------------------------
method = 'hr-bpe'
init_method = 'char'
# init_method = 'warm'
num_batches = 100
batch_size = 10_000
actions_per_batch = int(batch_size / 1)  # divisor of 1 => one action per batch item
reg_model = 'mixing'
# Parameter-estimation strategy; the alternatives tried so far are kept below.
# param_method = 'est_type'
# param_method = 'est_doc'
param_method = 'est_theta'
# param_method = 'regress'
# param_method = 'regress_theta'
early_stop = True

# --- data selection ----------------------------------------------------------
language = "EN"
data_dir = "/cephfs/data/hr-bpe/data/gold" # "./data/gold"
nsamp = 1000000  # requested number of gold documents to load
use_external = False        # append the external documents to the training docs
use_cover = False           # pass the gold tokenizations as a covering
use_covering_vocab = False  # build a covering vocabulary from the gold tokens

# Choose the training sets and the supplementary ("external") documents for the
# configured language. Every branch defines train_sets, external_docs and
# external_handle.
if language == "EN":
    # load the supplementary data
    threaddir = "/cephfs/data/hr-bpe/data/threads/" # '/home/jake/courses/DSCIT780/data/threads/'
    # os.listdir returns names in arbitrary order, so sort before sampling;
    # otherwise the seeded sample below is not reproducible across machines.
    # The pattern is a raw string so that \w and \. reach the regex engine
    # without relying on Python's deprecated handling of invalid escapes.
    tids = sorted(fname[:-5] for fname in os.listdir(threaddir) if re.search(r'^\w+\.json$', fname))
    ntids = 10000
    ntids = min(len(tids), ntids) # number of threads to sample
    external_handle = f'reddit-privacy-threads-{ntids}'
    # model_str = f'{method}_{init_method}_{num_batches}_{batch_size}_{actions_per_batch}_{reg_model}_{param_method}_{language}_{seed}_{train_handle}-{nsamp}'
    sample_tids = ra.sample(tids, ntids)
    external_docs = []
    for tid in sample_tids:
        # Close each thread file as soon as it has been parsed.
        with open(threaddir + tid + '.json', encoding="utf-8") as thread_file:
            thread_posts = json.load(thread_file)
        for post in thread_posts:
            # Entries with a 'body' use it directly; otherwise the text is
            # rebuilt from 'title' and 'selftext'.
            raw_text = post.get('body', (post.get('title', '') + '\n' + post.get('selftext', '')))
            # Unescape HTML entities, then let BeautifulSoup drop any markup.
            external_docs.append(BeautifulSoup(html.unescape(raw_text.strip('\n')), "lxml").text)
    train_sets = ["ewtb", "lowlands", "ritter"]
elif language == "ZH":
    # Lines read from a file keep their trailing newline, so a bare `if line`
    # never filters anything; strip first so blank lines are actually skipped.
    with open("/cephfs/data/hr-bpe/data/background.txt") as background_file:
        external_docs = [line for line in background_file if line.strip()]
    train_sets = ["nlpcc2016-train"]
    train_sets += ["icwb2-train-msr", "icwb2-train-pku", "icwb2-train-cityu"] # "icwb2-train-as"
    external_handle = 'weibo_background'
#     external_docs, _, _ = read_training_data(external_sets, data_dir, "ZH", seed, -1)
else:
    train_sets = ['parseme-train']
    external_docs = []
    external_handle = 'NA'

# load the training data
train_docs, train_handle, train_gold = read_training_data(train_sets, data_dir, language, seed, nsamp)

# Optional covering information derived from the gold tokenizations:
# the set of distinct gold tokens, and/or the gold token sequences themselves.
covering_vocab = {tok for toks in train_gold for tok in toks} if use_covering_vocab else set()
covering = list(train_gold) if use_cover else []

# name the model
model_str = f'{method}_{init_method}_{num_batches}_{batch_size}_{actions_per_batch}_{reg_model}_{param_method}_{language}_{seed}_{train_handle}-{nsamp}_{external_handle}'

# The fitting corpus is the training docs, plus the external docs when enabled.
docs = (train_docs + external_docs) if use_external else train_docs

# check to see what we got
total_chars = sum(len(doc) for doc in docs)
print('model name: ', model_str, '\n')
print('number of samples, total characters, covering size: ', len(docs), total_chars, len(covering_vocab))

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

model name:  hr-bpe_char_100_10000_10000_mixing_est_theta_EN_691_ewtb-lowlands-ritter-1000000_reddit-privacy-threads-10000 

number of samples, total characters, covering size:  4802 352374 0


In [2]:
# Build, initialise and fit the HR-BPE model on `docs`, then tokenize a sample
# sentence as a quick sanity check (the last expression is the cell's output).
token_index = {"<eos>": 0, "<unk>": 1}
model = HRBPE(param_method=param_method, reg_model=reg_model, early_stop=early_stop, 
              tok2ind = token_index, covering_vocab = covering_vocab) 
# The regex patterns are raw strings: sequences such as \( or \w are invalid
# escapes in ordinary string literals (deprecated, and a SyntaxWarning on
# recent Python versions). The resulting pattern text is unchanged. "\n" stays
# a normal literal because it must be an actual newline character.
# NOTE(review): action_protect presumably lists patterns that merges must not
# create (newlines, punctuation adjacent to word characters) — confirm against
# HRBPE.init.
model.init(docs, seed=seed, method=init_method, covering = covering,
           action_protect = ["\n", r"[*\(\{\[\)\}\]\.\?\!\,\;][ ]*\w", r"\w[ ]*[*\(\{\[\)\}\]\.\?\!\,\;]"])
model.fit(num_batches, batch_size, actions_per_batch=actions_per_batch, seed=seed)

model.tokenize('this is a test to see how tokenization works on an arbitrary sentence')

Initializing: 100%|██████████| 4802/4802 [00:00<00:00, 5169.72it/s]
Fitting:   8%|▊         | 8/100 [00:09<02:01,  1.32s/it]

Built a vocabulary of 9272 types


('this',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 't',
 'e',
 's',
 't',
 ' ',
 'to',
 ' ',
 's',
 'e',
 'e',
 ' ',
 'how',
 ' ',
 't',
 'ok',
 'e',
 'niz',
 'ation',
 ' ',
 'work',
 's',
 ' ',
 'on',
 ' ',
 'an',
 ' ',
 'a',
 'r',
 'bit',
 'r',
 'a',
 'ry',
 ' ',
 's',
 'e',
 'n',
 'tence')

Results by the `'regress_theta'` method, which merges the covering and covered tokens of a vocabulary:
```
with 1 thread:
- P, R, F1: [0.5439240588877221, 0.7853464480450258, 0.6351837302274742]
with 10 threads:
- P, R, F1: [0.5473520696069848, 0.7878414794975205, 0.6382979653996623]
with 100 threads:
- P, R, F1: [0.5590421705222007, 0.794544866772627, 0.648861981193056]
with 1000 threads:
- P, R, F1: [0.6150348779580214, 0.828224440535471, 0.6977321039142568]
with 10000 threads:
- P, R, F1: [0.6332812612278924, 0.8365140770050181, 0.712705025346611]
```

Results by the `'regress_theta'` method, with merges conforming under a positional covering (gold tokenization):
```
with 1 thread:
- P, R, F1: [0.5508993408194046, 0.7963483709277502, 0.6432619091776358]
with 10 threads:
- P, R, F1: [0.5265977934771198, 0.7787350859313389, 0.62051201301476]
with 100 threads:
- P, R, F1: [0.5703482529506562, 0.8063852262338111, 0.6598749556157072]
with 1000 threads:
- P, R, F1: [0.5860991713323762, 0.8159955107070732, 0.6739022523072684]
with 10000 threads:
- P, R, F1: [0.6065336714999103, 0.8262617805965031, 0.6907931087621503]
```

Results for EN:
```
with no cover:
- P, R, F1:  [0.5473409177750366, 0.7889378079367121, 0.6386962894430573]
with no external data:
- 
with all external data:
- 
```

Results for ZH:
```
with no cover:
- 
with no external data:
- 
with all external data:
- 
```

In [3]:
# Held-out evaluation sets per language; any other language falls back to the
# PARSEME test set.
test_sets_by_language = {
    "EN": ["ted", "trustpilot", "tweebank"],
    "ZH": ["nlpcc2016-test",
           "icwb2-test-msr", "icwb2-test-pku", "icwb2-test-cityu"],  # "icwb2-test-as"
}
test_sets = test_sets_by_language.get(language, ["parseme-test"])
test_docs, test_handle, test_gold = read_training_data(test_sets, data_dir, language, seed, nsamp)

# Score every non-empty gold tokenization against the model's tokenization of
# the same concatenated text, then average each metric over documents.
doc_scores = [
    eval_segmentation(gold_tokens, model.tokenize("".join(gold_tokens)))
    for gold_tokens in tqdm(test_gold)
    if gold_tokens
]
print("P, R, F1: ", [np.mean(metric_values) for metric_values in zip(*doc_scores)])

# Show the predicted tokenization of the second test document as an example.
[model.tokenize("".join(gold_tokens)) for gold_tokens in test_gold[1:2] if gold_tokens]


  0%|          | 0/1003 [00:00<?, ?it/s][A
  3%|▎         | 27/1003 [00:00<00:03, 269.20it/s][A
  5%|▌         | 51/1003 [00:00<00:04, 208.05it/s][A
  9%|▉         | 92/1003 [00:00<00:03, 244.10it/s][A
 13%|█▎        | 132/1003 [00:00<00:03, 275.53it/s][A
 16%|█▌        | 160/1003 [00:00<00:03, 275.21it/s][A
 20%|█▉        | 198/1003 [00:00<00:02, 299.79it/s][A
 23%|██▎       | 233/1003 [00:00<00:02, 312.58it/s][A
 28%|██▊       | 281/1003 [00:00<00:02, 348.43it/s][A
 32%|███▏      | 318/1003 [00:00<00:01, 353.05it/s][A
 35%|███▌      | 356/1003 [00:01<00:01, 356.31it/s][A
 39%|███▉      | 394/1003 [00:01<00:01, 361.92it/s][A
 43%|████▎     | 435/1003 [00:01<00:01, 373.63it/s][A
 47%|████▋     | 473/1003 [00:01<00:01, 362.17it/s][A
 51%|█████     | 510/1003 [00:01<00:01, 356.63it/s][A
 57%|█████▋    | 568/1003 [00:01<00:01, 399.84it/s][A
 63%|██████▎   | 627/1003 [00:01<00:00, 441.22it/s][A
 68%|██████▊   | 686/1003 [00:01<00:00, 476.96it/s][A
 74%|███████▎  | 739/10

P, R, F1:  [0.5473409177750366, 0.7889378079367121, 0.6386962894430573]





[('So',
  ' ',
  'to',
  ' ',
  'm',
  'e',
  ' ',
  'it',
  ' ',
  'f',
  'e',
  'l',
  't',
  ' ',
  'like',
  ' ',
  'photography',
  ' ',
  'was',
  ' ',
  'more',
  ' ',
  'about',
  ' ',
  'being',
  ' ',
  'a',
  't',
  ' ',
  'th',
  'e',
  ' ',
  'right',
  ' ',
  'place',
  ' ',
  'and',
  ' ',
  'th',
  'e',
  ' ',
  'right',
  ' ',
  'time',
  '.')]

In [4]:
# model.save('cache/' + model_str + '.json')

In [5]:
# ntids = 10000
# ntids = min(len(tids), ntids) # number of threads to sample
# thread_handle = f'reddit-privacy-threads-{ntids}'
# model_str = f'{method}_{init_method}_{num_batches}_{batch_size}_{actions_per_batch}_{reg_model}_{param_method}_{language}_{seed}_{train_handle}-{nsamp}_{thread_handle}'

# mod = HRBPE(param_method=param_method, reg_model=reg_model, early_stop=early_stop)
# mod.load('cache/' + model_str + '.json')
# mod.init([], seed=seed, method=init_method)
# print("P, R, F1: ", list(map(np.mean, zip(*[eval_segmentation(ts, mod.tokenize("".join(ts))) for ts in tqdm(test_gold) if ts]))))
# [mod.tokenize("".join(ts)) for ts in test_gold[1:2] if ts]