In [2]:
import logging
level = logging.getLevelName("INFO")
logging.basicConfig(
  level=level,
  format="[%(asctime)s %(levelname)s] %(message)s",
  datefmt="%H:%M:%S",
)
import json
import numpy as np
import parallel
import torch

from experiment import Net, init_dataset, run_one

In [5]:
chars = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
         ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 
         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
         'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
         '{', '|', '}', '~']
len(chars)


69

In [6]:
char_to_idx_map = {char: chars.index(char)+1 for char in chars}
# add chr 0 as an unknown char id
char_to_idx_map[chr(0)] = 0
idx_to_char_map = {val: key for key, val in char_to_idx_map.items()}
len(char_to_idx_map) 

70

In [7]:
def RecoverCharacters(encoded):
  chars = []
  for ci in encoded:
    chars.append(idx_to_char_map[ci])
  return chars
def RecoverOriginalSequence(encoded, spaces):
  s = ""
  for i, c in enumerate(RecoverCharacters(encoded)):
    s+=c
    if spaces[i] >= 1:
      s+=" "
  return s


In [124]:
def GetEncodingAndSpaceLabels(chunk):
  # get original chunk, integer encoded char with spaces removed, and space location vector
  space_location = [0]*CHUNK_SIZE
  spaces = 0
  encoded_chars = []
  for i, c in enumerate(chunk):
    if c == " ":
      space_location[i - 1 - spaces] = 1
      spaces += 1
    else:
      encoded_chars.append(char_to_idx_map.get(c, 0))
  return chunk, encoded_chars

In [125]:
def GetCharEncoding(text):
  encoding = []
  for c in text:
    encoding.append( char_to_idx_map.get(c, 0))
  return torch.tensor(encoding, dtype=torch.long).unsqueeze(0)

# Let me use the model

In [126]:
h = {
  'kernel|filter_sizes': [
    (4,32),
    (4,64),
    (4,64),
  ],
  'final_conv_kernel': 3,  
  'sequence_length': 100,
  'vocab_size': 70, #character vocab
  'char_embedding_size': 8,
  'conv_activation': 'relu',
  }

In [127]:
model = Net(h)

In [129]:
model.load_state_dict(torch.load('model_trained_on_wiki_data.pt'))

<All keys matched successfully>

In [149]:
def SplitText(text, model):
  inp_tensor = GetCharEncoding(text)
  out = model(inp_tensor)
  space_location = (out>.5).int()
  return RecoverOriginalSequence(inp_tensor[0].cpu().numpy(), space_location[0])

In [154]:
SplitText("petsmart", model)

'pets mart'

In [155]:
text = "thisstringisjustatest"
SplitText(text, model)

'this string is justatest'

# Data preprocessing

In [8]:
# raw text file
text_file = 'yourdata.txt'

In [9]:
lines = []
with open(text_file) as f:
  for l in f.readlines():
    line = l.lower().strip()
    line = "".join([x if x in char_to_idx_map else idx_to_char_map[0] for x in line]).strip()
    lines.append(line)

full_set = " ".join(lines)


In [14]:
# How many characters in a chunk of text after spaces are removed
CHUNK_SIZE = 100
chunks = []
count = 0
cur_chunk = []
for c in full_set:
  if c == " ":
    if count != 0:
      cur_chunk.append(c)
    continue
  if count >= CHUNK_SIZE:
    chunks.append("".join(cur_chunk))
    cur_chunk = []
    count = 0
  
  if count == 0 and c==" ":
   continue
  count +=1
  cur_chunk.append(c)

len(chunks)

548247

In [20]:
original_chunk, char_ids, spaces = GetEncodingAndSpaceLabels("convert this to a chunk with no spaces and get a vector indicating where spaces belong")

In [23]:
np.array(spaces)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:
out = parallel.RunInParallel(chunks, GetEncodingAndSpaceLabels)

In [25]:
np.random.shuffle(out)

chunks, encoded, spaces = zip(*out)


In [29]:
i = 7
assert chunks[i] == RecoverOriginalSequence(encoded[i], spaces[i])

In [None]:
raw_data = {'char_as_ints': encoded, 'space_labels': spaces, 'char_to_idx_map': char_to_idx_map}

In [None]:
torch.save({'features': torch.tensor(encoded), 'labels': torch.tensor(labels), 'char_to_idx_map':data['char_to_idx_map']}, 'data/removed_spaces.pt')

In [None]:
# Training the model

In [21]:
h = {
  'dataset':'torch_wiki.pt',
  'dataset_split': [.5,.1],
  'batch_size': 128,
  'learning_rate': .5,
  'momentum': .98,
  'epochs': 3,
  'kernel|filter_sizes': [
    (4,32),
    (4,64),
    (4,64),
  ],
  'final_conv_kernel': 3,  
  'sequence_length': 100,
  'vocab_size': 70, #character vocab
  'char_embedding_size': 8,
  'conv_activation': 'relu',
  'lr_step_size': 4000,
  'lr_decay': .9,
  'run_validation':True,
  }

In [22]:
ds = init_dataset(h)

In [24]:
d = ds[0][0]
RecoverOriginalSequence(d['features'].cpu().numpy(), d['labels'].cpu().numpy())

'd from the position . mike steele then took the job from banister in 2009 . banister managed the scottsdale scorpions of t'

In [25]:
results, model = run_one(h, ds)

[01:40:31 INFO] Experiment Model:
Net(
  (emb): Embedding(70, 8)
  (convs): Sequential(
    (0): Conv1d(8, 32, kernel_size=(4,), stride=(1,), padding=(2,))
    (1): LambdaLayer()
    (2): ReLU()
    (3): Conv1d(32, 64, kernel_size=(4,), stride=(1,), padding=(2,))
    (4): LambdaLayer()
    (5): ReLU()
    (6): Conv1d(64, 64, kernel_size=(4,), stride=(1,), padding=(2,))
    (7): LambdaLayer()
    (8): ReLU()
  )
  (conv_final): Conv1d(64, 1, kernel_size=(3,), stride=(1,), padding=(1,))
)


[01:40:31 INFO] Hyperparams:
{'dataset': 'torch_wiki.pt', 'dataset_split': [0.5, 0.1], 'batch_size': 128, 'learning_rate': 0.5, 'momentum': 0.98, 'epochs': 3, 'kernel|filter_sizes': [(4, 32), (4, 64), (4, 64)], 'final_conv_kernel': 3, 'sequence_length': 100, 'vocab_size': 70, 'char_embedding_size': 8, 'conv_activation': 'relu', 'lr_step_size': 400, 'lr_decay': 0.9, 'run_validation': True}


[01:40:31 INFO] Model Size: 1.06e+05 bytes


[01:40:31 INFO] TRAINING


[01:40:31 INFO] total batches: 50217


[01:40:39 INFO] Log Type  global_step  epoch  char_accuracy  example_accuracy  loss     lr_at_step  us/ex  % cmplt


[01:40:39 INFO] TRAIN     502          0      0.95554685592  0.0390625         0.11003  0.45        124.3  0.999  


[01:40:47 INFO] TRAIN     1004         0      0.96531248092  0.0859375         0.09827  0.405       125.8  1.999  


[01:40:55 INFO] TRAIN     1506         0      0.97101563215  0.078125          0.08024  0.36450000  124.4  2.998  


[01:41:04 INFO] TRAIN     2008         0      0.96999996900  0.1015625         0.08492  0.29524500  127.0  3.998  


[01:41:11 INFO] TRAIN     2510         0      0.97304683923  0.1015625         0.07336  0.26572050  125.6  4.998  


[01:41:21 INFO] TRAIN     3012         0      0.97429686784  0.125             0.06764  0.23914845  130.6  5.997  


[01:41:30 INFO] TRAIN     3514         0      0.97421872615  0.1484375         0.06845  0.21523360  130.3  6.997  


[01:41:38 INFO] TRAIN     4016         0      0.97226560115  0.109375          0.07200  0.17433922  130.1  7.997  


[01:41:46 INFO] TRAIN     4518         0      0.97499996423  0.1015625         0.06779  0.15690529  129.4  8.996  


[01:41:55 INFO] TRAIN     5020         0      0.97609370946  0.1640625         0.06901  0.14121476  130.5  9.996  


[01:41:55 INFO] TRAIN     5021         0      0.97585934400  0.1796875         0.06270  0.14121476  130.5  9.998  


[01:43:21 INFO] TRAIN     10042        0      0.97679686546  0.1640625         0.06655  0.03589489  131.9  19.99  


[01:44:52 INFO] TRAIN     15063        0      0.97882813215  0.1953125         0.06239  0.01013777  135.3  29.99  


[01:46:12 INFO] TRAIN     20084        1      0.97820311784  0.2265625         0.05855  0.00257688  124.6  39.99  


[01:47:36 INFO] TRAIN     25105        1      0.97609370946  0.15625           0.06150  0.00072778  128.5  49.99  


[01:49:02 INFO] TRAIN     30126        1      0.97921872138  0.125             0.05970  0.00018499  130.8  59.99  


[01:50:23 INFO] TRAIN     35147        2      0.97679686546  0.1484375         0.06205  5.22478381  120.3  69.99  


[01:51:40 INFO] TRAIN     40168        2      0.9765625      0.15625           0.06417  1.32806994  119.2  79.98  


[01:52:51 INFO] TRAIN     45189        2      0.97945308685  0.234375          0.05702  3.75086178  115.7  89.98  


[01:54:03 INFO] TRAIN     50210        2      0.97789061069  0.1171875         0.05787  9.53418740  114.3  99.98  


[01:54:16 INFO] VAL       50220               0.97768712043  0.17460158467292  0.06188                            


In [156]:
text = "petsmart"

In [157]:
SplitText(text, model)

'pets mart'

In [159]:
inp_tensor = GetCharEncoding(text)
out = model(inp_tensor)
space_location = (out>.5).int()

print("Top Splits:")
for location in out.argsort(descending=True)[0][:6]:
  space_location = torch.zeros(size=space_location.shape)
  space_location[0][location] = 1
  print(RecoverOriginalSequence(inp_tensor[0].cpu().numpy(), space_location[0]))

Top Splits:
pets mart
petsmart
petsmart 
petsmart
petsmart
pet smart
