In [None]:
import os 
import re 
import ipyparallel as ipp 
 
import torch 
import numpy as np

In [None]:
# Number of worker engines to start. os.cpu_count() may return None when the
# count cannot be determined, so fall back to a single worker in that case.
n_cores = os.cpu_count() or 1

In [None]:
# Read the raw word list; the file holds one entry per line.
with open(f'data/words_alpha.txt', 'r') as f:
    data = f.read()
# Strip surrounding whitespace from every line and keep only non-empty results.
words = [entry for entry in (line.strip() for line in data.splitlines()) if entry]
words

In [None]:
def create_binary_combinations(n):
    """
    Return all binary strings for the numbers 0 .. 2^n - 1, in ascending order.

    Each string is zero-padded to width n; the strings are used as mask
    patterns with one character per unique letter of a word.
    """
    # Build the format spec once, e.g. '03b' for n == 3.
    width_spec = '0' + str(n) + 'b'
    return [format(value, width_spec) for value in range(1 << n)]

In [None]:
# Create masks to extract letters from words for further data creation.
# Every dict below is keyed by i, the number of unique letters in a word (1..16).
masks = {}  # i -> bool tensor with 2^i - 1 rows and i columns (one row per binary combination)
negative_masks = {}  # i -> element-wise negation of masks[i]
target_masks = {}  # i -> flat sequence of 0-based column indices of the False entries of masks[i], row by row
full_masks = {}  # i -> negation of masks[i] after repeating each row once per False entry in that row
for i in range(1,17): 
    # [:-1] drops the last combination (all '1's), so every remaining row contains at least one '0'.
    mask = torch.tensor(np.array([[*s] for s in create_binary_combinations(i)[:-1]], dtype=int).astype(bool)) # need double type conversion to keep '0' -> False 
    masks[i] = mask 
    negative_masks[i] = ~mask 
    # Label every False position of mask with its 1-based column number (True positions become 0), then flatten.
    targets_mask_proxy = (negative_masks[i] * np.arange(1, i+1)).reshape(-1) 
    # Remove the zeros and shift back to 0-based column indices.
    target_masks[i] = np.delete(targets_mask_proxy,np.where(targets_mask_proxy == 0)) - 1 
    # mask.shape[1]-mask.sum(1) is the number of False entries per row; each row is repeated
    # that many times, so full_masks[i] has exactly one row per entry of target_masks[i].
    full_masks[i] = ~torch.repeat_interleave(mask, mask.shape[1]-mask.sum(1), axis=0)

In [None]:
# Start an ipyparallel cluster with n_cores engines and connect a client to it.
cluster = ipp.Cluster(n=n_cores) 
cluster.start_cluster_sync() 
rc = cluster.connect_client_sync() 
# Wait until n_cores engines are available to the client.
rc.wait_for_engines(n_cores) 
# NOTE(review): presumably makes calls through this client blocking by default — confirm against the ipyparallel docs.
rc.block = True 
# View over all engines; used further down to execute imports, push globals and map the work.
dview = rc[:]

In [None]:
# Alphabet handled by the encoding; code 0 is left unused so it can mean "no letter".
chars = "abcdefghijklmnopqrstuvwxyz"
# letter -> integer code (1-based)
stoi = {letter: code for code, letter in enumerate(chars, start=1)}
# integer code -> letter (inverse mapping)
itos = {code: letter for letter, code in stoi.items()}

In [None]:
def create_data(letters: torch.Tensor, word):
    """
    Create all (masked word, target letter) pairs for a single word.

    Parameters
    ----------
    letters : torch.Tensor
        1-D integer tensor holding the `stoi` codes of the word's unique letters.
    word : str
        The word to mask.

    Returns
    -------
    tuple
        (xs, ys). xs is a list of copies of `word` in which the letters selected
        by one row of full_masks are replaced with '_'. ys is a tensor of letter
        codes picked from `letters` via target_masks, one per entry of xs.

    Raises
    ------
    KeyError
        If the number of unique letters has no entry in full_masks / target_masks.
    """
    n_unique_letters = len(letters)
    # as_tensor reuses an existing tensor instead of copy-constructing it, which
    # avoids the UserWarning that torch.tensor(<tensor>) emits on every call.
    letters = torch.as_tensor(letters)
    # Each row keeps the codes of the letters to hide; all other positions are 0.
    hidden_codes = full_masks[n_unique_letters] * letters
    xs = []
    for row in hidden_codes:
        letter_filter = ''.join(itos[code] for code in row.tolist() if code != 0)
        # Replace every occurrence of a hidden letter with an underscore.
        xs.append(re.sub(f'[{letter_filter}]', '_', word))
    return (xs, letters[target_masks[n_unique_letters]])

In [None]:
# Create one contiguous chunk of words per engine.
words_per_core = len(words) // n_cores
word_splits = [
    words[i * words_per_core:(i + 1) * words_per_core]
    for i in range(n_cores)
]
# Words left over by the integer division go to the last chunk. The slice uses
# an explicit start index: a tail slice such as words[-k:] returns the WHOLE
# list when k == 0, which would duplicate every word when the split is exact.
unfinished_words = len(words) - words_per_core * n_cores
word_splits[-1].extend(words[words_per_core * n_cores:])

In [None]:
def data_creation_wrapper(words):
    """
    Build the training pairs for a list of words.

    The function is sent to a worker process; it reads `stoi` and
    `create_data` from the worker's global namespace.

    Parameters
    ----------
    words : list of str
        Words to process. Empty strings are skipped.

    Returns
    -------
    tuple
        (data_new, y). data_new holds, per processed word, the list of masked
        variants returned by create_data. y is the concatenation of the
        per-word target tensors, or an empty int64 tensor when no word was
        processed.
    """
    targets = []
    data_new = []
    for word in words:
        # sorted() pins the letter order: iteration order of a set of strings
        # can differ between interpreter processes (string hash randomization),
        # which would make the generated row order irreproducible.
        letters = torch.tensor([stoi[e] for e in sorted(set(word))])
        if len(letters) > 0:
            d, t = create_data(letters, word)
            targets.append(t)
            data_new.append(d)
    if not targets:
        # Concatenating an empty list raises, so return an empty target tensor
        # for an empty chunk (e.g. fewer words than worker processes).
        return (data_new, torch.empty(0, dtype=torch.long))
    y = torch.cat(targets)
    return (data_new, y)

In [None]:
# Run the data processing
# Import the modules that create_data / data_creation_wrapper use on every engine.
dview.execute("import torch\nimport re") 
# Send the lookup tables and the create_data helper to the engines, where the mapped function reads them as globals.
dview.push(dict(full_masks=full_masks,target_masks=target_masks,create_data=create_data,stoi=stoi,itos=itos)) 
# Apply data_creation_wrapper to each word split; each result is a (data_new, y) tuple.
output = dview.map_sync(data_creation_wrapper, word_splits)

In [None]:
# Gather the target tensor of every engine result and join them into one tensor.
y_parts = [engine_result[1] for engine_result in output]
data_y = torch.concat(y_parts)
data_y.shape

In [None]:
# Flatten engine results -> per-word lists -> masked strings into one flat list.
data_x = []
for engine_result in output:
    for masked_variants in engine_result[0]:
        data_x.extend(masked_variants)
len(data_x)

In [None]:
# Save the x list, one masked word per line.
x_data = '\n'.join(data_x)
# The context manager closes the file even if the write raises.
with open('data/x.txt', 'w') as f:
    f.write(x_data)

In [None]:
# Persist the target tensor next to the x data.
torch.save(data_y, 'data/y.pt')