In [1]:
import re, collections

path  = '/content/sample1.txt'

def get_vocab(filename):
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fs:
        for l in fs:
            sent = l.strip().split(" ")
            for word in sent:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab

vocab = get_vocab(path)
print(len(vocab))

33


In [2]:
def get_pair_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

In [3]:
pairs = get_pair_stats(vocab)
print(pairs)

defaultdict(<class 'int'>, {('T', 'h'): 1, ('h', 'e'): 10, ('e', '</w>'): 13, ('l', 'e'): 2, ('e', 'a'): 1, ('a', 'd'): 1, ('d', 'i'): 2, ('i', 'n'): 4, ('n', 'g'): 3, ('g', '</w>'): 2, ('m', 'e'): 3, ('e', 'm'): 1, ('m', 'b'): 1, ('b', 'e'): 1, ('e', 'r'): 5, ('r', 's'): 2, ('s', '</w>'): 3, ('o', 'f'): 4, ('f', '</w>'): 4, ('t', 'h'): 8, ('C', 'l'): 1, ('l', 'u'): 1, ('u', 'b'): 1, ('b', ','): 1, (',', '</w>'): 7, ('n', 'a'): 2, ('a', 'm'): 1, ('e', 'l'): 2, ('l', 'y'): 1, ('y', '</w>'): 4, ('P', 'r'): 1, ('r', 'e'): 3, ('e', 's'): 1, ('s', 'i'): 2, ('i', 'd'): 1, ('d', 'e'): 2, ('e', 'n'): 2, ('n', 't'): 2, ('t', '</w>'): 1, ('B', 'a'): 1, ('a', 'r'): 3, ('r', 'b'): 1, ('b', 'i'): 1, ('i', 'c'): 1, ('c', 'a'): 2, ('a', 'n'): 9, ('n', ','): 4, ('S', 'e'): 1, ('e', 'c'): 2, ('c', 'r'): 1, ('e', 't'): 3, ('t', 'a'): 1, ('r', 'y'): 1, ('M', 'a'): 2, ('s', 't'): 2, ('t', 'o'): 3, ('o', 'n'): 4, ('a', 'j'): 1, ('j', 'o'): 1, ('o', 'r'): 3, ('r', '</w>'): 1, ('E', 'l'): 1, ('l', 'p'): 1, (

In [4]:
def merge(pair,v_in):
  v_out = {}
  for word in v_in:
    w_out = re.sub(' '.join(pair), ''.join(pair),word)
    v_out[w_out] = v_in[word]
  return v_out

In [5]:
def extract_tokens(vocab):
    tokens = collections.defaultdict(int)
    vocab_tokens = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
        vocab_tokens[''.join(word_tokens)] = word_tokens
    return tokens, vocab_tokens

# Training

In [None]:
n_merges = 10
print("Before Merging")
tokens, vocab_tokens = extract_tokens(vocab)
print('All tokens: {}'.format(tokens.keys()))
print('Number of tokens: {}'.format(len(tokens.keys())))
print(5*'==========')
for i in range(n_merges):
  pairs = get_pair_stats(vocab)

  if not pairs:
    break

  best_pair = max(pairs, key = pairs.get)
  print('Best Pair : {}, count : {}'.format(best_pair,pairs[best_pair]))
  vocab = merge(best_pair,vocab)
  tokens, vocab_tokens = extract_tokens(vocab)
  print('All tokens: {}'.format(tokens.keys()))
  print('Number of tokens: {}'.format(len(tokens.keys())))
  print('==========')


In [7]:
def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

In [8]:
sorted_tokens_tuple = sorted(tokens.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

In [15]:
def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    
    if string == '':
        return ' '
    if sorted_tokens == []:
        return unknown_token

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token, string)]
        matched_position = matched_positions[0]
        left_substring = tokenize_word(string[0:matched_position[0]], sorted_tokens, unknown_token='</u>')
        right_substring = tokenize_word(string[matched_position[1]:len(string)], sorted_tokens, unknown_token='</u>')
        break
    return left_substring + ' ' + string[matched_position[0]:matched_position[1]] + right_substring

In [None]:
"""def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        #token_reg = re.escape(token.replace('.', '[.]'))
        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token, string)]
        #print(matched_positions)
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]
        print(substring_end_positions)
        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    return string_tokens"""

In [16]:
tokenize_word("the", sorted_tokens, unknown_token='</u>')

'  the</w> '