In [1]:
import os
import collections
import sys
import re
import tqdm


In [2]:
dir_of_dirs = 'results'
results_dir = 'outputs/results.step2' 
tokens_file = 'tokens_set.txt'
chars_file = 'chars_set.txt'

In [4]:
tokens_file = os.path.join(results_dir, tokens_file)
chars_file = os.path.join(results_dir, chars_file)
os.makedirs(results_dir, exist_ok = True)

In [5]:
def counters_merge(counters):
    if len(counters) < 3:
        share_counter = collections.Counter()
        for counter in counters:
            share_counter += counter
        return share_counter
    else:
        split_point = len(counters)//2
        l_c = counters_merge(counters[:split_point])
        r_c = counters_merge(counters[split_point:])
        return l_c + r_c
    
def count_chars(text_lines):
    counters = []
    for text in text_lines:
        counters.append(collections.Counter(text))
    return counters_merge(counters)

In [6]:
def get_chars_counter_from_files(txt_files):
    texts = [open(file).read().split('\n') for file in txt_files]
    lines = []
    for txt_lines in texts:
        lines.extend([''.join(line.split()) for line in txt_lines if line!='<NEXT_PAPER>' and line])

    return count_chars(lines)

def map_text_files(from_txt_files, to_txt_files, mapper):
    texts = [open(file).read().split('\n') for file in from_txt_files]
    mapped_texts = []
    for txt_lines in texts:
        mapped_texts.append([ ''.join(map(mapper,line)) for line in txt_lines if line!='<NEXT_PAPER>' and line])
    
    for lines, file in zip(mapped_texts,to_txt_files):
        open(file, 'wt').write('\n'.join(lines)) 

In [8]:
dirs = [os.path.join(dir_of_dirs,file_path) for file_path in  os.listdir(dir_of_dirs)]
counters = []
for data_dir in tqdm.tqdm(dirs):
    txt_files = [os.path.join(data_dir,file_path) for file_path in  os.listdir(data_dir)]
    counters.append(get_chars_counter_from_files(txt_files))
sum_counter = sum(counters, collections.Counter())

100%|██████████| 258/258 [16:07<00:00,  3.75s/it]


In [9]:
chars = [char for char, c in sum_counter.most_common(255)] + ['😟'] #U+1F61F
freq_chars = [' '] + chars

In [10]:
mapper = lambda char: char if char in freq_chars else freq_chars[-1]

In [11]:
dirs = [os.path.join(dir_of_dirs,file_path) for file_path in  os.listdir(dir_of_dirs)]
file_index = 1
for data_dir in tqdm.tqdm(dirs):
    from_txt_files = [os.path.join(data_dir,file_path) for file_path in  os.listdir(data_dir)]
    to_txt_files = [os.path.join(results_dir,'prts_{}'.format(str(_file_index).zfill(12))) for _file_index, file_path in  enumerate(os.listdir(data_dir),file_index)]
    file_index += len(to_txt_files)
    map_text_files(from_txt_files, to_txt_files, mapper)


100%|██████████| 258/258 [11:33<00:00,  2.69s/it]


In [12]:
def chunked_gen(l, chunk_size):
    for i in range(0, len(l), chunk_size):
        yield l[i:i + chunk_size]

In [13]:
def get_token_counter_from_files(txt_files):
    texts = [open(file).read().split('\n') for file in txt_files]
    lines = []
    for txt_lines in texts:
        lines.extend([line.split() for line in txt_lines if line])

    return count_chars(lines)

In [14]:
txt_files = [os.path.join(results_dir,file_path) for file_path in  os.listdir(results_dir)]

In [15]:
chunk_size = 500
token_counters = []
for files in tqdm.tqdm(chunked_gen(txt_files, chunk_size), total = len(txt_files)//chunk_size + 1):
    token_counters.append(get_token_counter_from_files(files))
sum_token_counter = sum(token_counters, collections.Counter())

100%|██████████| 53/53 [26:52<00:00, 30.42s/it]


In [16]:
tokens = ['<S>', '</S>', '<UNK>'] + [token for token, c in sum_token_counter.most_common()]
open(tokens_file, 'wt').write('\n'.join(tokens)) 
open(chars_file, 'wt').write('\n'.join(chars)) 


511

In [17]:
len(chars)

256