In [169]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [157]:
def clean_text(text):
    """Normalise raw text to the alphabet ``a``-``z`` plus ``_``.

    The text is lower-cased, every space character becomes an underscore,
    and any character that is then not a lowercase ASCII letter or an
    underscore is dropped.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The normalised text.
    """
    lowered = text.lower()
    # Splitting on a single space and re-joining with '_' swaps each space
    # one-for-one, so runs of spaces become runs of underscores.
    underscored = '_'.join(lowered.split(' '))
    return re.sub(r'[^a-z_]', '', underscored)

In [158]:
# Read the whole corpus; the file handle is only needed for the read itself.
with open(os.path.join('text', 'text_sh.txt'), 'r', encoding='cp1251') as file:
    raw_text = file.read()

# Line breaks are replaced by a space (not by an empty string) so that the last
# word of one line and the first word of the next stay separated; clean_text
# then turns that space into '_' like any other word boundary.
content = clean_text(raw_text.replace('\n', ' '))

In [116]:
# Number of distinct position ids per symbol; character positions wrap modulo this value.
PERIOD_OF_IDS = 10
# Number of consecutive characters combined into one sliding window.
WINDOW_SIZE = 6
# Length of every binary code vector.
CODE_SIZE = 256
# Number of bits set to 1 in each individual symbol code.
CODE_ACTIVE_BITS = 8

In [117]:
# Build VOCAB: one random sparse binary code per (symbol, position id) pair.
# A dedicated, seeded generator makes the codes identical after every kernel
# restart; with the unseeded global RNG the vocabulary changed on each run.
VOCAB_SEED = 42
vocab_rng = np.random.default_rng(VOCAB_SEED)

VOCAB = dict()
syms = [chr(x) for x in range(ord('a'), ord('z') + 1)]
syms += ['_']
codes = set()  # string form of every code issued so far, used to reject duplicates

for sym in syms:
    for p in range(PERIOD_OF_IDS):
        vocab_item_id = f'{sym}:{p}'
        # Template with exactly CODE_ACTIVE_BITS ones; shuffling it places them randomly.
        vocab_item_code = np.zeros(CODE_SIZE, dtype=int)
        vocab_item_code[:CODE_ACTIVE_BITS] = 1

        # Re-shuffle until the code differs from all previously issued codes.
        while True:
            vocab_item_code = vocab_rng.permutation(vocab_item_code)
            assert (vocab_item_code == 1).sum() == CODE_ACTIVE_BITS
            vocab_item_code_str = ''.join(map(str, vocab_item_code))

            if vocab_item_code_str not in codes:
                codes.add(vocab_item_code_str)
                break

        VOCAB[vocab_item_id] = vocab_item_code

# Every (symbol, position id) pair must have received its own unique code.
assert len(codes) == len(syms) * PERIOD_OF_IDS, (len(codes), len(syms))

In [153]:
def encode(content, offsets):
    """Encode every full sliding window of ``content`` as a binary code.

    For each offset, the character at index ``i`` is looked up in ``VOCAB``
    under the key ``'<char>:<(i + offset) % PERIOD_OF_IDS>'``. The codes of the
    ``WINDOW_SIZE`` characters in a window are summed element-wise and capped
    at 1.

    Parameters
    ----------
    content : str
        Text to encode; each character/position key must exist in ``VOCAB``.
    offsets : iterable of int
        Position shifts. Each offset ``k`` contributes the columns
        ``offset_<k>_text`` (concatenated keys of the window) and
        ``offset_<k>_code`` (the capped sum of the window's codes).

    Returns
    -------
    pandas.DataFrame
        One row per full window, with a ``text`` column holding the raw window
        characters followed by the two columns per offset.

    Raises
    ------
    KeyError
        If a character/position key is not present in ``VOCAB``.
    """
    frame = pd.DataFrame(columns=['text'])

    for offset in offsets:
        # Resolve the key and the code of every character once, up front.
        keys = [
            f'{ch}:{(i + offset) % PERIOD_OF_IDS}'
            for i, ch in enumerate(content)
        ]
        symbol_codes = [VOCAB[key] for key in keys]

        raw_windows = []
        key_windows = []
        window_codes = []

        # `stop` is the exclusive end of the window; windows that would start
        # before the beginning of the text are not full yet and are skipped.
        for stop in range(1, len(content) + 1):
            start = stop - WINDOW_SIZE
            if start < 0:
                continue

            # Element-wise sum of the window's codes, capped at 1.
            summed = np.sum(symbol_codes[start:stop], axis=0)
            window_codes.append(np.minimum(summed, 1))

            raw_windows.append(''.join(content[start:stop]))
            key_windows.append(''.join(keys[start:stop]))

        frame['text'] = raw_windows
        frame[f'offset_{offset}_text'] = key_windows
        frame[f'offset_{offset}_code'] = window_codes

    return frame

In [163]:
# Quick check on a single 6-character window: with offset 8 the position ids
# wrap around PERIOD_OF_IDS (…, 8, 9, 0, 1, …).
encode(clean_text('hello_'), [0, 8])

Unnamed: 0,text,offset_0_text,offset_0_code,offset_8_text,offset_8_code
0,hello_,h:0e:1l:2l:3o:4_:5,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...",h:8e:9l:0l:1o:2_:3,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [170]:
# Encode the whole cleaned corpus at position offsets 0 and 1 and preview the first rows.
df = encode(content, [0, 1])
df.head()

Unnamed: 0,text,offset_0_text,offset_0_code,offset_1_text,offset_1_code
0,the_so,t:0h:1e:2_:3s:4o:5,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, ...",t:1h:2e:3_:4s:5o:6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,he_son,h:1e:2_:3s:4o:5n:6,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...",h:2e:3_:4s:5o:6n:7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,e_sonn,e:2_:3s:4o:5n:6n:7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",e:3_:4s:5o:6n:7n:8,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,_sonne,_:3s:4o:5n:6n:7e:8,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",_:4s:5o:6n:7n:8e:9,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,sonnet,s:4o:5n:6n:7e:8t:9,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, ...",s:5o:6n:7n:8e:9t:0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ..."


In [172]:
# Pin the shuffle seed so the train/test partition is the same on every run.
# NOTE(review): consecutive rows are windows that overlap by all but one
# character, so a random split places near-duplicate windows on both sides —
# confirm this is acceptable for the intended evaluation.
df_train, df_test = train_test_split(df, random_state=42)