In [233]:
import csv
import pandas as pd
import os
import natsort

from collections import defaultdict

In [234]:
def parse_csv_to_dict(file_path):
    """Load one word-count CSV and return it as a ``{word: count}`` mapping.

    Args:
        file_path: Path to a CSV file that has a ``word`` column and a
            `` count`` column (the header name carries a leading space).

    Returns:
        dict: Each value of the ``word`` column mapped to the value in the
        `` count`` column of the same row.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    frame = pd.read_csv(file_path)
    # Pick the single column of interest before converting, so only that
    # column is turned into a dictionary.
    counts_by_word = frame.set_index('word')[' count']
    return counts_by_word.to_dict()

In [235]:
csv_dict_path = "../../cyrillic_dict3/"

In [236]:
# Collect every regular *.csv file in csv_dict_path, naturally sorted so that
# chunk_10 comes after chunk_9. Paths are built with os.path.join (the same
# call used for the isfile check) so the result does not depend on
# csv_dict_path ending with a path separator.
csv_files = natsort.natsorted([
    os.path.join(csv_dict_path, f)
    for f in os.listdir(csv_dict_path)
    if f.endswith('.csv') and os.path.isfile(os.path.join(csv_dict_path, f))
])

In [237]:
csv_files

['../../cyrillic_dict3/chunk_0.csv',
 '../../cyrillic_dict3/chunk_1.csv',
 '../../cyrillic_dict3/chunk_2.csv',
 '../../cyrillic_dict3/chunk_3.csv',
 '../../cyrillic_dict3/chunk_4.csv',
 '../../cyrillic_dict3/chunk_5.csv',
 '../../cyrillic_dict3/chunk_6.csv',
 '../../cyrillic_dict3/chunk_7.csv',
 '../../cyrillic_dict3/chunk_8.csv',
 '../../cyrillic_dict3/chunk_9.csv',
 '../../cyrillic_dict3/chunk_10.csv',
 '../../cyrillic_dict3/chunk_11.csv',
 '../../cyrillic_dict3/chunk_12.csv',
 '../../cyrillic_dict3/chunk_13.csv',
 '../../cyrillic_dict3/chunk_14.csv',
 '../../cyrillic_dict3/chunk_15.csv',
 '../../cyrillic_dict3/chunk_16.csv',
 '../../cyrillic_dict3/chunk_17.csv',
 '../../cyrillic_dict3/chunk_18.csv',
 '../../cyrillic_dict3/chunk_19.csv',
 '../../cyrillic_dict3/chunk_20.csv',
 '../../cyrillic_dict3/chunk_21.csv',
 '../../cyrillic_dict3/chunk_22.csv',
 '../../cyrillic_dict3/chunk_23.csv',
 '../../cyrillic_dict3/chunk_24.csv',
 '../../cyrillic_dict3/chunk_25.csv',
 '../../cyrillic_dict3

In [238]:
# Corpus-wide word -> count table; unseen words start at 0.
# `int` is used as the default factory (same as pair_freq in
# compute_pair_freq); unlike a lambda it keeps the table picklable.
word_freq = defaultdict(int)

In [239]:
# Fold the counts of every chunk file into the corpus-wide word_freq table.
for csv_file in csv_files:
    word_dict = parse_csv_to_dict(csv_file)
    for word, count in word_dict.items():
        word_freq[word] += count

In [240]:
len(word_freq)

1799827

In [241]:
alphabet = set()

In [242]:
# Gather every distinct character that occurs in any word.
# set.update adds each character of the string and ignores ones already seen.
for word in word_freq:
    alphabet.update(word)

In [243]:
print(alphabet)

{'л', 'ф', 'ц', 'ю', 'ё', 'п', 'ч', 'ъ', 'е', 'и', 'і', 'н', 'ë', 'к', 'г', 'м', 'ы', 'в', 'ш', 'я', 'ь', 'з', 'й', 'с', 'р', 'б', 'ғ', 'а', 'ж', 'ў', 'т', 'э', 'д', 'i', 'ҳ', 'х', 'о', 'қ', 'щ', 'у'}


In [244]:
# Turn the character set into a sorted list; sorted() accepts any iterable.
alphabet = sorted(alphabet)

In [245]:
len(alphabet)

40

In [246]:
# Initial segmentation: every word starts out split into single characters.
splits = {word: list(word) for word in word_freq}

In [247]:
splits[list(splits.keys())[0]]

['ë', 'а', 'ж']

In [218]:
type(splits['ëаж'])

list

In [251]:
def compute_pair_freq(splits, freqs=None):
    """Count how often each pair of adjacent tokens occurs, weighted by word count.

    Args:
        splits: Mapping from word to its current list of tokens.
        freqs: Optional mapping from word to its count. When omitted, the
            notebook-level ``word_freq`` table is used, which keeps existing
            ``compute_pair_freq(splits)`` calls working unchanged.

    Returns:
        defaultdict(int): ``(left_token, right_token)`` -> summed count of all
        words in which the two tokens appear next to each other. A pair that
        occurs several times in one word is counted once per occurrence.

    Raises:
        KeyError: If a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        # Backward-compatible fallback to the table built earlier in the notebook.
        freqs = word_freq

    pair_freq = defaultdict(int)
    for word, freq in freqs.items():
        split = splits[word]
        # zip yields nothing for splits shorter than two tokens, so
        # single-token words contribute no pairs.
        for pair in zip(split, split[1:]):
            pair_freq[pair] += freq

    return pair_freq


In [252]:
pair_freq = compute_pair_freq(splits)

In [253]:
pair_freq

defaultdict(int,
            {('ë', 'а'): 4,
             ('а', 'ж'): 752774,
             ('ë', 'в'): 3,
             ('в', 'у'): 130631,
             ('у', 'з'): 770462,
             ('з', 'л'): 642790,
             ('л', 'и'): 8871264,
             ('и', 'к'): 3649386,
             ('к', 'н'): 278016,
             ('н', 'и'): 13957849,
             ('ë', 'д'): 3,
             ('д', 'г'): 72603,
             ('г', 'о'): 355529,
             ('о', 'р'): 4093281,
             ('р', 'л'): 2028981,
             ('к', 'л'): 1107867,
             ('л', 'а'): 19789823,
             ('а', 'р'): 18673189,
             ('р', 'г'): 1828576,
             ('г', 'а'): 11867031,
             ('ë', 'з'): 19,
             ('з', 'г'): 407919,
             ('г', 'и'): 4127069,
             ('з', 'д'): 353815,
             ('д', 'и'): 6181266,
             ('и', 'р'): 5050205,
             ('р', 'и'): 11561310,
             ('и', 'б'): 2016004,
             ('з', 'и'): 2101093,
             ('и', 'д'): 

In [254]:
def get_best_pair_freq(pair_freq):
    """Return the most frequent pair together with its frequency.

    Args:
        pair_freq: Mapping from a token pair to its frequency.

    Returns:
        tuple: ``(pair, frequency)`` for the entry with the highest frequency.
        On ties the entry that comes first in iteration order wins. For an
        empty mapping the result is ``("", None)``.
    """
    top_pair, top_freq = max(
        pair_freq.items(),
        key=lambda entry: entry[1],
        default=("", None),
    )
    return top_pair, top_freq

In [255]:
get_best_pair_freq(pair_freq)

(('л', 'а'), 19789823)

In [256]:
def merge_pair(a, b, splits):
    """Replace every adjacent occurrence of tokens ``a``, ``b`` with ``a + b``.

    The words to process are taken from ``splits`` itself, so the function
    does not depend on any notebook-level variable.

    Args:
        a: Left token of the pair to merge.
        b: Right token of the pair to merge.
        splits: Mapping from word to its current list of tokens. It is
            updated in place.

    Returns:
        dict: The same ``splits`` object, with the merged token lists.
    """
    merged = a + b
    # Only values of existing keys are reassigned, so the dict size does not
    # change while it is being iterated.
    for word, split in splits.items():
        if len(split) < 2:
            continue

        # Build the new token list in one left-to-right pass instead of
        # re-slicing the list after every merge.
        new_split = []
        i = 0
        last = len(split) - 1
        while i <= last:
            if i < last and split[i] == a and split[i + 1] == b:
                new_split.append(merged)
                i += 2
            else:
                new_split.append(split[i])
                i += 1
        splits[word] = new_split
    return splits

In [257]:
splits = merge_pair('л', 'а' , splits)

In [258]:
alphabet.append('ла')

In [259]:
i = 0

In [260]:
print(len(pair_freq))

1430


In [261]:
part = list(splits.keys())[:100]

In [263]:
for p in part: print(p, splits[p])

ëаж ['ë', 'а', 'ж']
ëвузликни ['ë', 'в', 'у', 'з', 'л', 'и', 'к', 'н', 'и']
ëдгор ['ë', 'д', 'г', 'о', 'р']
ëдгорликларга ['ë', 'д', 'г', 'о', 'р', 'л', 'и', 'к', 'ла', 'р', 'г', 'а']
ëзги ['ë', 'з', 'г', 'и']
ëзди ['ë', 'з', 'д', 'и']
ëздириб ['ë', 'з', 'д', 'и', 'р', 'и', 'б']
ëзида ['ë', 'з', 'и', 'д', 'а']
ëзилади ['ë', 'з', 'и', 'ла', 'д', 'и']
ëзилган ['ë', 'з', 'и', 'л', 'г', 'а', 'н']
ëзиши ['ë', 'з', 'и', 'ш', 'и']
ëзув ['ë', 'з', 'у', 'в']
ëзувни ['ë', 'з', 'у', 'в', 'н', 'и']
ëзувчи ['ë', 'з', 'у', 'в', 'ч', 'и']
ëзувчининг ['ë', 'з', 'у', 'в', 'ч', 'и', 'н', 'и', 'н', 'г']
ëки ['ë', 'к', 'и']
ëмон ['ë', 'м', 'о', 'н']
ëмғир ['ë', 'м', 'ғ', 'и', 'р']
ëнг ['ë', 'н', 'г']
ëнига ['ë', 'н', 'и', 'г', 'а']
ëнидаги ['ë', 'н', 'и', 'д', 'а', 'г', 'и']
ëнилғи ['ë', 'н', 'и', 'л', 'ғ', 'и']
ëнғин ['ë', 'н', 'ғ', 'и', 'н']
ëнғинга ['ë', 'н', 'ғ', 'и', 'н', 'г', 'а']
ëнғинларни ['ë', 'н', 'ғ', 'и', 'н', 'ла', 'р', 'н', 'и']
ëпиб ['ë', 'п', 'и', 'б']
ëпилишини ['ë', 'п', 'и', 'л', 'и', 

In [None]:
# Greedy most-frequent-pair merge loop (BPE-style): on each pass, recount the
# adjacent token pairs, merge the most frequent one everywhere and record the
# new token in `alphabet`. `i` (initialised in an earlier cell) counts the
# merges performed by this loop.
MAX_MERGES = 10_000  # upper bound on the number of merges learned here

while i < MAX_MERGES:
    pair_freq = compute_pair_freq(splits)
    maxpair = get_best_pair_freq(pair_freq)
    best_pair, max_pair_freq = maxpair
    if max_pair_freq is None:
        # No adjacent pairs are left (every word is a single token); without
        # this guard, unpacking the empty best_pair below raises ValueError.
        break
    i += 1
    print(maxpair)
    print('best_pair:', best_pair, len(best_pair), 'mp:', max_pair_freq)
    first, second = best_pair
    splits = merge_pair(first, second, splits)
    alphabet.append(first + second)

(('н', 'и'), 13957849)
best_pair: ('н', 'и') 2 mp: 13957849
(('д', 'а'), 12014646)
best_pair: ('д', 'а') 2 mp: 12014646
(('г', 'а'), 11867031)
best_pair: ('г', 'а') 2 mp: 11867031
(('ла', 'р'), 11694160)
best_pair: ('ла', 'р') 2 mp: 11694160
(('л', 'и'), 8871264)
best_pair: ('л', 'и') 2 mp: 8871264
(('т', 'и'), 7588350)
best_pair: ('т', 'и') 2 mp: 7588350
(('м', 'а'), 7203804)
best_pair: ('м', 'а') 2 mp: 7203804
(('р', 'и'), 6695886)
best_pair: ('р', 'и') 2 mp: 6695886
(('д', 'и'), 6181266)
best_pair: ('д', 'и') 2 mp: 6181266
(('т', 'а'), 5981150)
best_pair: ('т', 'а') 2 mp: 5981150
(('с', 'и'), 5744464)
best_pair: ('с', 'и') 2 mp: 5744464
(('н', 'г'), 5119935)
best_pair: ('н', 'г') 2 mp: 5119935
(('р', 'а'), 5047308)
best_pair: ('р', 'а') 2 mp: 5047308
(('лар', 'и'), 4865424)
best_pair: ('лар', 'и') 2 mp: 4865424
(('га', 'н'), 3902034)
best_pair: ('га', 'н') 2 mp: 3902034
(('ни', 'нг'), 3809034)
best_pair: ('ни', 'нг') 2 mp: 3809034
(('н', 'а'), 3594364)
best_pair: ('н', 'а') 2 mp: 35

In [None]:
pair_freq.items()

In [None]:
# pair_freq is keyed by (left, right) token tuples, not by merged strings.
# Indexing the defaultdict with 'лар' would silently insert a bogus
# 'лар': 0 entry, so look up the pair that forms 'лар' with .get instead.
pair_freq.get(('ла', 'р'), 0)