In [2]:
import re 
from collections import defaultdict 

In [4]:
# Sample English prose used as the demo corpus: further down it is split on '.'
# and the pieces are passed to byte_pair_encoding.
text = '''
In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that 
means comfort. 
It had a perfectly round door like a porthole, painted green, with a shiny yellow brass knob in the exact middle. The door opened on to a tube-shaped hall like a tunnel: a very comfortable tunnel without smoke, with panelled walls, and floors tiled and carpeted, provided with polished chairs, and lots and lots of pegs for hats and coats - the hobbit was fond of visitors. The tunnel wound on and on, going fairly but not quite straight into the side of the hill - The Hill, as all the people for many miles round called it - and many little round doors opened out of it, first on one side and then on another. 
No going upstairs for the hobbit: bedrooms, bathrooms, cellars, pantries (lots of these), wardrobes (he had whole 
rooms devoted to clothes), kitchens, dining-rooms, all were on the same floor, and indeed on the same passage. 
The best rooms were all on the lefthand side (going in), for these were the only ones to have windows, deep-set round windows looking over his garden, and meadows beyond, sloping down to the river. 
'''

In [5]:
import re 
from collections import defaultdict 

def get_stats(vocab):
	"""
	Count adjacent symbol pairs across a vocabulary.

	Each key of ``vocab`` is a string of whitespace-separated symbols and each
	value is that entry's frequency. Every adjacent (left, right) pair of
	symbols inside an entry adds the entry's frequency to that pair's total.

	Returns a ``defaultdict(int)`` keyed by ``(left, right)`` tuples, in order
	of first appearance.
	"""
	pair_counts = defaultdict(int)
	for entry, count in vocab.items():
		tokens = entry.split()
		# zip against the one-shifted list walks every neighbouring pair once.
		for left, right in zip(tokens, tokens[1:]):
			pair_counts[left, right] += count
	return pair_counts

def merge_vocab(pair, v_in):
	"""
	Fuse every occurrence of a symbol pair inside a vocabulary.

	Args:
		pair: tuple of symbols (normally two) that should be joined into one.
		v_in: dict mapping space-separated symbol strings to frequency counts.

	Returns:
		A new dict in which each key has had every whole-symbol occurrence of
		the pair replaced by the concatenated symbol. If several input keys
		become identical after the merge, their frequencies are summed instead
		of the last one silently overwriting the others.
	"""
	merged = ''.join(pair)
	bigram = re.escape(' '.join(pair))
	# The lookarounds require whitespace (or string edge) on both sides, so only
	# whole symbols match, never a suffix/prefix of a longer symbol.
	pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
	v_out = {}
	for word, freq in v_in.items():
		# A callable replacement is inserted verbatim; a plain string would be
		# parsed as a template, corrupting symbols that contain a backslash.
		w_out = pattern.sub(lambda _match: merged, word)
		v_out[w_out] = v_out.get(w_out, 0) + freq
	return v_out

def get_vocab(data):
	"""
	Build the initial character-level vocabulary from an iterable of strings.

	Every whitespace-delimited word is rewritten as its characters separated
	by single spaces, followed by the end-of-word marker ``</w>``; the result
	maps each such string to the number of times the word occurred.

	Returns a ``defaultdict(int)`` in order of first appearance.
	"""
	word_counts = defaultdict(int)
	tokens = (token for line in data for token in line.split())
	for token in tokens:
		spaced = ' '.join(token)
		word_counts[spaced + ' </w>'] += 1
	return word_counts

def byte_pair_encoding(data, n):
	"""
	Run up to ``n`` byte-pair-encoding merge steps over ``data``.

	Args:
		data: iterable of strings; its words form the initial vocabulary.
		n: maximum number of merge iterations to perform.

	Returns:
		The vocabulary after merging: a dict mapping each word, written as
		space-separated symbols ending in ``</w>``, to its frequency. (The
		merged pairs themselves are not returned.)

	Each iteration merges the most frequent adjacent symbol pair; ties go to
	the pair encountered first. Merging stops early once no adjacent pairs
	remain (every word is a single symbol, or ``data`` has no words), rather
	than failing on an empty pair table.
	"""
	vocab = get_vocab(data)
	for _ in range(n):
		pairs = get_stats(vocab)
		if not pairs:
			# Nothing left to merge; max() on an empty mapping would raise.
			break
		best = max(pairs, key=pairs.get)
		vocab = merge_vocab(best, vocab)
	return vocab

# Split the sample text on '.'; each resulting chunk is one input string.
data = text.split('.') 

# Number of merge iterations requested.
n = 230
# NOTE: despite its name, bpe_pairs holds the vocabulary dict returned by
# byte_pair_encoding (merged word -> frequency), not a list of merged pairs.
bpe_pairs = byte_pair_encoding(data, n) 
# Bare expression so the notebook displays the resulting vocabulary.
bpe_pairs


{'In</w>': 1,
 'a</w>': 11,
 'hole</w>': 2,
 'in</w>': 3,
 'the</w>': 13,
 'ground</w>': 1,
 'there</w>': 1,
 'lived</w>': 1,
 'hobbit</w>': 2,
 'Not</w>': 1,
 'nasty,</w>': 1,
 'dirty,</w>': 1,
 'wet</w>': 1,
 'hole,</w>': 1,
 'filled</w>': 1,
 'with</w>': 5,
 'ends</w>': 1,
 'of</w>': 6,
 'worms</w>': 1,
 'and</w>': 12,
 'an</w>': 1,
 'oozy</w>': 1,
 'smell,</w>': 1,
 'nor</w>': 1,
 'yet</w>': 1,
 'dry,</w>': 1,
 'bare,</w>': 1,
 'sandy</w>': 1,
 'nothing</w>': 1,
 'it</w>': 3,
 'to</w>': 6,
 'sit</w>': 1,
 'down</w>': 2,
 'on</w>': 8,
 'or</w>': 1,
 'eat:</w>': 1,
 'was</w>': 2,
 'hobbit-hole,</w>': 1,
 'that</w>': 1,
 'means</w>': 1,
 'comfort</w>': 1,
 'It</w>': 1,
 'had</w>': 2,
 'perfectly</w>': 1,
 'round</w>': 4,
 'door</w>': 2,
 'like</w>': 2,
 'porthole,</w>': 1,
 'painted</w>': 1,
 'green,</w>': 1,
 'shiny</w>': 1,
 'yellow</w>': 1,
 'brass</w>': 1,
 'knob</w>': 1,
 'exact</w>': 1,
 'middle</w>': 1,
 'The</w>': 4,
 'opened</w>': 2,
 'tube-shaped</w>': 1,
 'hall</w>': 1,
 't

In [7]:
pip install pyenchant

Collecting pyenchant
  Downloading pyenchant-3.2.2-py3-none-win_amd64.whl (11.9 MB)
     ---------------------------------------- 11.9/11.9 MB 1.8 MB/s eta 0:00:00
Installing collected packages: pyenchant
Successfully installed pyenchant-3.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import enchant.utils

def report_levenshtein(first, second):
	"""
	Print and return the Levenshtein distance between two strings.

	Uses enchant.utils.levenshtein for the computation; the message is printed
	in Russian, identically for every pair.
	"""
	distance = enchant.utils.levenshtein(first, second)
	print("Расстояние Левенштейна между "+first+" и "+second+" равно "+str(distance))
	return distance

# NOTE(review): "программированние" has a doubled "н" and looks like a typo;
# it is left unchanged because the computed distance depends on this spelling.
string1 = "программированние"
string2 = "лингвистика"
l_dist = report_levenshtein(string1, string2)

string1 = "levenshtein"
string2 = "einstein"
l_dist = report_levenshtein(string1, string2)

Расстояние Левенштейна между программированние и лингвистика равно 14
Расстояние Левенштейна между levenshtein и einstein равно 4


1 - [A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}

2 - ^(([a-zA-Z]:)|(\/))?(\/[^/\0]+)*\/((\w+)\.(\w+))$

3 - #\[\[([^:]+):\s*([^"#]+)\s*"([^"]+)"\s*:\s*([^:]+):([^]]+)\]\]