In [28]:
!pip install textdistance



In [29]:
!pip install hazm



In [30]:
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter
import os
from hazm import *
from tqdm import tqdm

In [33]:
DATA_PATH = "/content/drive/MyDrive/yektanet_data"

In [34]:
txt_path = os.path.join(DATA_PATH,"wiki.txt")

In [35]:
# Load the corpus with one list entry per line (trailing newlines are kept).
# The encoding is pinned so the Persian text decodes the same way on every
# machine instead of following the platform's locale default.
# NOTE(review): assumes wiki.txt is UTF-8 encoded - confirm against the data.
with open(txt_path, 'r', encoding='utf-8') as txt_file:
  wiki_list = list(txt_file)

In [36]:
word_tokenize("سلام کیان زیبا حالت چطوره؟")

['سلام', 'کیان', 'زیبا', 'حالت', 'چطوره', '؟']

In [37]:
normalizer = Normalizer()

In [38]:
# Flatten the corpus into a single token stream: each line is tokenized on its
# own and the resulting tokens are concatenated in corpus order.
words = [token for wiki_line in wiki_list for token in word_tokenize(wiki_line)]

In [39]:
words_freq = Counter(words)

In [40]:
len(words_freq)

221120

In [45]:
words_freq['!']

0

In [46]:
def get_similar_words(word):
  """Placeholder that is not implemented yet: ignores `word` and returns None."""
  pass

In [56]:
def levenshtein(word1, word2):
  """Weighted edit distance between two words.

  Insertions and deletions cost 1 each; the price of substituting one
  character for another is delegated to `replace_cost`. Only two rows of the
  dynamic-programming table are kept in memory at any time.

  Args:
    word1: first sequence of characters.
    word2: second sequence of characters.

  Returns:
    The accumulated cost of the cheapest edit script (an int, or a float when
    `replace_cost` contributes fractional costs).
  """
  ins_cost = 1
  del_cost = 1

  # Put the longer word on the outer loop so each row is only as wide as the
  # shorter word.
  if len(word1) < len(word2):
    longer, shorter = word2, word1
  else:
    longer, shorter = word1, word2

  # Nothing to align against: every character of the longer word is one edit.
  if not shorter:
    return len(longer)

  # Row 0: cost of building each prefix of `shorter` from an empty string.
  prev_costs = list(range(len(shorter) + 1))
  for row_no, ch_long in enumerate(longer, start=1):
    row_costs = [row_no]
    for col, ch_short in enumerate(shorter):
      # Rows carry one more entry than `shorter` has characters, hence the
      # `col + 1` when looking straight up in the previous row.
      options = (
        prev_costs[col + 1] + ins_cost,
        row_costs[col] + del_cost,
        prev_costs[col] + replace_cost(ch_long, ch_short),
      )
      row_costs.append(min(options))
    prev_costs = row_costs

  return prev_costs[-1]

In [48]:
# Letter grid used to measure how far apart two keys are: 3 rows of 12 cells.
# '-' is padding for cells that hold no letter, so every row has equal width.
persian_keyboard_list = list('ضصثقفغعهخحجچشسیبلاتنمکگ---ظطزرذدپو--')
persian_keyboard = np.reshape(persian_keyboard_list, (3, 12))
persian_keyboard

array([['ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ', 'ح', 'ج', 'چ'],
       ['ش', 'س', 'ی', 'ب', 'ل', 'ا', 'ت', 'ن', 'م', 'ک', 'گ', '-'],
       ['-', '-', 'ظ', 'ط', 'ز', 'ر', 'ذ', 'د', 'پ', 'و', '-', '-']],
      dtype='<U1')

In [64]:
# Candidate vocabulary for suggestions: only tokens seen more than
# MIN_WORD_FREQ times in the corpus are kept.
MIN_WORD_FREQ = 20
filtered_words_freq = {word: count for word, count in words_freq.items() if count > MIN_WORD_FREQ}

In [49]:
np.where(persian_keyboard=='س')

(array([1]), array([1]))

In [51]:
def replace_cost(ch1,ch2):
  """Cost of substituting `ch1` with `ch2`, based on how far apart their keys are.

  Both characters are first folded onto a letter of the `persian_keyboard`
  grid where an alias exists, then compared:

  * identical characters (after folding) cost 0 - this now also holds for
    characters that are not on the grid (e.g. '_' or a zero-width non-joiner),
    which previously cost 2 against themselves and gave a word a non-zero
    distance to itself;
  * if either character is not a letter of the grid the cost is 2. The '-'
    padding cells count as "not a letter"; previously a '-' compared with a
    letter raised, because it matches several grid cells at once;
  * otherwise the Euclidean distance between the two grid cells selects the
    cost: <= 1 -> 0.5, <= 2 -> 0.9, <= 3 -> 1.5, anything farther -> 2.

  Args:
    ch1: a single character.
    ch2: a single character.

  Returns:
    One of 0, 0.5, 0.9, 1.5 or 2.
  """
  # Characters without a cell of their own are treated as the grid letter on
  # the right-hand side.
  # NOTE(review): these look like the characters typed with Shift on the same
  # physical key - confirm against the intended keyboard layout.
  grid_alias = {
    'آ': 'ا',
    'ژ': 'ز',
    'ئ': 'س',
    'ة': 'ت',
    'ؤ': 'ش',
    'أ': 'ل',
    'ء': 'پ',
    'إ': 'ب',
  }
  ch1 = grid_alias.get(ch1, ch1)
  ch2 = grid_alias.get(ch2, ch2)

  # Equality is checked before any "off the grid" rule so that replacing a
  # character with itself is always free.
  if ch1 == ch2:
    return 0

  # '_', tatweel and 'ۀ' always get the maximum cost; '-' is only padding in
  # the grid and must never be looked up as a key position.
  not_a_key = ('_', 'ـ', 'ۀ', '-')
  for ch in (ch1, ch2):
    if ch in not_a_key or ch not in persian_keyboard_list:
      return 2

  rows1, cols1 = np.where(persian_keyboard == ch1)
  rows2, cols2 = np.where(persian_keyboard == ch2)
  # Take element 0 explicitly: calling int() on a whole array is deprecated
  # in recent NumPy releases.
  row_gap = int(rows1[0]) - int(rows2[0])
  col_gap = int(cols1[0]) - int(cols2[0])
  dis = (row_gap**2 + col_gap**2)**0.5

  if dis<=1:
    return 0.5
  if dis<=2:
    return 0.9
  if dis<=3:
    return 1.5
  return 2

In [86]:
def get_recoms(input_word, top_n=10, max_len_diff=2):
  """Suggest the vocabulary words closest to `input_word`.

  Every word of `filtered_words_freq` whose length is within `max_len_diff`
  characters of `input_word` is scored with `levenshtein`. Candidates are
  ranked by ascending distance; equal distances are ordered by descending
  corpus frequency.

  Args:
    input_word: the (possibly misspelled) word to find suggestions for.
    top_n: how many suggestions to return (default 10); None returns all
      scored candidates.
    max_len_diff: largest allowed difference in length between a candidate
      and `input_word` (default 2).

  Returns:
    A DataFrame with columns 'words', 'distance' and 'freq' (frequency as a
    float), best suggestion first. It has no rows when nothing qualifies.
  """
  scored = []
  for word, freq in tqdm(filtered_words_freq.items()):
    if abs(len(word) - len(input_word)) <= max_len_diff:
      # Keep the frequency itself rather than its reciprocal: inverting it
      # twice is not guaranteed to give back the exact count.
      scored.append((word, levenshtein(word, input_word), float(freq)))

  # The sort is stable, so full ties keep their vocabulary order.
  scored.sort(key=lambda rec: (rec[1], -rec[2]))
  return pd.DataFrame(scored[:top_n], columns=['words','distance','freq'])

In [89]:
get_recoms('ترجیه')

100%|██████████| 25582/25582 [00:06<00:00, 3910.03it/s]


Unnamed: 0,words,distance,freq
0,ترجیع,0.5,63.0
1,ترکیه,0.9,2527.0
2,ترجیح,0.9,396.0
3,تزکیه,1.4,27.0
4,ترین,1.5,14592.0
5,ناحیه,1.5,3852.0
6,تاجی,1.5,62.0
7,درجه,1.9,5154.0
8,ترکی,1.9,1774.0
9,تکیه,1.9,699.0


In [90]:
get_recoms('موجح')

100%|██████████| 25582/25582 [00:04<00:00, 5901.13it/s]


Unnamed: 0,words,distance,freq
0,موجه,0.9,103.0
1,موج,1.0,1564.0
2,خوجه,1.4,37.0
3,کوچک,1.5,5232.0
4,نوح,1.5,352.0
5,توجه,1.8,6614.0
6,گوجه,1.8,267.0
7,موجود,1.9,4112.0
8,وجه,1.9,1097.0
9,مواجه,1.9,935.0


In [95]:
get_recoms('زمیمی')

100%|██████████| 25582/25582 [00:06<00:00, 3897.72it/s]


Unnamed: 0,words,distance,freq
0,زمینی,0.5,1721.0
1,میمی,1.0,33.0
2,امینی,1.4,292.0
3,رحیمی,1.4,113.0
4,زمین,1.5,9772.0
5,نیمی,1.5,563.0
6,زخمی,1.5,513.0
7,میخی,1.5,283.0
8,مینی,1.5,148.0
9,میکی,1.5,46.0


In [83]:
filtered_words_freq['توجیه']
levenshtein('توجیح','توجیه')

0.9

In [99]:
os.

['/usr/local/nvidia/bin',
 '/usr/local/cuda/bin',
 '/usr/local/sbin',
 '/usr/local/bin',
 '/usr/sbin',
 '/usr/bin',
 '/sbin',
 '/bin',
 '/tools/node/bin',
 '/tools/google-cloud-sdk/bin',
 '/opt/bin']

In [None]:
textdistance.Jaccard(qval=2).distance('خیابان',input_word)

0.6666666666666667

In [71]:
!git init

Initialized empty Git repository in /content/.git/


In [100]:
!git add /content/drive/MyDrive/Colab\ Notebooks/

fatal: pathspec '/content/drive/MyDrive/Colab' did not match any files


In [None]:
import inspect

In [None]:
inspect.getsource(textdistance)

'"""\nTextDistance.\nCompute distance between sequences.\n30+ algorithms, pure python implementation, common interface.\n"""\n\n# main package info\n__title__ = \'TextDistance\'\n__version__ = \'4.2.1\'\n__author__ = \'Gram (@orsinium)\'\n__license__ = \'MIT\'\n\n\n# version synonym\nVERSION = __version__\n\n\n# app\nfrom .algorithms import *  # noQA\nfrom .utils import *  # noQA\n'

In [None]:
def word_distance(word1,word2):
  """Unfinished draft of a word-distance function.

  It only sets up per-operation costs and splits both words into lists of
  characters; nothing is computed from them and the function returns None.
  """
  # Fixed costs for the three edit operations (not used yet).
  ins_cost = 1
  del_cost = 1
  rep_cost = 2
  # Character lists of both inputs (not used yet).
  word1_ch = list(word1)
  word2_ch = list(word2)
  

In [None]:
'آ' == "آ"

True

In [None]:
def get_near_words:
  