<a href="https://colab.research.google.com/github/iAmarus/Tutorial/blob/main/Tutorial_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Student Name: Mohammed Alnajjar
## ID: 4102947

In [None]:
import re
from collections import defaultdict

In [None]:
def get_soundex_code(word):
  word = word.upper()
  word = re.sub(r'[^A-Z]', '', word)
  if not word:
    return ''

  soundex_code = word[0]
  encoding_map = {'BFPV' : '1', 'CGJKQSXZ' : '2', 'DT' : '3', 'L' : '4', 'MN': '5', 'R': '6'}
  for encoding, digit in encoding_map.items():
    soundex_code += ''.join(digit if letter in encoding else '' for letter in word[1:])

  soundex_code = re.sub(r'(.)\1+', r'\1', soundex_code)

  soundex_code = soundex_code.replace(soundex_code[0], '', 1)
  soundex_code = re.sub(r'[AEIOUY]', '', soundex_code)

  soundex_code = soundex_code.ljust(4, '0')
  soundex_code = soundex_code[:4]

  return soundex_code

In [None]:
def build_soundex_index(words):
  soundex_index = defaultdict(list)
  for word in words:
    soundex_code = get_soundex_code (word)
    soundex_index[soundex_code].append(word)
  return soundex_index

In [None]:
def find_similar_sounding_words (query, soundex_index):
  soundex_code = get_soundex_code (query)
  return soundex_index[soundex_code]


In [None]:
words = [
    'Robert' , 'Rupert', 'Rubin' , 'Robby', 'Rabat', 'Rabbit',
    'Smith', 'Smyth', 'Smythe' , 'Schmidt', 'Schmitz', 'Mohammed' , 'Muhammed'
  ]

soundex_index = build_soundex_index (words)

query = 'Mohammed'
similar_words = find_similar_sounding_words (query, soundex_index)

print(f"Words similar to '{query}':")
print(similar_words)
print(soundex_index)

Words similar to 'Mohammed':
['Smith', 'Smyth', 'Smythe', 'Mohammed', 'Muhammed']
defaultdict(<class 'list'>, {'1360': ['Robert', 'Rupert'], '1500': ['Rubin'], '1000': ['Robby'], '1300': ['Rabat', 'Rabbit'], '3500': ['Smith', 'Smyth', 'Smythe', 'Mohammed', 'Muhammed'], '2350': ['Schmidt', 'Schmitz']})


In [None]:
def get_soundex_code_ar(word):
  word = word.upper()
  word = re.sub(r'[^ا-ي]', '', word)
  if not word:
    return ''

  soundex_code = word[0]
  encoding_map = {'ف,ب' : '1', 'خ,ج,ز,س,ص,ظ,,ك' : '2', 'ت,ث,د,ذ,ض,ط' : '3', 'ل': '4', 'م,ن':'5', 'ر':'6'}
  for encoding, digit in encoding_map.items():
    soundex_code += ''.join(digit if letter in encoding else '' for letter in word[1:])

  soundex_code = re.sub(r'(.)\1+', r'\1', soundex_code)

  soundex_code = soundex_code.replace(soundex_code[0], '', 1)
  soundex_code = re.sub(r'[ح,ع,غ,ش,و,ي]', '', soundex_code)

  soundex_code = soundex_code.ljust(4, '0')
  soundex_code = soundex_code[:4]

  return soundex_code

In [None]:
def build_soundex_index_ar(words):
  soundex_index = defaultdict(list)
  for word in words:
    soundex_code = get_soundex_code_ar (word)
    soundex_index[soundex_code].append(word)
  return soundex_index

In [None]:
def find_similar_sounding_words_ar (query, soundex_index):
  soundex_code = get_soundex_code_ar (query)
  return soundex_index[soundex_code]

In [None]:
words = [
    'احمد' , 'محمد', 'محمدا' , 'محمود', 'حامد', 'عمير',
    'عامر', 'عمر', 'طارق' , 'نجار', 'النجار', 'علي' , 'علي'
  ]

soundex_index = build_soundex_index_ar (words)

query = 'علي'
similar_words = find_similar_sounding_words_ar (query, soundex_index)

print(f"Words similar to '{query}':")
print(similar_words)
print(soundex_index)

Words similar to 'علي':
['علي', 'علي']
defaultdict(<class 'list'>, {'3500': ['احمد', 'محمد', 'محمدا', 'محمود', 'حامد'], '5600': ['عمير', 'عامر', 'عمر'], '6000': ['طارق'], '2600': ['نجار'], '2456': ['النجار'], '4000': ['علي', 'علي']})
