# Find the most closely related lemmas for each lemma

## Vectorize methods for splitting and comparing strings

In [2]:
#Import dependencies
import re, math
import heapq
from collections import Counter

In [3]:
#Convert text to vector
def text_to_vector(text):
    """Count how often each character occurs in ``text``.

    Returns a ``Counter`` keyed by the individual characters of ``text``,
    in order of first appearance.
    """
    char_counts = Counter()
    for char in text:
        char_counts[char] += 1
    return char_counts

In [4]:
# Quick check of the vectoriser on a sample lemma
text_to_vector ('إنسان')

Counter({'إ': 1, 'ن': 2, 'س': 1, 'ا': 1})

In [5]:
#Get cosine similarity
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two count vectors.

    Both arguments are mappings from a key to a numeric count (for example
    ``Counter`` objects). The score is the dot product over the keys the two
    mappings share, divided by the product of their Euclidean norms.
    ``0.0`` is returned when that product is zero.
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    norm1 = math.sqrt(sum(vec1[key] ** 2 for key in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[key] ** 2 for key in vec2.keys()))
    magnitude = norm1 * norm2

    # A zero magnitude (e.g. an empty vector) would divide by zero.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

In [6]:
#Compare similarities
def compare_similarity(text1, text2):
    """Return the cosine similarity of the character counts of two strings.

    Each string is turned into a character-count vector with
    ``text_to_vector`` and the two vectors are scored with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))

In [7]:
# Example comparison of two spellings.
# lemma2 includes vowel marks (diacritics); each mark is a separate character
# in the string, so it becomes its own entry in the count vector.
lemma1 = 'إنسان'
lemma2 = 'نسَانَ'
print (compare_similarity (lemma1, lemma2))

0.7171371656006361


## Get a list of unique lemmas in the Qur'an

In [11]:
import pymysql
import connection.config

In [12]:
def get_set_of_lemmatized_words_from_quran ():
    """Return the set of distinct ``lemmatized_word`` values from the database.

    Runs ``SELECT DISTINCT(lemmatized_word)`` against ``tbl_translated_words``
    using the connection settings in ``connection.config.config`` and collects
    the first column of every row into a set.

    A fresh connection is opened on every call and is always closed again,
    even if the query raises, so the function can be called repeatedly.

    Returns:
        set: the distinct values of the ``lemmatized_word`` column.
    """
    # The connection is created here rather than at module level: binding it
    # to the global name `connection` replaced the imported `connection`
    # package, so `connection.config` could not be resolved when the cell was
    # run a second time, and closing that shared handle inside the function
    # made a second call fail.
    db_connection = pymysql.connect(**connection.config.config)
    try:
        cursor = db_connection.cursor()
        try:
            cursor.execute("SELECT DISTINCT(lemmatized_word) FROM `tbl_translated_words`")
            return {row[0] for row in cursor.fetchall()}
        finally:
            cursor.close()
    finally:
        db_connection.close()

In [13]:
# Fetch the distinct lemmas once; later cells iterate over this set
lemmas = get_set_of_lemmatized_words_from_quran ()

In [14]:
# Number of distinct lemmas returned by the query
len (lemmas)

5647

## Compare each lemma with every other lemma and save it

Use a memory-efficient method to perform the lemma comparison.

We first write the rows to an in-memory CSV buffer, and then read that buffer into a DataFrame.

Read more about the technique here: https://stackoverflow.com/questions/41888080/python-efficient-way-to-add-rows-to-dataframe

In [15]:
from io import StringIO
from csv import writer 
import pandas as pd

In [16]:
# Store in csv memory object:
# `output` is an in-memory text buffer and `csv_writer` appends CSV rows to it
output = StringIO()
csv_writer = writer(output)

In [17]:
# Start from an empty buffer so that re-running this cell does not append a
# second header row and a duplicate copy of every pair.
output.seek(0)
output.truncate(0)

# Build each lemma's character-count vector once, instead of rebuilding both
# vectors for every one of the len(lemmas) ** 2 pairs.
lemma_vectors = [(lemma, text_to_vector(lemma)) for lemma in lemmas]

csv_writer.writerow(['lemma_1', 'lemma_2', 'cosine_similarity'])
for lemma1, vector1 in lemma_vectors:
    for lemma2, vector2 in lemma_vectors:
        # A lemma is never compared with itself.
        if lemma1 == lemma2:
            continue
        similarity = get_cosine(vector1, vector2)
        # Pairs with no characters in common score 0 and are not stored.
        if similarity != 0:
            csv_writer.writerow([lemma1, lemma2, similarity])

In [18]:
# Read back into the dataframe:
# rewind the buffer to its start, then parse the accumulated CSV text
output.seek(0)
df = pd.read_csv(output)

In [None]:
# df = pd.DataFrame(columns=['lemma_1', 'lemma_2', 'cosine_similarity'])

In [19]:
# Number of rows parsed from the CSV buffer
len (df)

18269852

## Save results to SQL

In [55]:
from sqlalchemy import create_engine
import connection.config

In [56]:
# Create SQLAlchemy engine to connect to MySQL Database.
# The user name and password are percent-encoded before they are placed in
# the URL: characters such as '@', ':' or '/' in a raw password would
# otherwise be read as URL delimiters and the connection would fail.
from urllib.parse import quote

db_config = connection.config.config
engine = create_engine(
    "mysql+pymysql://{user}:{passwd}@{host}/{db}".format(
        **{
            **db_config,
            "user": quote(str(db_config["user"]), safe=""),
            "passwd": quote(str(db_config["passwd"]), safe=""),
        }
    )
)

In [57]:
# Save to MySQL using chunks of 500 rows each
# NOTE(review): if_exists='append' adds rows to an existing table instead of
# replacing it, so running this cell again inserts the same pairs a second time.
# NOTE(review): the recorded output of this cell (31882962) differs from the
# recorded len(df) (18269852) -- confirm which DataFrame state was written.
df.to_sql('tbl_lemma_relatives', engine, index=False, if_exists='append', chunksize=500, method=None)

31882962

In [20]:
## Save & Retrieve from Pickle file

In [21]:
# Persist the similarity table to a pickle file (relative path)
df.to_pickle('data/tbl_lemma_relatives.pkl')

In [22]:
# Preview the first rows of the similarity table
df.head()

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
0,ذراع,أخرجتم,0.204124
1,ذراع,شاور,0.5
2,ذراع,ونصرناهم,0.316228
3,ذراع,استأذن,0.408248
4,ذراع,تخافوهم,0.188982


## Refine matches

When comparing matches, we also need to check whether the two lemmas begin with the same characters (a shared prefix). This gives the best results.

In [89]:
# First 100 pairs whose similarity is above 0.9, as an independent copy
dfB = df.loc[df['cosine_similarity'].gt(0.9)].head(100).copy()

In [90]:
# Row count of the high-similarity sample
len (dfB)

100

In [85]:
# First two characters of each lemma_1 value
dfB['lemma_1'].str.slice(stop=2)

25179     تخ
25876     تخ
29431     سن
31354     عس
31739     عس
          ..
463604    اس
478125    حل
479056    حل
479638    حل
482497    وع
Name: lemma_1, Length: 100, dtype: object

Relevance of a match on the first two characters

In [112]:
# Keep only the pairs whose two lemmas start with the same two characters
same_two_char_prefix = df['lemma_1'].str.slice(stop=2) == df['lemma_2'].str.slice(stop=2)
dfX = df.loc[same_two_char_prefix].copy()

In [113]:
# Number of pairs that share a two-character prefix
len (dfX)

111738

In [114]:
# Highest-scoring pairs among those sharing a two-character prefix
dfX.sort_values(by=['cosine_similarity'], ascending=False).head(15)

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
16877487,شاكر,شارك,1.0
15174262,أسلف,أسفل,1.0
11933317,اعمل,اعلم,1.0
12521445,أقرب,أقبر,1.0
15172054,أقبر,أقرب,1.0
8238523,راود,رادو,1.0
13321524,واعد,وادع,1.0
12401517,مسفر,مسرف,1.0
2411282,أعلم,أعمل,1.0
2173260,وادع,واعد,1.0


Relevance of a match on the first *three* characters

In [115]:
# Keep only the pairs whose two lemmas start with the same three characters
same_three_char_prefix = df['lemma_1'].str.slice(stop=3) == df['lemma_2'].str.slice(stop=3)
dfY = df.loc[same_three_char_prefix].copy()

In [116]:
# Number of pairs that share a three-character prefix
len (dfY)

15622

In [117]:
# Highest-scoring pairs among those sharing a three-character prefix
dfY.sort_values(by=['cosine_similarity'], ascending=False).head(15)

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
8810676,يهدني,يهدين,1.0
5834385,يهدين,يهدني,1.0
9393826,استرق,استقر,1.0
772862,استقر,استرق,1.0
7883127,ونجيناهم,ونجيناهما,0.964764
2309240,وهديناهما,وهديناهم,0.964764
3526202,وهديناهم,وهديناهما,0.964764
12789327,ونجيناهما,ونجيناهم,0.964764
4104634,أفأنتم,أفأمنتم,0.959403
13613454,أفأمنتم,أفأنتم,0.959403


In [111]:
# All stored matches for one sample lemma, best score first (no prefix filter)
df[df['lemma_1'] == 'إنسان'].sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
17454111,إنسان,إنس,0.872872
17452581,إنسان,ناس,0.872872
17455334,إنسان,إحسان,0.845154
17454216,إنسان,ننس,0.845154
17453607,إنسان,ننساكم,0.801784
...,...,...,...
17453698,إنسان,تستفتي,0.109109
17454002,إنسان,أعمامكم,0.104828
17454547,إنسان,فليستعفف,0.101015
17454007,إنسان,فاتخذتموهم,0.101015


In [119]:
# Matches for the same sample lemma, restricted to a shared two-character prefix
dfX[dfX['lemma_1'] == 'إنسان'].sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
17454111,إنسان,إنس,0.872872
17452630,إنسان,إن,0.801784
17454620,إنسان,إناء,0.755929
17455031,إنسان,إناث,0.755929
17455492,إنسان,إنسي,0.755929
17452591,إنسان,إنفاق,0.676123
17453353,إنسان,إنشاء,0.676123
17454435,إنسان,إنجيل,0.507093


In [120]:
# Matches for the same sample lemma, restricted to a shared three-character prefix
dfY[dfY['lemma_1'] == 'إنسان'].sort_values(by=['cosine_similarity'], ascending=False)

Unnamed: 0,lemma_1,lemma_2,cosine_similarity
17454111,إنسان,إنس,0.872872
17455492,إنسان,إنسي,0.755929
