# Find most closely related lemma for each lemma

## Vectorize methods for splitting and comparing strings

In [14]:
#Import dependencies
import re, math
import heapq
from collections import Counter

In [15]:
#Convert text to vector
def text_to_vector(text):
    """Return a character-frequency vector for ``text``.

    The text is split into its individual characters and each distinct
    character is mapped to the number of times it occurs.

    Args:
        text: The string to vectorise.

    Returns:
        A ``Counter`` keyed by character.
    """
    characters = tuple(text)
    frequencies = Counter()
    frequencies.update(characters)
    return frequencies

In [16]:
text_to_vector ('إنسان')

Counter({'إ': 1, 'ن': 2, 'س': 1, 'ا': 1})

In [17]:
#Get cosine similarity
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vec1: Mapping from key to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from key to numeric weight (e.g. a ``Counter``).

    Returns:
        The dot product over the shared keys divided by the product of the
        two Euclidean norms, as a float. Returns 0.0 when that product of
        norms is zero (for example when either vector is empty).
    """
    # Only keys present in both vectors contribute to the dot product.
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    squared_norm1 = sum(vec1[key] ** 2 for key in vec1.keys())
    squared_norm2 = sum(vec2[key] ** 2 for key in vec2.keys())
    norm_product = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    if norm_product:
        return float(dot_product) / norm_product
    return 0.0

In [18]:
#Compare similarities
def compare_similarity(text1,text2):
    """Return the cosine similarity of two strings' character counts.

    Both strings are turned into character-frequency vectors with
    ``text_to_vector`` and the vectors are compared with ``get_cosine``.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        The float produced by ``get_cosine`` for the two vectors.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))

In [6]:
# Sanity check on one pair of spellings. lemma2 lacks lemma1's first letter
# and carries diacritics; text_to_vector counts every diacritic as a character
# of its own, so the two vectors differ and the score is below 1.
lemma1 = 'إنسان'
lemma2 = 'نسَانَ'
print (compare_similarity (lemma1, lemma2))

0.7171371656006361


## Get a list of unique lemmas in the Qur'an

In [19]:
import pymysql
import connection.config

In [20]:
# NOTE: this rebinds the name `connection` from the imported `connection`
# package to the live PyMySQL connection object. After this line
# `connection.config` no longer refers to the config module until
# `import connection.config` is executed again.
connection = pymysql.connect(**connection.config.config)
def get_set_of_lemmatized_words_from_quran ():
    """Fetch the distinct lemmatized words from ``tbl_translated_words``.

    Uses the module-level ``connection`` and closes it before returning,
    so the function can only be called once per connection.

    Returns:
        A set holding the first column of every row returned by the
        ``SELECT DISTINCT(lemmatized_word)`` query.
    """
    try:
        # The cursor is a context manager, so it is closed even if the
        # query or the fetch raises.
        with connection.cursor() as cur:
            cur.execute("SELECT DISTINCT(lemmatized_word) FROM `tbl_translated_words`")
            return {row[0] for row in cur.fetchall()}
    finally:
        # Release the connection on failure as well as on success; skip the
        # close when it is already closed so the original error is not masked.
        if connection.open:
            connection.close ()

In [21]:
lemmas = get_set_of_lemmatized_words_from_quran ()

In [22]:
len (lemmas)

5647

## Compare each lemma with every other lemma and save it

Use a memory efficient method to perform lemma comparison.

We first write the rows to an in-memory CSV buffer, and then read that buffer into a DataFrame.

Read more about the technique here: https://stackoverflow.com/questions/41888080/python-efficient-way-to-add-rows-to-dataframe

In [50]:
from io import StringIO
from csv import writer 
import pandas as pd

In [51]:
# In-memory text buffer that collects the comparison rows as CSV;
# csv_writer appends one formatted row to it per writerow() call.
output = StringIO()
csv_writer = writer(output)

In [52]:
# Header row, then one row per ordered pair of distinct lemmas whose
# similarity is non-zero. Both (a, b) and (b, a) are written.
csv_writer.writerow(['lemma_1', 'lemma_2', 'cosine_similarity'])

# Build each lemma's character vector once up front. The nested loop visits
# roughly len(lemmas) ** 2 pairs, so rebuilding both vectors for every pair
# (as compare_similarity does) repeats the same work thousands of times.
lemma_vectors = [(lemma, text_to_vector(lemma)) for lemma in lemmas]

for lemma1, vector1 in lemma_vectors:
    for lemma2, vector2 in lemma_vectors:
        if lemma1 == lemma2:
            continue
        similarity = get_cosine(vector1, vector2)
        # Pairs with no character in common score 0 and are not stored.
        if similarity != 0:
            csv_writer.writerow([lemma1, lemma2, similarity])

In [53]:
# Rewind the buffer to its start (the writer left the position at the end),
# then parse the accumulated CSV text into a DataFrame.
output.seek(0)
df = pd.read_csv(output)

In [None]:
# df = pd.DataFrame(columns=['lemma_1', 'lemma_2', 'cosine_similarity'])

In [54]:
len (df)

31882962

In [55]:
from sqlalchemy import create_engine
import connection.config

In [56]:
# Create SQLAlchemy engine to connect to MySQL Database
from urllib.parse import quote

# Percent-encode the credentials before placing them in the URL so that
# characters such as '@', ':' or '/' in the user name or password cannot be
# mistaken for URL delimiters. Values without such characters are unchanged.
_db_config = connection.config.config
engine = create_engine(
    "mysql+pymysql://{user}:{passwd}@{host}/{db}".format(
        user=quote(str(_db_config["user"]), safe=""),
        passwd=quote(str(_db_config["passwd"]), safe=""),
        host=_db_config["host"],
        db=_db_config["db"],
    )
)

In [57]:
# Save to MySQL using chunks of 500 rows each
# if_exists='append' adds the rows to an existing table, so re-running this
# cell inserts the same rows again.
df.to_sql(
    name='tbl_lemma_relatives',
    con=engine,
    index=False,
    if_exists='append',
    chunksize=500,
    method=None,
)

31882962