# Get Cosine Similarity Between the Arabic Verses of the Quran

In [1]:
#Import dependencies
import re, math
import heapq
from collections import Counter

In [2]:
# Tokenizer pattern: each match is one run of consecutive word characters.
WORD = re.compile(r'\w+')

In [3]:
#Convert text to vector
def text_to_vector(text):
    """Build a bag-of-words vector for ``text``.

    The text is split into tokens with the module-level ``WORD`` pattern and
    the tokens are tallied.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    vector = Counter()
    vector.update(WORD.findall(text))
    return vector

In [4]:
#Get cosine similarity
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, as a float. Returns 0.0 when that product is
        zero (for example when either mapping is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()).intersection(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    if magnitude:
        return float(dot_product) / magnitude
    # A zero magnitude would divide by zero; define the similarity as 0.0.
    return 0.0

In [5]:
#Compare similarities
def compare_similarity(text1, text2):
    """Return the cosine similarity between two pieces of text.

    Each text is converted to a word-count vector with ``text_to_vector``
    and the two vectors are scored with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))

In [10]:
# Sanity check: the five words of text1 are exactly the last five words of
# text2, so the score is lowered only by the six extra words in text2.
text1 = "أمة جاء يستأخرون ساعة يستقدمون"
text2 = "قال ملك ضر نفع شاء الله أمة جاء يستأخرون ساعة يستقدمون"
print (compare_similarity (text1,text2))

0.674199862463242


## Update MySQL record for tbl_similarity_score

In [12]:
import mysql.connector
from sqlalchemy import create_engine

In [13]:
# Connection settings for the MySQL database.
# NOTE(review): the credentials are hard-coded in the source; consider
# loading them from the environment or a config file kept out of version
# control.
hostname="localhost"
dbname="quran_db"
uname="root"
pwd="mysql"

In [14]:
import pymysql

In [23]:
# Module-level connection shared by handler(); it stays open until
# connection.close() is called.
connection = pymysql.connect(host=hostname, user=uname, passwd=pwd, db=dbname)

In [35]:
def handler(translation_id=678, limit=1):
    """Compute and store cosine-similarity scores for paired ayas.

    For each aya read from ``tbl_quran``, the ayas it is paired with are
    looked up in ``tbl_similarity_score``. For every pair, the cosine
    similarity of both lemmatized text columns (with and without stop words)
    is computed with ``compare_similarity`` and a scored row is inserted into
    ``tbl_similarity_score``.

    Uses the module-level ``connection`` and commits after every insert, so
    rows written before a failure remain stored.

    Args:
        translation_id: Value written to the ``translation_id`` column of
            every inserted row.
        limit: Maximum number of ayas read from ``tbl_quran``; ``None`` reads
            all of them. The default of 1 keeps the original hard-coded
            ``LIMIT 1``.
    """
    # NOTE(review): the default limit processes a single aya although the
    # intent appears to be "every aya"; pass limit=None for a full run.
    aya_query = (
        "SELECT surah_number, aya_number, text_arabic_lemmatized, "
        "text_arabic_lemmatized_without_stop_words FROM `tbl_quran`"
    )
    aya_params = None
    if limit is not None:
        aya_query += " LIMIT %s"
        aya_params = (int(limit),)

    pairs_query = (
        "SELECT compare_to_surah_number, compare_to_aya_number "
        "FROM `tbl_similarity_score` "
        "WHERE surah_number = %s AND aya_number = %s"
    )
    text_query = (
        "SELECT text_arabic_lemmatized, "
        "text_arabic_lemmatized_without_stop_words FROM `tbl_quran` "
        "WHERE surah_number = %s AND aya_number = %s"
    )
    # NOTE(review): the notebook heading says "Update", yet this inserts new
    # rows alongside the ones read for pairing -- confirm that an UPDATE of
    # the existing rows was not intended.
    insert_query = (
        "INSERT INTO tbl_similarity_score (translation_id, surah_number, "
        "aya_number, compare_to_surah_number, compare_to_aya_number, "
        "cosine_similarity_arabic_lemmatized, "
        "cosine_similarity_arabic_lemmatized_without_stop_words) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )

    # Values are bound by the driver instead of being formatted into the SQL
    # text. Each result set is fetched completely before the next statement
    # runs, so a single cursor can be reused; the ``with`` block closes it.
    with connection.cursor() as cursor:
        cursor.execute(aya_query, aya_params)
        ayas = cursor.fetchall()

        for surah_number, aya_number, lemmatized, lemmatized_no_stop in ayas:
            cursor.execute(pairs_query, (surah_number, aya_number))
            pairs = cursor.fetchall()

            for compare_to_surah_number, compare_to_aya_number in pairs:
                cursor.execute(
                    text_query,
                    (compare_to_surah_number, compare_to_aya_number),
                )
                other = cursor.fetchone()
                if other is None:
                    # The pairing points at an aya that is missing from
                    # tbl_quran; there is no text to score, so skip it.
                    continue
                other_lemmatized, other_lemmatized_no_stop = other

                score = compare_similarity(lemmatized, other_lemmatized)
                score_no_stop = compare_similarity(
                    lemmatized_no_stop, other_lemmatized_no_stop
                )

                cursor.execute(
                    insert_query,
                    (
                        translation_id,
                        surah_number,
                        aya_number,
                        compare_to_surah_number,
                        compare_to_aya_number,
                        score,
                        score_no_stop,
                    ),
                )
                connection.commit()
# Run the similarity job once against the open connection.
handler()


In [37]:
# Release the database connection. The recorded output below shows that the
# connection had already been closed when this cell was executed.
connection.close()

Error: Already closed