# Get Cosine Similarity Between the Arabic Verses of the Quran

In [55]:
#Import dependencies
import re, math
import heapq
from collections import Counter

In [56]:
# Tokenizer pattern: each match is a maximal run of "word" characters (`\w+`).
# NOTE(review): combining diacritics are presumably not matched by `\w` - confirm the
# text passed in is diacritic-free (the lemmatized columns shown below appear to be).
WORD = re.compile(r'\w+')

In [57]:
#Convert text to vector
def text_to_vector(text):
    """Count how many times each token occurs in ``text``.

    Tokens are the matches the module-level ``WORD`` pattern finds in ``text``.

    Returns:
        A ``collections.Counter`` mapping each token to its occurrence count.
    """
    return Counter(WORD.findall(text))

In [58]:
#Get cosine similarity
def get_cosine(vec1, vec2):
    """Return the cosine similarity between two token-count mappings.

    Args:
        vec1: Mapping of token -> count (e.g. a ``collections.Counter``).
        vec2: Mapping of token -> count.

    Returns:
        The dot product over the tokens present in both mappings, divided by
        the product of the two Euclidean norms, as a float. Returns ``0.0``
        when that product of norms is zero.
    """
    shared_tokens = set(vec1) & set(vec2)
    dot_product = sum(vec1[token] * vec2[token] for token in shared_tokens)

    squared_norm1 = sum(count ** 2 for count in vec1.values())
    squared_norm2 = sum(count ** 2 for count in vec2.values())
    magnitude = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # Guard against dividing by zero when either mapping has no weight.
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude

In [59]:
#Compare similarities
def compare_similarity(text1, text2):
    """Return the cosine similarity between two pieces of text.

    Each text is turned into a token-count vector with ``text_to_vector`` and
    the two vectors are scored with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))

In [60]:
# Quick sanity check: every word of text1 also appears in text2.
text1 = "أمة جاء يستأخرون ساعة يستقدمون"
text2 = "قال ملك ضر نفع شاء الله أمة جاء يستأخرون ساعة يستقدمون"
similarity = compare_similarity(text1, text2)
print(similarity)

0.674199862463242


# Create a DataFrame

In [14]:
import pandas as pd

Read from existing pickle file so we can quickly create a new dataframe model

In [16]:
df = pd.read_pickle ('tbl_similarity_score.pkl')

In [17]:
df.head()

Unnamed: 0,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words
1,1,1,1,2,0.456435,0.456435,0.456435,0.288675
2,1,1,1,3,0.456435,0.653197,0.653197,0.816497
3,1,1,1,4,0.456435,0.456435,0.456435,0.0
4,1,1,1,5,0.0,0.0,0.0,0.0
5,1,1,1,6,0.316228,0.316228,0.316228,0.0


Make sure there are no surah/aya pairs comparing to themselves. This would be a waste of resources

In [21]:
# Count the rows in which a verse is paired with itself. The check covers every
# row of the table rather than only surah 1 / aya 1, so a result of 0 really
# means that no surah/aya pair is compared to itself.
same_surah = df['surah_number'] == df['compare_to_surah_number']
same_aya = df['aya_number'] == df['compare_to_aya_number']
int((same_surah & same_aya).sum())

0

We need to create two new columns in the DataFrame to hold the scores computed from the lemmatized Arabic text

In [28]:
# Add the two Arabic-score columns to df, filled with 0 as a placeholder.
for new_column in (
    'cosine_similarity_arabic_lemmatized',
    'cosine_similarity_arabic_lemmatized_without_stop_words',
):
    df[new_column] = 0

In [29]:
df.head()

Unnamed: 0,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words,cosine_similarity_arabic_lemmatized,cosine_similarity_arabic_lemmatized_without_stop_words
1,1,1,1,2,0.456435,0.456435,0.456435,0.288675,0,0
2,1,1,1,3,0.456435,0.653197,0.653197,0.816497,0,0
3,1,1,1,4,0.456435,0.456435,0.456435,0.0,0,0
4,1,1,1,5,0.0,0.0,0.0,0.0,0,0
5,1,1,1,6,0.316228,0.316228,0.316228,0.0,0,0


Load tbl_quran as a DataFrame as well, so we can quickly look up its ayas without querying SQL repeatedly

In [40]:
# Read the whole `tbl_quran` table into a DataFrame with a single query.
# NOTE(review): `connection` is not defined in any cell visible above this one, and a
# later cell runs `import connection.config`, which binds the same name to a module.
# Confirm where the live database connection comes from before a Restart & Run All.
df_quran = pd.read_sql ('SELECT * FROM `tbl_quran`', connection)

In [41]:
df_quran.head ()

Unnamed: 0,id,surah_number,aya_number,text,text_minimal,text_arabic_lemmatized,text_arabic_lemmatized_without_stop_words
0,1,1,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,بسم الله الرحمن الرحيم,بسم الله رحمن رحيم,بسم الله رحمن رحيم
1,2,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,الحمد لله رب العالمين,حمد الله رب عالم,حمد الله رب عالم
2,3,1,3,الرَّحْمَـٰنِ الرَّحِيمِ,الرحمن الرحيم,رحمن رحيم,رحمن رحيم
3,4,1,4,مَالِكِ يَوْمِ الدِّينِ,مالك يوم الدين,مالك يوم دين,مالك يوم دين
4,5,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,إياك نعبد وإياك نستعين,إياك عبد إياك استعان,عبد استعان


<span style='color:red'>Note: the following cell uses itertuples. In the future we should use to_dict() for faster processing. It may also be better to write directly to a DataFrame, so that RAM does not have to hold an enormous amount of data while processing.</span>

For more info on efficient computation, see:
https://towardsdatascience.com/heres-the-most-efficient-way-to-iterate-through-your-pandas-dataframe-4dad88ac92ee

In [99]:
# Tokenize every verse once and index the token counts by (surah_number, aya_number).
# Filtering `df_quran` with a boolean mask for every comparison row rescans the whole
# frame four times per row and re-tokenizes both verses each time; a dictionary
# lookup returns the same values without rescanning or re-tokenizing.
lemmatized_vectors = {}
lemmatized_without_stop_words_vectors = {}
for verse in df_quran.itertuples(index=False):
    verse_key = (verse.surah_number, verse.aya_number)
    # Keep the first row seen for a key, matching a `.values[0]` lookup on the frame.
    if verse_key not in lemmatized_vectors:
        lemmatized_vectors[verse_key] = text_to_vector(verse.text_arabic_lemmatized)
        lemmatized_without_stop_words_vectors[verse_key] = text_to_vector(
            verse.text_arabic_lemmatized_without_stop_words
        )

# One record per compared pair; converted to a DataFrame in a later cell.
data = []
for row in df.itertuples(index=False):

    surah_number = row.surah_number
    aya_number = row.aya_number

    compare_to_surah_number = row.compare_to_surah_number
    compare_to_aya_number = row.compare_to_aya_number

    # A pair that references a verse missing from df_quran raises KeyError here.
    verse_key = (surah_number, aya_number)
    compare_to_key = (compare_to_surah_number, compare_to_aya_number)

    # get_cosine on the cached vectors is the same computation compare_similarity
    # performs on the raw text (text_to_vector on both sides, then get_cosine).
    cosine_similarity_arabic_lemmatized = get_cosine(
        lemmatized_vectors[verse_key],
        lemmatized_vectors[compare_to_key],
    )
    cosine_similarity_arabic_lemmatized_without_stop_words = get_cosine(
        lemmatized_without_stop_words_vectors[verse_key],
        lemmatized_without_stop_words_vectors[compare_to_key],
    )

    data.append({
        'surah_number': surah_number,
        'aya_number': aya_number,
        'compare_to_surah_number': compare_to_surah_number,
        'compare_to_aya_number': compare_to_aya_number,
        # The translation scores are not computed in this notebook; keep them as 0.
        'cosine_similarity_translation': 0,
        'cosine_similarity_translation_clean': 0,
        'cosine_similarity_translation_tokenized': 0,
        'cosine_similarity_translation_without_stop_words': 0,
        'cosine_similarity_arabic_lemmatized': cosine_similarity_arabic_lemmatized,
        'cosine_similarity_arabic_lemmatized_without_stop_words': cosine_similarity_arabic_lemmatized_without_stop_words
    })

In [100]:
df_lemmatized = pd.DataFrame(data)

Save to pickle file for easy restoration

In [101]:
df_lemmatized.to_pickle ('df_lemmatized_sept_5_2022.pkl');

In [102]:
df_lemmatized.head ()

Unnamed: 0,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words,cosine_similarity_arabic_lemmatized,cosine_similarity_arabic_lemmatized_without_stop_words
0,1,1,1,2,0,0,0,0,0.25,0.25
1,1,1,1,3,0,0,0,0,0.707107,0.707107
2,1,1,1,4,0,0,0,0,0.0,0.0
3,1,1,1,5,0,0,0,0,0.0,0.0
4,1,1,1,6,0,0,0,0,0.0,0.0


In [103]:
# Keep only the pairs where at least one of the two Arabic scores is positive
has_lemmatized_score = df_lemmatized['cosine_similarity_arabic_lemmatized'] > 0
has_without_stop_words_score = df_lemmatized['cosine_similarity_arabic_lemmatized_without_stop_words'] > 0
df_sliced = df_lemmatized[has_lemmatized_score | has_without_stop_words_score].copy()

In [107]:
df_sliced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19361898 entries, 0 to 38881459
Data columns (total 10 columns):
 #   Column                                                  Dtype  
---  ------                                                  -----  
 0   surah_number                                            int64  
 1   aya_number                                              int64  
 2   compare_to_surah_number                                 int64  
 3   compare_to_aya_number                                   int64  
 4   cosine_similarity_translation                           int64  
 5   cosine_similarity_translation_clean                     int64  
 6   cosine_similarity_translation_tokenized                 int64  
 7   cosine_similarity_translation_without_stop_words        int64  
 8   cosine_similarity_arabic_lemmatized                     float64
 9   cosine_similarity_arabic_lemmatized_without_stop_words  float64
dtypes: float64(2), int64(8)
memory usage: 1.6 GB


In [104]:
df_sliced.head()

Unnamed: 0,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words,cosine_similarity_arabic_lemmatized,cosine_similarity_arabic_lemmatized_without_stop_words
0,1,1,1,2,0,0,0,0,0.25,0.25
1,1,1,1,3,0,0,0,0,0.707107,0.707107
12,1,1,2,7,0,0,0,0,0.117851,0.176777
13,1,1,2,8,0,0,0,0,0.138675,0.188982
14,1,1,2,9,0,0,0,0,0.144338,0.223607


In [109]:
# Spot-check that the DataFrame holds highly similar ayas:
# lemmatized score above 0.8 and without-stop-words score below 0.9
lemmatized_above_threshold = df_sliced['cosine_similarity_arabic_lemmatized'] > 0.8
without_stop_words_below_threshold = df_sliced['cosine_similarity_arabic_lemmatized_without_stop_words'] < 0.9
df_sliced[lemmatized_above_threshold & without_stop_words_below_threshold]

Unnamed: 0,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words,cosine_similarity_arabic_lemmatized,cosine_similarity_arabic_lemmatized_without_stop_words
10742,1,2,45,36,0,0,0,0,0.801784,0.801784
57276,2,3,8,3,0,0,0,0,0.866025,0.816497
78533,2,6,36,10,0,0,0,0,0.852803,0.894427
207485,2,27,13,25,0,0,0,0,0.820610,0.868599
251862,2,34,20,116,0,0,0,0,0.832050,0.881917
...,...,...,...,...,...,...,...,...,...,...
38556937,104,5,86,2,0,0,0,0,0.833333,0.500000
38557039,104,5,90,12,0,0,0,0,0.833333,0.500000
38557164,104,5,101,3,0,0,0,0,0.833333,0.500000
38557171,104,5,101,10,0,0,0,0,0.833333,0.707107


In [110]:
# To check recorded results are correct, you can just grab the associated ayas and compare them
# [1:2]
str1 = 'حمد الله رب عالم'
# [45:36]
str2 = 'الله حمد رب سماء رب أرض رب عالم'

In [112]:
# This should match the results above
compare_similarity (str1, str2)

0.8017837257372732

In [106]:
df_sliced.to_pickle ('df_lemmatized_sept_7_2022.pkl');

In [128]:
translation_id = 789

In [129]:
# Put translation_id in as the first column of df_sliced.
# DataFrame.insert() raises ValueError when the column already exists, which makes a
# second run of this cell fail; insert only when the column is missing and otherwise
# just overwrite its values.
if "translation_id" in df_sliced.columns:
    df_sliced["translation_id"] = translation_id
else:
    df_sliced.insert(0, "translation_id", translation_id)

In [132]:
df_sliced.head(2)

Unnamed: 0,translation_id,surah_number,aya_number,compare_to_surah_number,compare_to_aya_number,cosine_similarity_translation,cosine_similarity_translation_clean,cosine_similarity_translation_tokenized,cosine_similarity_translation_without_stop_words,cosine_similarity_arabic_lemmatized,cosine_similarity_arabic_lemmatized_without_stop_words
0,789,1,1,1,2,0,0,0,0,0.25,0.25
1,789,1,1,1,3,0,0,0,0,0.707107,0.707107


## Save resultant DataFrame to SQL

In [168]:
import mysql.connector
from sqlalchemy import create_engine

In [169]:
import connection.config

In [170]:
# Build the MySQL connection URL from the project config, then create the SQLAlchemy engine
db_url = "mysql+pymysql://{user}:{passwd}@{host}/{db}".format(**connection.config.config)
engine = create_engine(db_url)

In [150]:
#connection = pymysql.connect(host=hostname, user=uname, passwd=pwd, db=dbname)
# connect = pymysql.connect(**connection.config)

In [171]:
# Append df_sliced to the MySQL table, writing 500 rows per chunk.
# if_exists='append' adds to whatever the table already holds.
df_sliced.to_sql(
    'tbl_similarity_score',
    engine,
    index=False,
    if_exists='append',
    chunksize=500,
    method=None,
)

19361898