In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
import plotly.graph_objs as go
import torch

import openai
import time
import logging
import os

# Read the OpenAI key from the OPENAI_API_KEY environment variable so a real
# key never has to be typed into (and committed with) the notebook. The
# original placeholder is kept as the fallback when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Use your own key here")
# Show INFO-level progress messages emitted by the embedding helpers below.
logging.basicConfig(level=logging.INFO)

In [2]:
# Maps a surah number, as a string ('1' .. '114'), to its transliterated name.
# The names are listed in surah order; the keys are generated from the position.
surah_names = {
    str(number): name
    for number, name in enumerate(
        (
            # 1-10
            "Al-Fatihah",
            "Al-Baqarah",
            "Aal-E-Imran",
            "An-Nisa",
            "Al-Maidah",
            "Al-An'am",
            "Al-A'raf",
            "Al-Anfal",
            "At-Tawbah",
            "Yunus",
            # 11-20
            "Hud",
            "Yusuf",
            "Ar-Ra'd",
            "Ibrahim",
            "Al-Hijr",
            "An-Nahl",
            "Al-Isra",
            "Al-Kahf",
            "Maryam",
            "Taha",
            # 21-30
            "Al-Anbiya",
            "Al-Hajj",
            "Al-Mu'minun",
            "An-Nur",
            "Al-Furqan",
            "Ash-Shu'ara",
            "An-Naml",
            "Al-Qasas",
            "Al-Ankabut",
            "Ar-Rum",
            # 31-40
            "Luqman",
            "As-Sajda",
            "Al-Ahzab",
            "Saba",
            "Fatir",
            "Ya-Sin",
            "As-Saffat",
            "Sad",
            "Az-Zumar",
            "Ghafir",
            # 41-50
            "Fussilat",
            "Ash-Shura",
            "Az-Zukhruf",
            "Ad-Dukhan",
            "Al-Jathiya",
            "Al-Ahqaf",
            "Muhammad",
            "Al-Fath",
            "Al-Hujraat",
            "Qaf",
            # 51-60
            "Adh-Dhariyat",
            "At-Tur",
            "An-Najm",
            "Al-Qamar",
            "Ar-Rahman",
            "Al-Waqi'a",
            "Al-Hadid",
            "Al-Mujadila",
            "Al-Hashr",
            "Al-Mumtahina",
            # 61-70
            "As-Saff",
            "Al-Jumu'a",
            "Al-Munafiqun",
            "At-Taghabun",
            "At-Talaq",
            "At-Tahrim",
            "Al-Mulk",
            "Al-Qalam",
            "Al-Haaqqa",
            "Al-Ma'arij",
            # 71-80
            "Nuh",
            "Al-Jinn",
            "Al-Muzzammil",
            "Al-Muddaththir",
            "Al-Qiyama",
            "Al-Insan",
            "Al-Mursalat",
            "An-Naba",
            "An-Nazi'at",
            "Abasa",
            # 81-90
            "At-Takwir",
            "Al-Infitar",
            "Al-Mutaffifin",
            "Al-Inshiqaq",
            "Al-Buruj",
            "At-Tariq",
            "Al-A'la",
            "Al-Ghashiya",
            "Al-Fajr",
            "Al-Balad",
            # 91-100
            "Ash-Shams",
            "Al-Lail",
            "Ad-Duha",
            "Al-Inshirah",
            "At-Tin",
            "Al-Alaq",
            "Al-Qadr",
            "Al-Bayyina",
            "Az-Zalzala",
            "Al-Adiyat",
            # 101-110
            "Al-Qari'a",
            "At-Takathur",
            "Al-Asr",
            "Al-Humaza",
            "Al-Fil",
            "Quraysh",
            "Al-Ma'un",
            "Al-Kawthar",
            "Al-Kafirun",
            "An-Nasr",
            # 111-114
            "Al-Masad",
            "Al-Ikhlas",
            "Al-Falaq",
            "An-Nas",
        ),
        start=1,
    )
}

In [3]:
# Step 2: Read the Quran and its translation from text files
# Each non-blank line is parsed as "surah|ayah|text".

with open('data/quran_en.txt', 'r', encoding='utf-8') as f:
    quran_en = f.readlines()
# maxsplit=2 keeps any '|' that appears inside the verse text itself;
# blank lines (e.g. a trailing newline at end of file) are skipped.
quran_en = [verse.strip().split('|', 2) for verse in quran_en if verse.strip()]

with open('data/quran_ar.txt', 'r', encoding='utf-8') as f:
    quran_ar = f.readlines()

# Opening formula that may be prefixed to a verse's text in the Arabic file.
bismillah = 'بسم الله الرحمن الرحيم'

quran_ar_new = []
for line in quran_ar:
    line = line.strip()
    if not line:
        continue
    surah, ayah, text = line.split('|', 2)
    if text.startswith(bismillah):
        # Remove only the leading formula; a later occurrence inside the
        # verse is part of the verse and must stay.
        remainder = text[len(bismillah):].strip()
        if len(remainder) > 1:
            quran_ar_new.append([surah, ayah, remainder])
        else:
            # The verse consists of the formula alone: keep it as the text.
            quran_ar_new.append([surah, ayah, bismillah])
    else:
        quran_ar_new.append([surah, ayah, text])

# Merge Arabic and English Quran data
# The two files are paired by position, so a length mismatch would attach
# translations to the wrong verses; fail loudly instead of merging silently.
if len(quran_en) != len(quran_ar_new):
    raise ValueError(
        f'Verse count mismatch: {len(quran_en)} English lines vs '
        f'{len(quran_ar_new)} Arabic lines'
    )

quran_data = [
    {
        'surah': int(en_fields[0]),
        'ayah': int(en_fields[1]),
        'text_en': en_fields[2],
        'text_ar': ar_fields[2]
    }
    for en_fields, ar_fields in zip(quran_en, quran_ar_new)
]

In [4]:
# Sanity check: show the first and the last merged verse record.
# Index -1 selects the last record without assuming the exact verse count.
print(quran_data[0])
print(quran_data[-1])

{'surah': 1, 'ayah': 1, 'text_en': 'In the name of Allah, the Compassionate, the Merciful.', 'text_ar': 'بسم الله الرحمن الرحيم'}
{'surah': 114, 'ayah': 6, 'text_en': 'Whether of Jinn or of mankind.', 'text_ar': 'من الجنة والناس'}


In [5]:
import json

def export_embeddings_to_json(embeddings: list, file_path: str):
    """Serialize ``embeddings`` to ``file_path`` as pretty-printed UTF-8 JSON.

    Args:
        embeddings: JSON-serializable object (the notebook passes a list of
            per-verse dicts).
        file_path: Destination path; the file is opened in write mode, so an
            existing file is overwritten.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output uses a four-space indent.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(file_path, mode='w', encoding='utf-8') as json_out:
        json.dump(embeddings, json_out, **dump_options)

In [19]:
import json
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objects as go

def visualize_embeddings_from_json(json_file):
    """Project stored verse embeddings to 3-D with t-SNE and save an interactive plot.

    Args:
        json_file: Path to a JSON file holding a list of records; each record
            must provide the keys 'surah', 'text_ar' and 'embedding'.

    Side effects:
        Writes a Plotly HTML figure to ``f'{json_file}.html'`` with one
        ``Scatter3d`` trace per surah. Nothing is returned.

    Raises:
        ValueError: If the JSON file contains no records.
    """
    # The export step writes UTF-8 with ensure_ascii=False, so decode as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(json_file, 'r', encoding='utf-8') as file:
        quran_json = json.load(file)

    if not quran_json:
        raise ValueError(f'No embedding records found in {json_file}')

    # Extract embeddings from JSON and stack them into one 2-D array
    embeddings = np.array([item['embedding'] for item in quran_json])

    # Reduce dimensionality of embeddings using t-SNE
    tsne = TSNE(n_components=3, random_state=7)
    embeddings_3d = tsne.fit_transform(embeddings)

    # Build the per-verse metadata once, without the (large) embedding column.
    # The default RangeIndex lines up with the rows of `embeddings_3d`.
    verses = pd.DataFrame({
        'surah': [item['surah'] for item in quran_json],
        'text_ar': [item['text_ar'] for item in quran_json],
    })

    # Seeded generator so every run assigns the same colour to each surah
    # (same seed as the t-SNE projection above).
    color_rng = np.random.RandomState(7)

    # Create scatter plot: one trace per surah, in order of first appearance
    fig = go.Figure()
    for surah, group in verses.groupby('surah', sort=False):
        rgb = color_rng.choice(range(256), size=3)
        color = f'rgb({",".join(map(str, rgb))})'
        rows = group.index.to_numpy()
        fig.add_trace(go.Scatter3d(
            x=embeddings_3d[rows, 0],
            y=embeddings_3d[rows, 1],
            z=embeddings_3d[rows, 2],
            text=group['text_ar'],
            mode='markers',
            marker=dict(
                size=10,
                color=color,
                opacity=0.8,
            ),
            # Fall back to a generic label rather than failing on a surah
            # number that is missing from `surah_names`.
            name=surah_names.get(f'{surah}', f'Surah {surah}')
        ))

    # Set plot layout
    fig.update_layout(
        title='Distribution of Quran Verses',
        scene=dict(
            xaxis_title='t-SNE Dimension 1',
            yaxis_title='t-SNE Dimension 2',
            zaxis_title='t-SNE Dimension 3'
        )
    )

    # Export plot as HTML file
    fig.write_html(f'{json_file}.html')


In [6]:
def quran_bert_embeddings(quran_data: list, model_name: str = 'bert-base-uncased') -> list:
    """Compute one embedding per verse from its English text.

    Args:
        quran_data: Iterable of dicts with the keys 'surah', 'ayah',
            'text_en' and 'text_ar'.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``. Defaults to the checkpoint the
            notebook originally hard-coded.

    Returns:
        A list with one dict per verse containing 'surah', 'ayah', 'text_en',
        'text_ar' and 'embedding' (a plain list of floats, so the result is
        JSON-serializable).
    """
    # Load pre-trained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    BERT_embeddings = []
    # Inference only: no gradients are needed for any verse.
    with torch.no_grad():
        for verse in quran_data:
            surah = verse['surah']
            ayah = verse['ayah']
            text_en = verse['text_en']
            text_ar = verse['text_ar']
            encoding = tokenizer(text_en, return_tensors='pt', padding=True, truncation=True)
            output = model(**encoding)
            # Average the last hidden state over dim 1, then drop only the
            # leading batch dimension (a single text is encoded per call).
            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).numpy()
            BERT_embeddings.append(
                {
                    'surah': int(surah),
                    'ayah': int(ayah),
                    'text_en': text_en,
                    'text_ar': text_ar,
                    'embedding': embedding.tolist() # Convert embedding numpy array to list for JSON serialization
                })
            logging.info(f"Extracted BERT embedding for surah {surah}, ayah {ayah}")
    return BERT_embeddings

In [7]:
# Embed every verse with BERT, then persist the records as JSON.
BERT_embeddings = quran_bert_embeddings(quran_data)
export_embeddings_to_json(
    embeddings=BERT_embeddings,
    file_path='data/quran_BERT_embeddings.json',
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:root:Extracted BERT embedding for surah 1, ayah 1
INFO:root:Extracted BERT embedding for surah 1, ayah 2
INFO:root:Extracted BERT embedding for surah 

In [20]:
# Render the BERT embeddings; the plot is written next to the JSON file.
visualize_embeddings_from_json(json_file='./data/quran_BERT_embeddings.json')

In [21]:
def gpt3_embedding(content: str, engine: str='text-embedding-ada-002') -> list:
    """Request an embedding for ``content`` via ``openai.Embedding.create``.

    Args:
        content: Text to embed.
        engine: Value forwarded as the ``engine`` argument of the API call.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.

    Raises:
        Exception: Any error from the request or from reading the response is
            logged and then re-raised, so callers can retry instead of
            receiving ``None``.
    """
    try:
        response = openai.Embedding.create(input=content, engine=engine)
        vector = response['data'][0]['embedding']
        return vector
    except Exception as e:
        logging.error(f'Embedding failed. Error message: {e}')
        raise

def extract_embedding(text: str, max_retries: int = 5, retry_delay: float = 5) -> list:
    """Embed ``text``, retrying a bounded number of times on failure.

    The first attempt sends ``text`` unchanged. After a failure, text longer
    than 8191 characters is cut to its first 8191 characters before retrying.
    NOTE(review): 8191 looks like the embedding model's token limit, but this
    check counts characters - confirm the intended unit.

    Args:
        text: Text to embed.
        max_retries: Number of additional attempts after the first failure.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The embedding vector returned by ``gpt3_embedding``.

    Raises:
        RuntimeError: If every attempt fails; the last underlying error is
            attached as the cause.
    """
    attempt_text = text
    last_error = None
    for attempt in range(max_retries + 1):
        try:
            return gpt3_embedding(attempt_text)
        # Catch Exception (not a bare except) so KeyboardInterrupt can still
        # stop a long-running notebook cell.
        except Exception as e:
            last_error = e
            logging.error(f'Trying to get the embedding for text. Error message: {e}')
            if attempt == max_retries:
                break
            if len(attempt_text) > 8191:
                logging.warning('[OPENAI ERROR] Trying to get shorter input < 8191 for text...')
                attempt_text = attempt_text[:8191]
            time.sleep(retry_delay)
    raise RuntimeError(
        f'Could not get an embedding after {max_retries + 1} attempts'
    ) from last_error

def quran_gpt_embedding(quran_data: list) -> list:
    """Build one embedding record per verse from its English text.

    For each verse dict (keys 'surah', 'ayah', 'text_en', 'text_ar') this logs
    a progress line, obtains the embedding through ``extract_embedding`` and
    then sleeps for one second before moving on to the next verse.

    Returns:
        A list of dicts with the keys 'surah', 'ayah', 'text_en', 'text_ar'
        and 'embedding', in input order.
    """
    records = []
    for position, verse in enumerate(quran_data, start=1):
        surah, ayah = verse['surah'], verse['ayah']
        text_en, text_ar = verse['text_en'], verse['text_ar']
        logging.info(f'Processing verse {position}: Surah {surah}, Ayah {ayah}')
        vector = extract_embedding(text_en)
        records.append(dict(
            surah=int(surah),
            ayah=int(ayah),
            text_en=text_en,
            text_ar=text_ar,
            embedding=vector,  # stored exactly as returned by extract_embedding
        ))
        # Pause between verses before issuing the next request.
        time.sleep(1)
    return records

In [22]:
# Embed every verse through the OpenAI API, then persist the records as JSON.
GPT_embeddings = quran_gpt_embedding(quran_data)
export_embeddings_to_json(
    embeddings=GPT_embeddings,
    file_path='data/quran_GPT_embeddings.json',
)

INFO:root:Processing verse 1: Surah 1, Ayah 1
INFO:root:Processing verse 2: Surah 1, Ayah 2
INFO:root:Processing verse 3: Surah 1, Ayah 3
INFO:root:Processing verse 4: Surah 1, Ayah 4
INFO:root:Processing verse 5: Surah 1, Ayah 5
INFO:root:Processing verse 6: Surah 1, Ayah 6
INFO:root:Processing verse 7: Surah 1, Ayah 7
INFO:root:Processing verse 8: Surah 2, Ayah 1
INFO:root:Processing verse 9: Surah 2, Ayah 2
INFO:root:Processing verse 10: Surah 2, Ayah 3
INFO:root:Processing verse 11: Surah 2, Ayah 4
INFO:root:Processing verse 12: Surah 2, Ayah 5
INFO:root:Processing verse 13: Surah 2, Ayah 6
INFO:root:Processing verse 14: Surah 2, Ayah 7
INFO:root:Processing verse 15: Surah 2, Ayah 8
INFO:root:Processing verse 16: Surah 2, Ayah 9
INFO:root:Processing verse 17: Surah 2, Ayah 10
INFO:root:Processing verse 18: Surah 2, Ayah 11
INFO:root:Processing verse 19: Surah 2, Ayah 12
INFO:root:Processing verse 20: Surah 2, Ayah 13
INFO:root:Processing verse 21: Surah 2, Ayah 14
INFO:root:Process

In [None]:
# Render the GPT embeddings; the plot is written next to the JSON file.
visualize_embeddings_from_json(json_file='./data/quran_GPT_embeddings.json')