In [7]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
import plotly.graph_objs as go
import torch

In [19]:
surah_names = {
    '1': 'Al-Fatihah',
    '2': 'Al-Baqarah',
    '3': 'Aal-E-Imran',
    '4': 'An-Nisa',
    '5': 'Al-Maidah',
    '6': 'Al-An\'am',
    '7': 'Al-A\'raf',
    '8': 'Al-Anfal',
    '9': 'At-Tawbah',
    '10': 'Yunus',
    '11': 'Hud',
    '12': 'Yusuf',
    '13': 'Ar-Ra\'d',
    '14': 'Ibrahim',
    '15': 'Al-Hijr',
    '16': 'An-Nahl',
    '17': 'Al-Isra',
    '18': 'Al-Kahf',
    '19': 'Maryam',
    '20': 'Taha',
    '21': 'Al-Anbiya',
    '22': 'Al-Hajj',
    '23': 'Al-Mu\'minun',
    '24': 'An-Nur',
    '25': 'Al-Furqan',
    '26': 'Ash-Shu\'ara',
    '27': 'An-Naml',
    '28': 'Al-Qasas',
    '29': 'Al-Ankabut',
    '30': 'Ar-Rum',
    '31': 'Luqman',
    '32': 'As-Sajda',
    '33': 'Al-Ahzab',
    '34': 'Saba',
    '35': 'Fatir',
    '36': 'Ya-Sin',
    '37': 'As-Saffat',
    '38': 'Sad',
    '39': 'Az-Zumar',
    '40': 'Ghafir',
    '41': 'Fussilat',
    '42': 'Ash-Shura',
    '43': 'Az-Zukhruf',
    '44': 'Ad-Dukhan',
    '45': 'Al-Jathiya',
    '46': 'Al-Ahqaf',
    '47': 'Muhammad',
    '48': 'Al-Fath',
    '49': 'Al-Hujraat',
    '50': 'Qaf',
    '51': 'Adh-Dhariyat',
    '52': 'At-Tur',
    '53': 'An-Najm',
    '54': 'Al-Qamar',
    '55': 'Ar-Rahman',
    '56': 'Al-Waqi\'a',
    '57': 'Al-Hadid',
    '58': 'Al-Mujadila',
    '59': 'Al-Hashr',
    '60': 'Al-Mumtahina',
    '61': 'As-Saff',
    '62': 'Al-Jumu\'a',
    '63': 'Al-Munafiqun',
    '64': 'At-Taghabun',
    '65': 'At-Talaq',
    '66': 'At-Tahrim',
    '67': 'Al-Mulk',
    '68': 'Al-Qalam',
    '69': 'Al-Haaqqa',
    '70': 'Al-Ma\'arij',
    '71': 'Nuh',
    '72': 'Al-Jinn',
    '73': 'Al-Muzzammil',
    '74': 'Al-Muddaththir',
    '75': 'Al-Qiyama',
    '76': 'Al-Insan',
    '77': 'Al-Mursalat',
    '78': 'An-Naba',
    '79': 'An-Nazi\'at',
    '80': 'Abasa',
    '81': 'At-Takwir',
    '82': 'Al-Infitar',
    '83': 'Al-Mutaffifin',
    '84': 'Al-Inshiqaq',
    '85': 'Al-Buruj',
    '86': 'At-Tariq',
    '87': 'Al-A\'la',
    '88': 'Al-Ghashiya',
    '89': 'Al-Fajr',
    '90': 'Al-Balad',
    '91': 'Ash-Shams',
    '92': 'Al-Lail',
    '93': 'Ad-Duha',
    '94': 'Al-Inshirah',
    '95': 'At-Tin',
    '96': 'Al-Alaq',
    '97': 'Al-Qadr',
    '98': 'Al-Bayyina',
    '99': 'Az-Zalzala',
    '100': 'Al-Adiyat',
    '101': 'Al-Qari\'a',
    '102': 'At-Takathur',
    '103': 'Al-Asr',
    '104': 'Al-Humaza',
    '105': 'Al-Fil',
    '106': 'Quraysh',
    '107': 'Al-Ma\'un',
    '108': 'Al-Kawthar',
    '109': 'Al-Kafirun',
    '110': 'An-Nasr',
    '111': 'Al-Masad',
    '112': 'Al-Ikhlas',
    '113': 'Al-Falaq',
    '114': 'An-Nas'
}

In [4]:
# Step 2: Read the Quran and its translation from text files

with open('data/quran_en.txt', 'r', encoding='utf-8') as f:
    quran_en = f.readlines()
quran_en = [verse.strip().split('|') for verse in quran_en]

with open('data/quran_ar.txt', 'r', encoding='utf-8') as f:
    quran_ar = f.readlines()
quran_ar = [verse.strip().split('|') for verse in quran_ar]

# Merge Arabic and English Quran data
quran_data = []
for i in range(len(quran_en)):
    quran_data.append({
        'surah': int(quran_en[i][0]),
        'ayah': int(quran_en[i][1]),
        'text_en': quran_en[i][2],
        'text_ar': quran_ar[i][2]
    })

In [9]:
quran_data[6200]

{'surah': 107,
 'ayah': 4,
 'text_en': 'Woe, therefore, Unto such performers of prayer.',
 'text_ar': 'فويل للمصلين'}

In [6]:
# Step 3: Calculate the embeddings of the Quran verses using BERT

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Encode the text using the tokenizer and extract embeddings using the model
embeddings = []
for verse in quran_data:
    text = verse['text_en']
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoding)
        embedding = output.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)

In [15]:
# Step 4: Visualize the Quran verses

# Reduce dimensionality of embeddings using t-SNE
tsne = TSNE(n_components=3, random_state=7)
embeddings_3d = tsne.fit_transform(np.array(embeddings))

In [20]:
# Create a dictionary to map each surah to a unique color
colors = {}
for surah in pd.DataFrame(quran_data)['surah'].unique():
    colors[surah] = np.random.choice(range(256), size=3)

# Create scatter plot
fig = go.Figure()
for surah in pd.DataFrame(quran_data)['surah'].unique():
    data = pd.DataFrame(quran_data)[pd.DataFrame(quran_data)['surah'] == surah]
    color = f'rgb({",".join(map(str, colors[surah]))})'
    fig.add_trace(go.Scatter3d(
        x=embeddings_3d[data.index, 0],
        y=embeddings_3d[data.index, 1],
        z=embeddings_3d[data.index, 2],
        text=data['text_ar'],
        mode='markers',
        marker=dict(
            size=10,
            color=color,
            opacity=0.8,
        ),
        name=surah_names[f'{surah}']
    ))

# Set plot layout
fig.update_layout(
    title='Distribution of Quran Verses',
    xaxis=dict(title='t-SNE Dimension 1'),
    yaxis=dict(title='t-SNE Dimension 2')
)

# Export plot as HTML file
fig.write_html('quran_verses.html')

KeyError: 1