In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
import plotly.graph_objs as go
import torch

import openai
import time
import logging
import os

import src.config as config

# Give the OpenAI client the API key read from the project's src.config module,
# so the key value itself is not written into this notebook.
openai.api_key = config.OPENAI_API_KEY
# Configure logging so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
# Lookup table from surah number to surah name (Latin script).
# Keys are the surah numbers written as strings, "1" through "114", so callers
# must stringify a numeric surah number before indexing.
surah_names = {
    "1": "Al-Fatihah",
    "2": "Al-Baqarah",
    "3": "Aal-E-Imran",
    "4": "An-Nisa",
    "5": "Al-Maidah",
    "6": "Al-An'am",
    "7": "Al-A'raf",
    "8": "Al-Anfal",
    "9": "At-Tawbah",
    "10": "Yunus",
    "11": "Hud",
    "12": "Yusuf",
    "13": "Ar-Ra'd",
    "14": "Ibrahim",
    "15": "Al-Hijr",
    "16": "An-Nahl",
    "17": "Al-Isra",
    "18": "Al-Kahf",
    "19": "Maryam",
    "20": "Taha",
    "21": "Al-Anbiya",
    "22": "Al-Hajj",
    "23": "Al-Mu'minun",
    "24": "An-Nur",
    "25": "Al-Furqan",
    "26": "Ash-Shu'ara",
    "27": "An-Naml",
    "28": "Al-Qasas",
    "29": "Al-Ankabut",
    "30": "Ar-Rum",
    "31": "Luqman",
    "32": "As-Sajda",
    "33": "Al-Ahzab",
    "34": "Saba",
    "35": "Fatir",
    "36": "Ya-Sin",
    "37": "As-Saffat",
    "38": "Sad",
    "39": "Az-Zumar",
    "40": "Ghafir",
    "41": "Fussilat",
    "42": "Ash-Shura",
    "43": "Az-Zukhruf",
    "44": "Ad-Dukhan",
    "45": "Al-Jathiya",
    "46": "Al-Ahqaf",
    "47": "Muhammad",
    "48": "Al-Fath",
    "49": "Al-Hujraat",
    "50": "Qaf",
    "51": "Adh-Dhariyat",
    "52": "At-Tur",
    "53": "An-Najm",
    "54": "Al-Qamar",
    "55": "Ar-Rahman",
    "56": "Al-Waqi'a",
    "57": "Al-Hadid",
    "58": "Al-Mujadila",
    "59": "Al-Hashr",
    "60": "Al-Mumtahina",
    "61": "As-Saff",
    "62": "Al-Jumu'a",
    "63": "Al-Munafiqun",
    "64": "At-Taghabun",
    "65": "At-Talaq",
    "66": "At-Tahrim",
    "67": "Al-Mulk",
    "68": "Al-Qalam",
    "69": "Al-Haaqqa",
    "70": "Al-Ma'arij",
    "71": "Nuh",
    "72": "Al-Jinn",
    "73": "Al-Muzzammil",
    "74": "Al-Muddaththir",
    "75": "Al-Qiyama",
    "76": "Al-Insan",
    "77": "Al-Mursalat",
    "78": "An-Naba",
    "79": "An-Nazi'at",
    "80": "Abasa",
    "81": "At-Takwir",
    "82": "Al-Infitar",
    "83": "Al-Mutaffifin",
    "84": "Al-Inshiqaq",
    "85": "Al-Buruj",
    "86": "At-Tariq",
    "87": "Al-A'la",
    "88": "Al-Ghashiya",
    "89": "Al-Fajr",
    "90": "Al-Balad",
    "91": "Ash-Shams",
    "92": "Al-Lail",
    "93": "Ad-Duha",
    "94": "Al-Inshirah",
    "95": "At-Tin",
    "96": "Al-Alaq",
    "97": "Al-Qadr",
    "98": "Al-Bayyina",
    "99": "Az-Zalzala",
    "100": "Al-Adiyat",
    "101": "Al-Qari'a",
    "102": "At-Takathur",
    "103": "Al-Asr",
    "104": "Al-Humaza",
    "105": "Al-Fil",
    "106": "Quraysh",
    "107": "Al-Ma'un",
    "108": "Al-Kawthar",
    "109": "Al-Kafirun",
    "110": "An-Nasr",
    "111": "Al-Masad",
    "112": "Al-Ikhlas",
    "113": "Al-Falaq",
    "114": "An-Nas",
}

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objects as go

def visualize_embeddings_from_json(json_file, random_state=7):
    """Project verse embeddings to 3-D with t-SNE and export an interactive plot.

    The JSON file must contain a list of objects, each providing the keys
    ``'embedding'`` (a numeric vector), ``'surah_number'`` and
    ``'arabic_diacritics'`` (used as the hover text). Verses are drawn as one
    ``Scatter3d`` trace per surah, each surah in its own random colour, and
    the figure is written to ``f'{json_file}.html'``.

    Args:
        json_file: Path of the embeddings JSON file to read.
        random_state: Seed (int or None) used for both the t-SNE projection
            and the per-surah colours, so repeated runs give the same figure.

    Returns:
        None. The result is the HTML file written next to the input.

    Raises:
        ValueError: If the file holds fewer than two embeddings.
        KeyError: If an item lacks ``'embedding'`` or a surah number has no
            entry in ``surah_names``.
    """
    # JSON text is UTF-8; be explicit so the Arabic verse text decodes the
    # same way regardless of the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as file:
        quran_json = json.load(file)

    # Stack the per-verse vectors into a single (n_verses, dim) matrix.
    embeddings = np.array([item['embedding'] for item in quran_json])
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError(
            f'Need at least 2 embeddings to run t-SNE, got {n_samples}'
        )

    # t-SNE requires perplexity < n_samples; cap it so small files work too.
    # Use either 30 or n_samples - 1, whichever is smaller.
    perplexity = min(n_samples - 1, 30)
    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
    embeddings_3d = tsne.fit_transform(embeddings)

    # Build the verse table once; rows line up with the rows of embeddings_3d.
    verses = pd.DataFrame(quran_json)
    surah_column = verses['surah_number']

    # Seeded generator so every surah keeps the same colour between runs.
    rng = np.random.default_rng(random_state)

    # Create scatter plot: one trace per surah, in order of first appearance.
    fig = go.Figure()
    for surah in surah_column.unique():
        rows = (surah_column == surah).to_numpy()
        rgb = rng.integers(0, 256, size=3)
        color = f'rgb({",".join(map(str, rgb))})'
        fig.add_trace(go.Scatter3d(
            x=embeddings_3d[rows, 0],
            y=embeddings_3d[rows, 1],
            z=embeddings_3d[rows, 2],
            text=verses.loc[rows, 'arabic_diacritics'],
            mode='markers',
            marker=dict(
                size=10,
                color=color,
                opacity=0.8,
            ),
            # surah_names is keyed by the surah number as a string.
            name=surah_names[f'{surah}']
        ))

    # Set plot layout
    fig.update_layout(
        title='Distribution of Quran Verses',
        scene=dict(
            xaxis_title='t-SNE Dimension 1',
            yaxis_title='t-SNE Dimension 2',
            zaxis_title='t-SNE Dimension 3'
        )
    )

    # Export plot as HTML file
    fig.write_html(f'{json_file}.html')


In [None]:
# Render the plot for the embeddings JSON at this path; the function writes its
# HTML output to the same path with '.html' appended.
visualize_embeddings_from_json('./data/embeddings/v2/quran_GPT_embeddings_v2.json')