In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
import plotly.graph_objs as go
import torch

import openai
import time
import logging
import os

import src.config as config

# Configure the root logger so that messages at INFO level and above are reported.
logging.basicConfig(level=logging.INFO)

In [51]:
import json
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objects as go

def _load_hadiths_with_embeddings(json_file, embedding_dim):
    """Read ``json_file`` and keep the records whose embedding has the expected length.

    Returns a ``(records, matrix)`` pair: the kept records in file order, and a
    float array of shape ``(len(records), embedding_dim)`` whose i-th row is the
    embedding of the i-th kept record.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # can fail on the Arabic text stored in the records.
    with open(json_file, 'r', encoding='utf-8') as file:
        hadith_json = json.load(file)

    records = []
    vectors = []
    for hadith in hadith_json:
        try:
            vector = np.asarray(hadith.get('embeddings'), dtype=float)
        except (AttributeError, TypeError, ValueError):
            # Not a mapping, or the embedding is ragged / non-numeric: skip it.
            continue
        # A missing embedding becomes a 0-d array and is rejected here as well.
        if vector.shape == (embedding_dim,):
            records.append(hadith)
            vectors.append(vector)

    matrix = np.vstack(vectors) if vectors else np.empty((0, embedding_dim))
    return records, matrix


def visualize_embeddings_from_json(json_file, embedding_dim=1536, random_state=7):
    """Project hadith embeddings to 3D with t-SNE and export an interactive plot.

    The JSON file is expected to hold a list of records, each providing the keys
    ``embeddings``, ``Chapter_Number``, ``Chapter_Arabic`` and ``Arabic_Hadith``.
    Records whose embedding is missing or does not have exactly
    ``embedding_dim`` values are ignored. One ``Scatter3d`` trace is drawn per
    chapter, and the figure is written as HTML next to the input file (same
    path, with the extension replaced by ``.html``).

    Parameters
    ----------
    json_file : str or path-like
        Path to the JSON file containing the hadith records.
    embedding_dim : int, default 1536
        Required length of each embedding vector.
    random_state : int or None, default 7
        Seed used for t-SNE and for the per-chapter colours, so that repeated
        runs on the same file produce the same figure.

    Raises
    ------
    ValueError
        If fewer than two records carry a valid embedding.
    """
    records, embeddings = _load_hadiths_with_embeddings(json_file, embedding_dim)

    n_samples = len(records)
    if n_samples < 2:
        raise ValueError(
            f'Need at least 2 records with a {embedding_dim}-dimensional embedding '
            f'to run t-SNE, found {n_samples} in {json_file!r}'
        )

    # Perplexity must stay below the number of samples: use 30 or n_samples - 1,
    # whichever is smaller, and actually hand it to TSNE.
    perplexity = min(n_samples - 1, 30)
    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
    embeddings_3d = tsne.fit_transform(embeddings)

    # Built once; its default 0..n-1 index lines up with the rows of embeddings_3d.
    hadiths_df = pd.DataFrame(records)

    # Dedicated generator so the chapter colours are reproducible.
    rng = np.random.default_rng(random_state)

    # One trace per chapter, in order of first appearance, each with its own colour.
    # Note: groupby leaves out records whose Chapter_Number is missing.
    fig = go.Figure()
    for _, chapter_rows in hadiths_df.groupby('Chapter_Number', sort=False):
        rgb = rng.integers(0, 256, size=3)
        color = f'rgb({",".join(map(str, rgb))})'
        rows = chapter_rows.index.to_numpy()
        fig.add_trace(go.Scatter3d(
            x=embeddings_3d[rows, 0],
            y=embeddings_3d[rows, 1],
            z=embeddings_3d[rows, 2],
            text=chapter_rows["Arabic_Hadith"],
            mode='markers',
            marker=dict(
                size=10,
                color=color,
                opacity=0.8,
            ),
            name=chapter_rows["Chapter_Arabic"].iloc[0]
        ))

    # Set plot layout
    fig.update_layout(
        title='Distribution of Ibn Maja Hadiths in 3D',
        scene=dict(
            xaxis_title='t-SNE Dimension 1',
            yaxis_title='t-SNE Dimension 2',
            zaxis_title='t-SNE Dimension 3'
        )
    )

    # Export plot as HTML file, replacing whatever extension the input path has.
    output_base = os.path.splitext(os.fspath(json_file))[0]
    fig.write_html(f'{output_base}.html')


In [52]:
# Build the 3D t-SNE plot for this embeddings file; the HTML export is written
# next to the input, with the same path stem and a .html extension.
visualize_embeddings_from_json('./data/embeddings/ibnmaja_GPT_embeddings_v2.json')