In [23]:
import json
import os
import warnings
from typing import List, Dict, Tuple, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
warnings.filterwarnings("ignore")

In [24]:
# Global plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Default figure size and base font size, set after the style sheet so they
# are not overwritten by it.
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "font.size": 10,
})

In [25]:
def load_json(file_path: str) -> List[Dict]:
    """Load a list of records from a JSON file.

    This is a best-effort loader: every failure is reported with ``print``
    and signalled by an empty list instead of an exception.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The parsed top-level JSON array. An empty list is returned when the
        file does not exist, cannot be read or parsed, or when its top-level
        value is not a JSON array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON file: {e}")
        return []
    except Exception as e:
        # Deliberate catch-all (permissions, bad encoding, ...) so a broken
        # file never aborts the caller; the cause is still printed.
        print(f"Error loading file {file_path}: {e}")
        return []

    # Valid JSON is not necessarily a list: an object or scalar at the top
    # level would break the declared return type and callers indexing data[0].
    if not isinstance(data, list):
        print(
            f"Expected a JSON array at the top level of {file_path}, "
            f"got {type(data).__name__}"
        )
        return []

    print(f"Successfully loaded {len(data)} records from {file_path}")
    return data

In [26]:
# Directory holding the example data. The default is the original location;
# set the SEMICON_RAG_DATA_DIR environment variable to point the notebook at
# another directory on machines where that absolute path does not exist.
DATA_DIR = os.environ.get("SEMICON_RAG_DATA_DIR", "/home/jhwang/semicon-rag/data")
file_path = os.path.join(DATA_DIR, "examples_text_summary_pair.json")

data = load_json(file_path)

if data:
    print(f"Total records loaded: {len(data)}")
    print(f"Sample record keys: {list(data[0].keys())}")
    # .get() keeps this preview from raising KeyError if the first record
    # happens to have no 'id' field.
    print(f"Sample record ID: {data[0].get('id')}")
else:
    print("No data loaded")

Successfully loaded 122 records from /home/jhwang/semicon-rag/data/examples_text_summary_pair.json
Total records loaded: 122
Sample record keys: ['id', 'original', 'summary']
Sample record ID: Example 1.1


In [27]:
# Structural overview of the loaded records (skipped entirely when nothing was loaded)
if data:
    print("Data Structure Analysis:")
    print(f"Number of examples: {len(data)}")
    # Keys are taken from the first record only
    print(f"Keys in each record: {list(data[0].keys())}")

    # Count records whose text field is absent or empty (falsy)
    missing_summaries = len([rec for rec in data if not rec.get('summary')])
    missing_originals = len([rec for rec in data if not rec.get('original')])

    print(f"Records missing summaries: {missing_summaries}")
    print(f"Records missing originals: {missing_originals}")

    # IDs of the leading records
    example_ids = [rec['id'] for rec in data[:10]]
    print(f"First 10 example IDs: {example_ids}")

    # len() of every non-empty text field; records without the field are ignored
    summary_lengths = [
        len(text) for text in (rec.get('summary') for rec in data) if text
    ]
    original_lengths = [
        len(text) for text in (rec.get('original') for rec in data) if text
    ]

    print(f"Summary length stats - Mean: {np.mean(summary_lengths):.1f}, Min: {min(summary_lengths)}, Max: {max(summary_lengths)}")
    print(f"Original length stats - Mean: {np.mean(original_lengths):.1f}, Min: {min(original_lengths)}, Max: {max(original_lengths)}")

Data Structure Analysis:
Number of examples: 122
Keys in each record: ['id', 'original', 'summary']
Records missing summaries: 0
Records missing originals: 0
First 10 example IDs: ['Example 1.1', 'Example 1.2', 'Example 1.3', 'Example 2.1', 'Example 2.2', 'Example 2.3', 'Example 2.4', 'Example 2.5', 'Example 2.6', 'Example 3.1']
Summary length stats - Mean: 615.6, Min: 403, Max: 974
Original length stats - Mean: 1229.0, Min: 415, Max: 2497


In [28]:
def extract_texts_and_ids(data: List[Dict]) -> Tuple[List[str], List[str], List[str]]:
    """Split records into three parallel lists.

    Args:
        data: Records (dicts) that may carry 'original', 'summary' and 'id' keys.

    Returns:
        A tuple ``(originals, summaries, ids)``; each list has one entry per
        record, in input order, with '' standing in for any missing key.
    """
    # Read all three fields of a record in one pass, then fan out by column.
    triples = [
        (record.get('original', ''), record.get('summary', ''), record.get('id', ''))
        for record in data
    ]
    source_texts = [triple[0] for triple in triples]
    condensed_texts = [triple[1] for triple in triples]
    record_ids = [triple[2] for triple in triples]
    return source_texts, condensed_texts, record_ids

In [29]:
# Fan the loaded records out into three parallel lists and report their sizes.
originals, summaries, ids = extract_texts_and_ids(data)
print(
    f"Extracted {len(originals)} originals, {len(summaries)} summaries, and {len(ids)} IDs."
)

Extracted 122 originals, 122 summaries, and 122 IDs.


In [None]:
# Create Embeddings
def create_embeddings(texts: List[str], model_name: str = "jinaai/jina-embeddings-v3") -> np.ndarray:
    """Encode texts with a SentenceTransformer model.

    Best-effort: any failure is printed and signalled by an empty array
    instead of an exception.

    Args:
        texts: Strings to embed.
        model_name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        Whatever ``model.encode`` returns for ``texts`` (requested with
        ``normalize_embeddings=True``); its second axis is reported as the
        embedding dimension. ``np.array([])`` is returned when ``texts`` is
        empty or when loading/encoding fails.
    """
    try:
        # Nothing to encode: return before the model is loaded. The shape[1]
        # report below needs a 2-D result, which an empty batch is not
        # guaranteed to produce, so this case used to surface as an "error".
        if len(texts) == 0:
            print("No texts provided; returning empty embeddings.")
            return np.array([])

        print(f"Loading embedding model: {model_name}")

        # NOTE(security): trust_remote_code=True allows code shipped with the
        # model repository to run locally; only use model names you trust.
        model = SentenceTransformer(model_name, trust_remote_code=True)
        print("Model loaded successfully!")

        print(f"Create embeddings for {len(texts)} texts")
        embeddings = model.encode(
            texts,
            show_progress_bar=True,
            normalize_embeddings=True
        )

        print(f"Embeddings created with shape: {embeddings.shape}")
        print(f"Embeding dimension: {embeddings.shape[1]}")

        return embeddings
    except Exception as e:
        # Deliberate catch-all so a failed download or encode does not abort
        # the notebook; callers see an empty array.
        print(f"Error creating embeddings: {e}")
        return np.array([])