# Verify Label Embeddings

This notebook verifies the content of `data/processed2/label.parquet`.

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
file_path = "../../data/processed2/label.parquet"

# Read the label table only when the parquet file is present; otherwise just
# report the missing path so that `df` stays undefined for the later cells.
if os.path.exists(file_path):
    df = pd.read_parquet(file_path)
    print(f"Loaded DataFrame with shape: {df.shape}")
else:
    print(f"File not found: {file_path}")

Loaded DataFrame with shape: (26125, 3)


In [3]:
if 'df' in locals():
    # Show the column index, then a preview of the end of the frame.
    print("Columns:", df.columns)
    # The label must agree with the call below: tail() returns the last rows,
    # not the first ones.
    print("\nLast 5 rows:")
    display(df.tail())

Columns: Index(['id', 'name', 'embedding'], dtype='object')

First 5 rows:


Unnamed: 0,id,name,embedding
26120,26120,GO:2001307,"[-0.3650091, 1.0338237, 0.30712467, -0.0832880..."
26121,26121,GO:2001310,"[-0.41389447, 0.33824375, 0.36773187, -0.33797..."
26122,26122,GO:2001311,"[0.51156086, -0.26030645, 0.028753588, 0.49726..."
26123,26123,GO:2001315,"[0.00093602645, 0.44081417, -0.9185203, 0.4230..."
26124,26124,GO:2001317,"[0.09472421, 0.23762597, 0.052462384, 0.137327..."


In [4]:
if 'df' in locals():
    # Verify embedding shape.
    # Dimension every embedding is expected to have.
    expected_dim = 768

    if df.empty:
        # df.iloc[0] would raise IndexError on an empty frame.
        print("WARNING: DataFrame is empty; no embeddings to verify.")
    else:
        first_embedding = df.iloc[0]['embedding']
        print(f"\nEmbedding type: {type(first_embedding)}")
        if isinstance(first_embedding, (list, np.ndarray)):
            print(f"Embedding length/shape: {len(first_embedding)}")

            # Measure every row, not only the first one. Entries that are not
            # a list/ndarray get length -1 so they are counted as mismatches.
            lengths = df['embedding'].map(
                lambda emb: len(emb) if isinstance(emb, (list, np.ndarray)) else -1
            )
            n_bad = int((lengths != expected_dim).sum())

            if len(first_embedding) != expected_dim:
                print(f"WARNING: Expected {expected_dim} dimensions, got {len(first_embedding)}.")
            elif n_bad:
                print(f"WARNING: {n_bad} of {len(df)} embeddings are not {expected_dim}-dimensional.")
            else:
                print(f"SUCCESS: Embedding dimension is {expected_dim}.")
        else:
            # Previously this case produced no diagnostic at all.
            print("WARNING: Embedding is not a list or numpy array; cannot check its dimension.")


Embedding type: <class 'numpy.ndarray'>
Embedding length/shape: 768
SUCCESS: Embedding dimension is 768.


In [5]:
if 'df' in locals():
    # Report how many missing entries each column contains.
    print("\nNull values:")
    null_counts = df.isna().sum()
    print(null_counts)


Null values:
id           0
name         0
embedding    0
dtype: int64
