# Verify Label Embeddings

This notebook verifies the content of `data/processed2/label.parquet`.

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
file_path = "../../data/processed2/label.parquet"

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    df = pd.read_parquet(file_path)
    print(f"Loaded DataFrame with shape: {df.shape}")

Loaded DataFrame with shape: (20000, 3)


In [3]:
if 'df' in locals():
    print("Columns:", df.columns)
    print("\nFirst 5 rows:")
    display(df.tail())

Columns: Index(['id', 'name', 'embedding'], dtype='object')

First 5 rows:


Unnamed: 0,id,name,embedding
19995,19995,GO:2001303,"[-0.00035673208, 0.67592674, 0.039151926, -0.1..."
19996,19996,GO:2001307,"[-0.6064583, 1.211026, 0.10974569, -0.19711222..."
19997,19997,GO:2001310,"[-0.4033451, 0.45717975, 0.45919684, -0.333266..."
19998,19998,GO:2001311,"[0.46174765, -0.14555496, -0.17262812, 0.29604..."
19999,19999,GO:2001317,"[-0.18756926, 0.5853828, -0.095721684, 0.00241..."


In [4]:
if 'df' in locals():
    # Verify embedding shape
    first_embedding = df.iloc[0]['embedding']
    print(f"\nEmbedding type: {type(first_embedding)}")
    if isinstance(first_embedding, (list, np.ndarray)):
        print(f"Embedding length/shape: {len(first_embedding)}")
        
        # Check if it matches expected 768 dimension
        if len(first_embedding) == 768:
            print("SUCCESS: Embedding dimension is 768.")
        else:
            print(f"WARNING: Expected 768 dimensions, got {len(first_embedding)}.")


Embedding type: <class 'numpy.ndarray'>
Embedding length/shape: 768
SUCCESS: Embedding dimension is 768.


In [5]:
if 'df' in locals():
    # Check for nulls
    print("\nNull values:")
    print(df.isnull().sum())


Null values:
id           0
name         0
embedding    0
dtype: int64
