# Verify Label Embeddings

This notebook verifies the content of `data/processed2/label.parquet`.

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
file_path = "../../data/processed2/label.parquet"

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    df = pd.read_parquet(file_path)
    print(f"Loaded DataFrame with shape: {df.shape}")

Loaded DataFrame with shape: (20000, 3)


In [3]:
if 'df' in locals():
    print("Columns:", df.columns)
    print("\nFirst 5 rows:")
    display(df.head())

Columns: Index(['id', 'name', 'embedding'], dtype='object')

First 5 rows:


Unnamed: 0,id,name,embedding
0,0,GO:0000001,"[-0.9518631, 0.0803124, 0.4292605, 0.13440354,..."
1,1,GO:0000002,"[-0.70065135, -0.047753025, 0.8565155, -0.1312..."
2,2,GO:0000006,"[-0.25982004, -0.37692344, 0.076045305, 0.8337..."
3,3,GO:0000009,"[-0.19106326, -0.4243139, -0.8747841, -0.19341..."
4,4,GO:0000011,"[-0.53705364, 0.39702752, -0.025368618, -0.422..."


In [4]:
if 'df' in locals():
    # Verify embedding shape
    first_embedding = df.iloc[0]['embedding']
    print(f"\nEmbedding type: {type(first_embedding)}")
    if isinstance(first_embedding, (list, np.ndarray)):
        print(f"Embedding length/shape: {len(first_embedding)}")
        
        # Check if it matches expected 768 dimension
        if len(first_embedding) == 768:
            print("SUCCESS: Embedding dimension is 768.")
        else:
            print(f"WARNING: Expected 768 dimensions, got {len(first_embedding)}.")


Embedding type: <class 'numpy.ndarray'>
Embedding length/shape: 768
SUCCESS: Embedding dimension is 768.


In [5]:
if 'df' in locals():
    # Check for nulls
    print("\nNull values:")
    print(df.isnull().sum())


Null values:
id           0
name         0
embedding    0
dtype: int64
