In [1]:
import pandas as pd
import os
import pyarrow.parquet as pq

# Directory holding the processed parquet files (relative to this notebook).
data_dir = '../../data/processed2'

# Collect the parquet files in the directory. os.listdir() returns entries in
# arbitrary, filesystem-dependent order, so sort them to make the listing (and
# every later loop over `files`) reproducible across machines and re-runs.
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))

print(f"Found {len(files)} parquet files in {data_dir}:")
for f in files:
    print(f" - {f}")

print("\n" + "="*50 + "\n")

# Preview each file without loading it fully: print the footer metadata, then
# materialise only the first batch of at most 10 rows.
for f in files:
    print(f'Reading file: {f}')
    file_path = os.path.join(data_dir, f)
    try:
        # The context manager closes the underlying file handle as soon as the
        # preview has been read, instead of leaving it open until the object
        # is garbage-collected.
        with pq.ParquetFile(file_path) as parquet_file:
            print(f"Metadata: {parquet_file.metadata}")
            print(f"Num rows: {parquet_file.metadata.num_rows}")

            # iter_batches yields RecordBatches lazily. A file without rows
            # yields nothing, so give next() a default rather than letting
            # StopIteration surface as an "Error reading" line with an empty
            # message.
            first_batch = next(parquet_file.iter_batches(batch_size=10), None)
            if first_batch is None:
                # Zero-row frame that still shows the file's columns.
                df = parquet_file.schema_arrow.empty_table().to_pandas()
            else:
                df = first_batch.to_pandas()

        print("Head 10:")
        display(df.head(10))
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the
        # preview of the remaining files.
        print(f"Error reading {f}: {e}")
    print("\n" + "="*50 + "\n")

Found 4 parquet files in ../../data/processed2:
 - label.parquet
 - test.parquet
 - train.parquet
 - train_complete.parquet


Reading file: label.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x7ebd748404a0>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 3
  num_rows: 26125
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2562
Num rows: 26125
Head 10:


Unnamed: 0,id,name,embedding
0,0,GO:0000001,"[-1.0028372, -0.045182128, 0.35864478, 0.10161..."
1,1,GO:0000002,"[-0.6721835, 0.031537063, 0.8209418, -0.049415..."
2,2,GO:0000006,"[-0.20918404, -0.38913184, 0.007835354, 0.8332..."
3,3,GO:0000007,"[-0.000720839, -0.4979901, 0.1971867, 0.659227..."
4,4,GO:0000009,"[-0.0877412, -0.5774055, -0.87606597, -0.07907..."
5,5,GO:0000010,"[-0.14391537, 0.0236177, -0.6990757, -0.216485..."
6,6,GO:0000011,"[-0.48112705, 0.4002617, -0.0010845569, -0.492..."
7,7,GO:0000012,"[0.114751704, 0.07488136, 0.41559148, -1.52286..."
8,8,GO:0000014,"[-0.3239982, 0.5678688, -0.078156225, -0.35794..."
9,9,GO:0000015,"[-0.24522287, -0.04206, -0.13269497, 0.5897899..."




Reading file: test.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x7ebd63f298a0>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 7
  num_rows: 224309
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 4570
Num rows: 224309
Head 10:
Head 10:


Unnamed: 0,id,seq,taxonomy,go_terms,go_terms_id,superkingdom,embedding
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.005658077, -0.02524874, 0.04387835, 0.00099..."
1,A0A1B0GTW7,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.0062481337, -0.049506485, 0.021152359, -0.0..."
2,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.037082493, -0.024758408, 0.043183815, -0.0..."
3,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.0077521782, -0.0063698683, 0.035366222, -0...."
4,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.015596853, -0.023404969, 0.0350156, -0.082..."
5,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.05801201, -0.061138455, -0.029753128, -0.0..."
6,A1A519,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.010918891, -0.028198358, 0.01869835, 0.007..."
7,A1L190,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.006491976, -0.03021013, 0.02057783, 0.0399..."
8,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.06460048, 0.068392925, 0.012541338, -0.0225..."
9,A1X283,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.034920264, -0.09441762, -0.008813337, -0.0..."




Reading file: train.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x7ebd1bf61da0>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 7
  num_rows: 82404
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 5214
Num rows: 82404
Head 10:
Head 10:


Unnamed: 0,id,seq,taxonomy,go_terms,go_terms_id,superkingdom,embedding
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,9606,"[GO:0005739, GO:0071902, GO:0072522, GO:000164...","[2832, 18897, 19167, 597, 15065, 2772, 26040, ...","[1.0, 0.0, 0.0, 0.0]","[0.005658077, -0.02524874, 0.04387835, 0.00099..."
1,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,"[GO:0005769, GO:0034498, GO:0062069, GO:000582...","[2850, 10732, 17678, 2892, 2709, 21289, 12453,...","[1.0, 0.0, 0.0, 0.0]","[-0.037082493, -0.024758408, 0.043183815, -0.0..."
2,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,[GO:0005515],[2709],"[1.0, 0.0, 0.0, 0.0]","[0.0077521782, -0.0063698683, 0.035366222, -0...."
3,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,"[GO:0007605, GO:0005515]","[4063, 2709]","[1.0, 0.0, 0.0, 0.0]","[-0.015596853, -0.023404969, 0.0350156, -0.082..."
4,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,9606,"[GO:0005096, GO:0051056, GO:0005829, GO:001000...","[2508, 15707, 2892, 5373, 2709]","[1.0, 0.0, 0.0, 0.0]","[-0.05801201, -0.061138455, -0.029753128, -0.0..."
5,A1A519,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,9606,"[GO:0045893, GO:0006366, GO:0005634]","[13844, 3231, 2772]","[1.0, 0.0, 0.0, 0.0]","[-0.010918891, -0.028198358, 0.01869835, 0.007..."
6,A1L190,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,9606,[GO:0005515],[2709],"[1.0, 0.0, 0.0, 0.0]","[-0.006491976, -0.03021013, 0.02057783, 0.0399..."
7,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,9606,"[GO:0019367, GO:0005783, GO:0009922, GO:000578...","[7649, 2860, 5309, 2866, 11130, 10777, 12425, ...","[1.0, 0.0, 0.0, 0.0]","[0.06460048, 0.068392925, 0.012541338, -0.0225..."
8,A1X283,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,9606,"[GO:0022617, GO:0006801, GO:0007507, GO:000165...","[8229, 3546, 4007, 602, 2709, 16753, 508, 19224]","[1.0, 0.0, 0.0, 0.0]","[-0.034920264, -0.09441762, -0.008813337, -0.0..."
9,A2A2Y4,MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISC...,9606,[GO:0005515],[2709],"[1.0, 0.0, 0.0, 0.0]","[-0.047029294, -0.08146118, -0.011144579, -0.0..."




Reading file: train_complete.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x7ebd74841210>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 7
  num_rows: 82404
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 5661
Num rows: 82404
Head 10:
Head 10:


Unnamed: 0,id,seq,taxonomy,go_terms,go_terms_id,superkingdom,embedding
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,9606,"[GO:0005622, GO:0043549, GO:0006357, GO:004851...","[2832, 18897, 19167, 597, 15065, 2772, 26040, ...","[1.0, 0.0, 0.0, 0.0]","[0.005658077, -0.02524874, 0.04387835, 0.00099..."
1,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,"[GO:0048193, GO:0097708, GO:0016192, GO:000562...","[2850, 10732, 17678, 2892, 2709, 21289, 12453,...","[1.0, 0.0, 0.0, 0.0]","[-0.037082493, -0.024758408, 0.043183815, -0.0..."
2,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,"[GO:0005488, GO:0005515]",[2709],"[1.0, 0.0, 0.0, 0.0]","[0.0077521782, -0.0063698683, 0.035366222, -0...."
3,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,"[GO:0007600, GO:0007605, GO:0005488, GO:005087...","[4063, 2709]","[1.0, 0.0, 0.0, 0.0]","[-0.015596853, -0.023404969, 0.0350156, -0.082..."
4,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,9606,"[GO:0097708, GO:0140677, GO:0005622, GO:003198...","[2508, 15707, 2892, 5373, 2709]","[1.0, 0.0, 0.0, 0.0]","[-0.05801201, -0.061138455, -0.029753128, -0.0..."
5,A1A519,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,9606,"[GO:0005622, GO:0043231, GO:0051252, GO:008009...","[13844, 3231, 2772]","[1.0, 0.0, 0.0, 0.0]","[-0.010918891, -0.028198358, 0.01869835, 0.007..."
6,A1L190,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,9606,"[GO:0005488, GO:0005515]",[2709],"[1.0, 0.0, 0.0, 0.0]","[-0.006491976, -0.03021013, 0.02057783, 0.0399..."
7,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,9606,"[GO:0005622, GO:0000038, GO:0005789, GO:000382...","[7649, 2860, 5309, 2866, 11130, 10777, 12425, ...","[1.0, 0.0, 0.0, 0.0]","[0.06460048, 0.068392925, 0.012541338, -0.0225..."
8,A1X283,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,9606,"[GO:0001501, GO:0016043, GO:0048513, GO:000165...","[8229, 3546, 4007, 602, 2709, 16753, 508, 19224]","[1.0, 0.0, 0.0, 0.0]","[-0.034920264, -0.09441762, -0.008813337, -0.0..."
9,A2A2Y4,MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISC...,9606,"[GO:0005488, GO:0005515]",[2709],"[1.0, 0.0, 0.0, 0.0]","[-0.047029294, -0.08146118, -0.011144579, -0.0..."






In [2]:
# Check whether any file lacks the 'embedding' column, and whether any row has
# a missing (NULL) or zero-length embedding.
for f in files:
    print(f'Checking file: {f}')
    file_path = os.path.join(data_dir, f)
    try:
        # Only the schema is needed from the ParquetFile; the context manager
        # closes the file handle right after the schema has been read.
        with pq.ParquetFile(file_path) as parquet_file:
            schema = parquet_file.schema_arrow
        column_names = list(schema.names)

        print(f"Columns: {column_names}")

        if 'embedding' not in column_names:
            print(f"⚠️ WARNING: File '{f}' does not have 'embedding' column!")
        else:
            print(f"✅ File '{f}' has 'embedding' column")

            # Read the whole file and inspect the embedding column.
            df = pd.read_parquet(file_path)
            null_count = df['embedding'].isna().sum()
            # Count only non-null values of length 0 as "empty". Including
            # None here (as the previous predicate did) reports every NULL row
            # a second time under the "empty" warning.
            empty_count = df['embedding'].apply(
                lambda x: x is not None and hasattr(x, '__len__') and len(x) == 0
            ).sum()

            if null_count > 0:
                print(f"⚠️ WARNING: {null_count} rows have NULL embedding values")
            if empty_count > 0:
                print(f"⚠️ WARNING: {empty_count} rows have empty embedding values")
            if null_count == 0 and empty_count == 0:
                print(f"✅ All {len(df)} rows have valid embedding values")

    except Exception as e:
        # Deliberately best-effort: report the failure and continue with the
        # next file.
        print(f"Error checking {f}: {e}")
    print("\n" + "="*50 + "\n")

Checking file: label.parquet
Columns: ['id', 'name', 'embedding']
✅ File 'label.parquet' has 'embedding' column
✅ All 26125 rows have valid embedding values


Checking file: test.parquet
Columns: ['id', 'seq', 'taxonomy', 'go_terms', 'go_terms_id', 'superkingdom', 'embedding']
✅ File 'test.parquet' has 'embedding' column
✅ All 26125 rows have valid embedding values


Checking file: test.parquet
Columns: ['id', 'seq', 'taxonomy', 'go_terms', 'go_terms_id', 'superkingdom', 'embedding']
✅ File 'test.parquet' has 'embedding' column
✅ All 224309 rows have valid embedding values


Checking file: train.parquet
Columns: ['id', 'seq', 'taxonomy', 'go_terms', 'go_terms_id', 'superkingdom', 'embedding']
✅ File 'train.parquet' has 'embedding' column
✅ All 224309 rows have valid embedding values


Checking file: train.parquet
Columns: ['id', 'seq', 'taxonomy', 'go_terms', 'go_terms_id', 'superkingdom', 'embedding']
✅ File 'train.parquet' has 'embedding' column
✅ All 82404 rows have valid embedding 