In [1]:
import pandas as pd
import os
import pyarrow.parquet as pq

# Relative path to the directory whose parquet files are inspected below.
data_dir = '../../data/processed2'

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# sort the names to keep the listing (and the preview order further down)
# identical from one run or machine to the next.
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))

# Summarise what was found before reading anything.
print(f"Found {len(files)} parquet files in {data_dir}:")
for f in files:
    print(f" - {f}")

print("\n" + "="*50 + "\n")

# Preview every discovered parquet file: print its footer metadata and row
# count, then display the first rows without reading the whole file.
PREVIEW_ROWS = 10  # maximum number of rows requested for each preview

for f in files:
    print(f'Reading file: {f}')
    file_path = os.path.join(data_dir, f)
    try:
        # The context manager closes the underlying file handle even if
        # reading fails part-way through.
        with pq.ParquetFile(file_path) as parquet_file:
            print(f"Metadata: {parquet_file.metadata}")
            print(f"Num rows: {parquet_file.metadata.num_rows}")

            # iter_batches() yields RecordBatches lazily, so only the first
            # batch is decoded. The None default stops a file with no rows
            # from raising StopIteration, which the handler below would
            # otherwise report as an error with an empty message.
            first_batch = next(
                parquet_file.iter_batches(batch_size=PREVIEW_ROWS), None
            )

            if first_batch is None:
                print("File contains no rows.")
            else:
                df = first_batch.to_pandas()

                print(f"Head {PREVIEW_ROWS}:")
                display(df.head(PREVIEW_ROWS))
    except Exception as e:
        # Best-effort: report the failure and carry on with the next file.
        print(f"Error reading {f}: {e}")
    print("\n" + "="*50 + "\n")

Found 2 parquet files in ../../data/processed2:
 - test.parquet
 - train.parquet


Reading file: test.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x72ded43bccc0>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 7
  num_rows: 224309
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 4570
Num rows: 224309
Head 10:


Unnamed: 0,id,seq,taxonomy,go_terms,go_terms_id,superkingdom,embedding
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.005658077, -0.02524874, 0.04387835, 0.00099..."
1,A0A1B0GTW7,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.0062481337, -0.049506485, 0.021152359, -0.0..."
2,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.037082493, -0.024758408, 0.043183815, -0.0..."
3,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.0077521782, -0.0063698683, 0.035366222, -0...."
4,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.015596853, -0.023404969, 0.0350156, -0.082..."
5,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.05801201, -0.061138455, -0.029753128, -0.0..."
6,A1A519,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.010918891, -0.028198358, 0.01869835, 0.007..."
7,A1L190,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.006491976, -0.03021013, 0.02057783, 0.0399..."
8,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[0.06460048, 0.068392925, 0.012541338, -0.0225..."
9,A1X283,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,9606,,,"[1.0, 0.0, 0.0, 0.0]","[-0.034920264, -0.09441762, -0.008813337, -0.0..."




Reading file: train.parquet
Metadata: <pyarrow._parquet.FileMetaData object at 0x72ded43bdcb0>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 7
  num_rows: 82286
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 5198
Num rows: 82286
Head 10:


Unnamed: 0,id,seq,taxonomy,go_terms,go_terms_id,superkingdom,embedding
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,9606,"[GO:0005615, GO:0033687, GO:0043610, GO:003214...","[2383, 8644, 10661, 7978, 2801, 545, 12248, 23...","[1.0, 0.0, 0.0, 0.0]","[0.005658077, -0.02524874, 0.04387835, 0.00099..."
1,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,"[GO:0005769, GO:0005829, GO:0120013, GO:000551...","[2460, 2501, 16796, 2336, 8916, 16794, 10263]","[1.0, 0.0, 0.0, 0.0]","[-0.037082493, -0.024758408, 0.043183815, -0.0..."
2,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,[GO:0005515],[2336],"[1.0, 0.0, 0.0, 0.0]","[0.0077521782, -0.0063698683, 0.035366222, -0...."
3,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,"[GO:0007605, GO:0005515]","[3576, 2336]","[1.0, 0.0, 0.0, 0.0]","[-0.015596853, -0.023404969, 0.0350156, -0.082..."
4,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,9606,"[GO:0005829, GO:0051056, GO:0005515, GO:000509...","[2501, 12744, 2336, 2154, 4625]","[1.0, 0.0, 0.0, 0.0]","[-0.05801201, -0.061138455, -0.029753128, -0.0..."
5,A1A519,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,9606,"[GO:0005634, GO:0045893, GO:0006366]","[2389, 11366, 2808]","[1.0, 0.0, 0.0, 0.0]","[-0.010918891, -0.028198358, 0.01869835, 0.007..."
6,A1L190,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,9606,[GO:0005515],[2336],"[1.0, 0.0, 0.0, 0.0]","[-0.006491976, -0.03021013, 0.02057783, 0.0399..."
7,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,9606,"[GO:0019367, GO:0042761, GO:0035338, GO:000992...","[6452, 10238, 9240, 4570, 2476, 8954, 2336, 2470]","[1.0, 0.0, 0.0, 0.0]","[0.06460048, 0.068392925, 0.012541338, -0.0225..."
8,A1X283,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,9606,"[GO:0001654, GO:0060348, GO:0006801, GO:000750...","[550, 13543, 3097, 3525, 2336, 15367, 464, 6899]","[1.0, 0.0, 0.0, 0.0]","[-0.034920264, -0.09441762, -0.008813337, -0.0..."
9,A2A2Y4,MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISC...,9606,[GO:0005515],[2336],"[1.0, 0.0, 0.0, 0.0]","[-0.047029294, -0.08146118, -0.011144579, -0.0..."




