In [3]:
import pandas as pd
import re
from Bio import SeqIO
from IPython.display import display
from pathlib import Path
import os

# ================================
# 1. AUTOMATICALLY LOCATE THE PROJECT ROOT
# ================================
def find_project_root(target_folder_name="CAFA-6-Protein-Function-Prediction-Kaggle", start=None):
    """Walk upward through parent directories until one has the given name.

    Args:
        target_folder_name: Exact directory name (not a full path) to look for.
            The default matches the project folder name used by the call
            right below this definition.
        start: Optional directory to start the search from. When omitted, the
            current working directory is used.

    Returns:
        A ``pathlib.Path`` pointing at the first directory, starting with the
        start directory itself and moving upward, whose name equals
        ``target_folder_name``.

    Raises:
        RuntimeError: If the filesystem root is reached without a match.
    """
    path = Path.cwd() if start is None else Path(start).resolve()
    while path.name != target_folder_name:
        # At the filesystem root a path is its own parent: nothing left to try.
        if path.parent == path:
            raise RuntimeError("‚ùå Kh√¥ng t√¨m th·∫•y th∆∞ m·ª•c project g·ªëc.")
        path = path.parent
    return path

# If your project folder has a different name, change it here.
PROJ_DIR = find_project_root("CAFA-6-Protein-Function-Prediction-Kaggle")

# ================================
# 2. PATH CONFIGURATION
# ================================
BASE_DIR = PROJ_DIR.joinpath("data", "Train")
TRAIN_FASTA_PATH = BASE_DIR.joinpath("train_sequences.fasta")
TRAIN_TERMS_PATH = BASE_DIR.joinpath("train_terms.tsv")

# Report the resolved locations and whether each input file is present.
print("üìÅ Project Root:", PROJ_DIR)
print("üìÅ Train Folder:", BASE_DIR)
print("üìÑ FASTA Exists:", TRAIN_FASTA_PATH.exists())
print("üìÑ TERMS Exists:", TRAIN_TERMS_PATH.exists())

# Stop right away when an input file is missing (FASTA is checked first).
if not TRAIN_FASTA_PATH.exists():
    raise FileNotFoundError(f"‚ùå Kh√¥ng t√¨m th·∫•y file FASTA: {TRAIN_FASTA_PATH}")

if not TRAIN_TERMS_PATH.exists():
    raise FileNotFoundError(f"‚ùå Kh√¥ng t√¨m th·∫•y file TERMS: {TRAIN_TERMS_PATH}")


# ================================
# 3. READ FASTA + EXTRACT FEATURES
# ================================
print("\n1. ƒêang ƒë·ªçc FASTA & tr√≠ch xu·∫•t Features...")

# Compile the header patterns once instead of once per record.
PE_PATTERN = re.compile(r"PE=(\d+)")  # protein-existence field
OX_PATTERN = re.compile(r"OX=(\d+)")  # taxonomy field

# Parallel lists, one element per FASTA record.
ids = []
sequences = []
pe_list = []
taxon_list = []
reviewed_list = []

for record in SeqIO.parse(str(TRAIN_FASTA_PATH), "fasta"):

    # Entry ID: for "db|ACCESSION|NAME"-style ids keep the middle field,
    # otherwise fall back to the raw record id.
    parts = record.id.split("|")
    clean_id = parts[1] if len(parts) >= 2 else record.id

    header = record.description

    # PE (protein existence); 0 is the sentinel for "not present in header".
    pe_match = PE_PATTERN.search(header)
    pe_val = int(pe_match.group(1)) if pe_match else 0

    # OX (taxonomy ID); 0 is the sentinel for "not present in header".
    ox_match = OX_PATTERN.search(header)
    ox_val = int(ox_match.group(1)) if ox_match else 0

    # Reviewed flag: 1 when the id carries the "sp|" prefix, else 0.
    is_reviewed = 1 if record.id.startswith("sp|") else 0

    ids.append(clean_id)
    sequences.append(str(record.seq))
    pe_list.append(pe_val)
    taxon_list.append(ox_val)
    reviewed_list.append(is_reviewed)

df_seq = pd.DataFrame({
    "EntryID": ids,
    "sequence": sequences,
    "PE": pe_list,
    "TaxonomyID": taxon_list,
    "Reviewed": reviewed_list
})

print(f"   -> Loaded {len(df_seq)} sequences.")
# Only preview the first record when there is one: indexing [0] on an empty
# FASTA would raise IndexError.
if ids:
    print(f"   -> First PE: {pe_list[0]}, TaxID: {taxon_list[0]}")


# ================================
# 4. READ TERMS + GROUP LABELS
# ================================
print("\n2. ƒêang ƒë·ªçc Terms...")

# Only the entry ID and term columns are read from the tab-separated file.
df_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])

print("3. ƒêang gom nh√≥m labels...")
# Collapse to one row per EntryID, collecting that entry's terms into a list.
df_labels = (
    df_terms
    .groupby("EntryID")["term"]
    .agg(list)
    .reset_index()
)


# ================================
# 5. MERGE DATA
# ================================
print("4. ƒêang merge d·ªØ li·ªáu...")

# Inner join: keep only entries that have both sequence features and labels.
df_final = pd.merge(df_seq, df_labels, on="EntryID", how="inner")

print("\n=== SAMPLE DATA ===")
display(df_final.head())

if len(df_final) > 0:
    # Preview row 100 when it exists; on a smaller frame fall back to the last
    # row, since iloc[100] raises IndexError when there are 100 or fewer rows.
    sample_pos = min(100, len(df_final) - 1)
    sample = df_final.iloc[sample_pos]
    print("\n[SAMPLE DETAILS]")
    print(f"- EntryID:     {sample['EntryID']}")
    print(f"- TaxonomyID:  {sample['TaxonomyID']}")
    print(f"- PE:          {sample['PE']}")
    print(f"- Reviewed:    {sample['Reviewed']}")
    print(f"- Seq length:  {len(sample['sequence'])}")
    print(f"- GO Terms:    {sample['term'][:5]} (Total: {len(sample['term'])})")
else:
    print("‚ùå Kh√¥ng c√≥ d·ªØ li·ªáu sau khi merge ‚Äî ki·ªÉm tra file EntryID.")


üìÅ Project Root: d:\vhproj\CAFA-6-Protein-Function-Prediction-Kaggle
üìÅ Train Folder: d:\vhproj\CAFA-6-Protein-Function-Prediction-Kaggle\data\Train
üìÑ FASTA Exists: True
üìÑ TERMS Exists: True

1. ƒêang ƒë·ªçc FASTA & tr√≠ch xu·∫•t Features...
   -> Loaded 82404 sequences.
   -> First PE: 1, TaxID: 9606

2. ƒêang ƒë·ªçc Terms...
3. ƒêang gom nh√≥m labels...
4. ƒêang merge d·ªØ li·ªáu...

=== SAMPLE DATA ===


Unnamed: 0,EntryID,sequence,PE,TaxonomyID,Reviewed,term
0,A0A0C5B5G6,MRWQEMGYIFYPRKLR,1,9606,1,"[GO:0001649, GO:0033687, GO:0005615, GO:000563..."
1,A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,1,9606,1,"[GO:0120013, GO:0034498, GO:0005769, GO:012000..."
2,A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,1,9606,1,[GO:0005515]
3,A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,1,9606,1,"[GO:0007605, GO:0005515]"
4,A1A4S6,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,1,9606,1,"[GO:0005829, GO:0010008, GO:0005515, GO:000509..."



[SAMPLE DETAILS]
- EntryID:     O14910
- TaxonomyID:  9606
- PE:          1
- Reviewed:    1
- Seq length:  233
- GO Terms:    ['GO:0005515', 'GO:0065003', 'GO:0006887', 'GO:0005886', 'GO:0097016'] (Total: 6)


In [None]:
from pathlib import Path

# ===== 1. Locate the project root, same as in the section above =====
def find_project_root(target_folder_name="CAFA-6-Protein-Function-Prediction-Kaggle"):
    """Return the nearest directory, from the cwd upward, with the given name.

    The current working directory is checked first, then each of its
    ancestors in order up to the filesystem root.

    Args:
        target_folder_name: Exact directory name to search for.

    Returns:
        The matching ``pathlib.Path``.

    Raises:
        RuntimeError: If neither the cwd nor any ancestor has that name.
    """
    here = Path.cwd()
    # Path.parents yields ancestors nearest-first and ends at the root.
    for candidate in (here, *here.parents):
        if candidate.name == target_folder_name:
            return candidate
    raise RuntimeError("‚ùå Kh√¥ng t√¨m th·∫•y project root.")

PROJ_DIR = find_project_root("CAFA-6-Protein-Function-Prediction-Kaggle")

# ===== 2. Build the output file path =====
SAVE_PATH = PROJ_DIR.joinpath("data", "ver1.pkl")

print(f"üìÅ ƒêang l∆∞u d·ªØ li·ªáu v√†o: {SAVE_PATH}")

# ===== 3. Write the merged frame to disk as a pickle =====
df_final.to_pickle(SAVE_PATH)

# Confirm success and show the one-liner for reloading the frame later.
print("‚úÖ ƒê√£ l∆∞u th√†nh c√¥ng!")
print(f"üëâ L·∫ßn sau ch·ªâ c·∫ßn: df = pd.read_pickle(r'{SAVE_PATH}')")


üìÅ ƒêang l∆∞u d·ªØ li·ªáu v√†o: d:\vhproj\CAFA-6-Protein-Function-Prediction-Kaggle\data\processed_data.pkl
‚úÖ ƒê√£ l∆∞u th√†nh c√¥ng!
üëâ L·∫ßn sau ch·ªâ c·∫ßn: df = pd.read_pickle(r'd:\vhproj\CAFA-6-Protein-Function-Prediction-Kaggle\data\processed_data.pkl')
