In [9]:
import pandas as pd
import os
from IPython.display import display

data_dir = "../data/Train"

# Load train_terms.tsv
print("Loading train_terms.tsv...")
train_terms_path = os.path.join(data_dir, "train_terms.tsv")
train_terms_df = pd.read_csv(train_terms_path, sep="\t")
print("train_terms.tsv head:")
display(train_terms_df.head())
print("\n")

# Load train_sequences.fasta into a DataFrame
print("Loading train_sequences.fasta...")
train_sequences_path = os.path.join(data_dir, "train_sequences.fasta")
sequences = []
current_sequence_id = None
current_sequence = []

with open(train_sequences_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line.startswith('>'):
            if current_sequence_id is not None:
                sequences.append({'sequence_id': current_sequence_id, 'sequence': ''.join(current_sequence)})
            current_sequence_id = line[1:]
            current_sequence = []
        else:
            current_sequence.append(line)
    if current_sequence_id is not None:
        sequences.append({'sequence_id': current_sequence_id, 'sequence': ''.join(current_sequence)})

train_sequences_df = pd.DataFrame(sequences)
print("train_sequences.fasta head:")
display(train_sequences_df.head())
print("\n")

# Load train_taxonomy.tsv
print("Loading train_taxonomy.tsv...")
train_taxonomy_path = os.path.join(data_dir, "train_taxonomy.tsv")
train_taxonomy_df = pd.read_csv(train_taxonomy_path, sep="\t")
print("train_taxonomy.tsv head:")
display(train_taxonomy_df.head())
print("\n")

# Load go-basic.obo into a DataFrame
print("Loading go-basic.obo...")
go_obo_path = os.path.join(data_dir, "go-basic.obo")
terms = []
current_term = {}

with open(go_obo_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line == '[Term]':
            if current_term:
                terms.append(current_term)
            current_term = {}
        elif line.startswith('id:'):
            current_term['id'] = line.split(':', 1)[1].strip()
        elif line.startswith('name:'):
            current_term['name'] = line.split(':', 1)[1].strip()
        elif line.startswith('namespace:'):
            current_term['namespace'] = line.split(':', 1)[1].strip()
        elif line.startswith('is_a:'):
            # Extract only the GO ID, ignore the name after '!'
            is_a_id = line.split(':', 1)[1].split('!', 1)[0].strip()
            if 'is_a' not in current_term:
                current_term['is_a'] = []
            current_term['is_a'].append(is_a_id)
    if current_term:  # Add the last term
        terms.append(current_term)

go_obo_df = pd.DataFrame(terms)
print("go-basic.obo head:")
display(go_obo_df.head())
print("\n")


Loading train_terms.tsv...
train_terms.tsv head:


Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P




Loading train_sequences.fasta...
train_sequences.fasta head:


Unnamed: 0,sequence_id,sequence
0,sp|A0A0C5B5G6|MOTSC_HUMAN Mitochondrial-derive...,MRWQEMGYIFYPRKLR
1,sp|A0JNW5|BLT3B_HUMAN Bridge-like lipid transf...,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...
2,sp|A0JP26|POTB3_HUMAN POTE ankyrin domain fami...,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...
3,sp|A0PK11|CLRN2_HUMAN Clarin-2 OS=Homo sapiens...,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...
4,sp|A1A4S6|RHG10_HUMAN Rho GTPase-activating pr...,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...




Loading train_taxonomy.tsv...
train_taxonomy.tsv head:


Unnamed: 0,A0A0C5B5G6,9606
0,A0JNW5,9606
1,A0JP26,9606
2,A0PK11,9606
3,A1A4S6,9606
4,A1A519,9606




Loading go-basic.obo...
go-basic.obo head:


Unnamed: 0,id,name,namespace,is_a
0,GO:0000001,mitochondrion inheritance,biological_process,"[GO:0048308, GO:0048311]"
1,GO:0000002,mitochondrial genome maintenance,biological_process,[GO:0007005]
2,GO:0000003,obsolete reproduction,biological_process,
3,GO:0000005,obsolete ribosomal chaperone activity,molecular_function,
4,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,[GO:0005385]




