# Import

In [10]:
import os, gc, re, csv
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm
import numpy as np

from collections import Counter
import math

from Bio import SeqIO

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

# Parsing data

## Parse GO graph

In [11]:
# Copied from https://www.kaggle.com/code/seddiktrk/cafa-6-blend-goa-negative-propagation/notebook

def parse_obo(go_obo_path):
    parents = defaultdict(set)
    children = defaultdict(set)

    if not os.path.exists(go_obo_path):
        return parents, children

    with open(go_obo_path, "r") as f:
        cur_id = None
        for line in f:
            line = line.strip()
            if line == "[Term]":
                cur_id = None
            elif line.startswith("id: "):
                cur_id = line.split("id: ")[1].strip()
            elif line.startswith("is_a: "):
                pid = line.split()[1].strip()
                if cur_id:
                    parents[cur_id].add(pid)
                    children[pid].add(cur_id)
            elif line.startswith("relationship: part_of "):
                parts = line.split()
                if len(parts) >= 3:
                    pid = parts[2].strip()
                    if cur_id:
                        parents[cur_id].add(pid)
                        children[pid].add(cur_id)
    print(f"[io] Parsed OBO: {len(parents)} nodes with parents")
    return parents, children


def get_ancestors(go_id, parents):
    ans = set()
    stack = [go_id]
    while stack:
        cur = stack.pop()
        for p in parents.get(cur, []):
            if p not in ans:
                ans.add(p)
                stack.append(p)
    return ans


def get_descendants(go_id, children):
    desc = set()
    stack = [go_id]
    while stack:
        cur = stack.pop()
        for child in children.get(cur, []):
            if child not in desc:
                desc.add(child)
                stack.append(child)
    return desc

## Parse train data

In [12]:
terms_df = pd.read_csv(os.path.join("./Train/train_terms.tsv"), sep="\t", usecols=["EntryID", "term"])
train_annotations = terms_df.groupby("EntryID")["term"].apply(list).to_dict()

train_sq = []
train_answer = []

terms_to_answer = terms_df.groupby('EntryID')['term'].apply(list).to_dict()

cnt = 0


for record in SeqIO.parse(os.path.join("./Train/train_sequences.fasta"), "fasta"):
    try:
        if "|" in record.id:
            clean_id = record.id.split("|")[1]
        else:
            clean_id = record.id
        a = record.description.split("OX=")
        b = a[1].split(" ")[0]
        if clean_id:
            train_sq.append({
                "id": clean_id,
                "tax": b,
                "seq": str(record.seq),
                "answer": terms_to_answer[clean_id]
            })
        else:
            print("123")
    except IndexError:
        continue

test_sq = []

for record in SeqIO.parse(os.path.join("./Test/testsuperset.fasta"), "fasta"):
    tax = record.description.split(" ")[1]
    # if (tax not in test_gr):
    #     test_gr[tax] = []
    test_sq.append({
        "id": record.id,
        "tax": tax,
        "seq": str(record.seq)
    })

test_df = pd.DataFrame(test_sq)
train_df = pd.DataFrame(train_sq)

In [13]:
print(len(train_df), len(test_df))
print(terms_df[:10])
print(train_df[:10])
print(test_df[:10])

82404 224309
  EntryID        term
0  Q5W0B1  GO:0000785
1  Q5W0B1  GO:0004842
2  Q5W0B1  GO:0051865
3  Q5W0B1  GO:0006275
4  Q5W0B1  GO:0006513
5  Q5W0B1  GO:0003682
6  Q5W0B1  GO:0005515
7  Q3EC77  GO:0000138
8  Q3EC77  GO:0005794
9  Q8IZR5  GO:0005515
           id   tax                                                seq  \
0  A0A0C5B5G6  9606                                   MRWQEMGYIFYPRKLR   
1      A0JNW5  9606  MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...   
2      A0JP26  9606  MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...   
3      A0PK11  9606  MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...   
4      A1A4S6  9606  MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...   
5      A1A519  9606  MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...   
6      A1L190  9606  MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...   
7      A1L3X0  9606  MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...   
8      A1X283  9606  MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...   
9    

# Getting features

In [14]:
def get_bigrams(seq):
    return [seq[i:i+2] for i in range(len(seq)-1)]

def build_idf(train_seqs, vocab):
    df = Counter()
    total_docs = len(train_seqs)

    for seq in train_seqs:
        bigs = set(get_bigrams(seq))
        for b in bigs:
            if b in vocab:
                df[b] += 1

    idf = {}
    for b in vocab:
        idf[b] = math.log((1 + total_docs) / (1 + df[b])) + 1

    return idf

def seq_to_tfidf_vector(seq, vocab, idf, norm=True):
    bigs = get_bigrams(seq)
    count = Counter(bigs)

    vec = np.zeros(len(vocab), dtype=float)

    for i, b in enumerate(vocab):
        tf = count[b]
        vec[i] = tf * idf[b]

    if norm:
        s = np.linalg.norm(vec)
        if s > 0:
            vec = vec / s

    return vec

def build_feature_matrix(seqs, vocab, idf):
    return np.array([seq_to_tfidf_vector(seq, vocab, idf) for seq in seqs])



In [15]:
features = ['LL', 'SS', 'SL', 'LS', 'AA', 'LA', 'AL', 'EE', 'LE', 'VL', 'EL', 'LV', 'LG', 'GL', 'LK', 'AS', 'GS', 'LR', 'SG', 'KL', 'GG', 'LP', 'TL', 'SA', 'IL', 'RL', 'LT', 'DL', 'LD', 'EK', 'EA', 'AV', 'KK', 'VS', 'AG', 'SV', 'PS', 'SP', 'VA', 'LQ', 'LI', 'KE', 'ST', 'TS', 'PL', 'SE', 'AE', 'GA', 'VV', 'PP', 'SK', 'IS', 'QL', 'FL', 'ES', 'SR', 'ED', 'KS', 'NL', 'KA', 'SI', 'GV', 'DS', 'PA', 'VE', 'TA', 'AT', 'EV', 'SD', 'PG', 'RS', 'RR', 'GK',
            'DE', 'GE', 'TV', 'AK', 'VG', 'PE', 'TG', 'EG', 'AR', 'ER', 'AP', 'GT', 'TT', 'RE', 'EI', 'IA', 'NS', 'DG', 'GR', 'RK', 'FS', 'PV', 'DV', 'VD', 'SN', 'VI', 'ET', 'VP', 'TP', 'RG', 'GD', 'SQ', 'AD', 'DD', 'SF', 'KI', 'KT', 'GP', 'RV', 'IE', 'QQ', 'IG', 'VR', 'QA', 'TI', 'IP', 'KP', 'DK', 'KN', 'RI', 'DP', 'GF', 'GN', 'IR', 'EP', 'PD', 'NV', 'RP', 'QK', 'NE', 'HL', 'FV', 'GQ', 'DR', 'DF', 'IF', 'SY', 'RF', 'GY', 'TQ', 'FP', 'HP', 'VC', 'GC', 'GW']

idf = build_idf(train_df["seq"], features)
X_train = build_feature_matrix(train_df["seq"], features, idf)


# Get label

In [16]:
mlb = MultiLabelBinarizer(sparse_output=True)
y_train = mlb.fit_transform(train_df["answer"])
print(y_train.shape)

(82404, 26125)


# Random Forest

In [39]:
clf = OneVsRestClassifier(LogisticRegression(max_iter=500, random_state=42, n_jobs=-1), n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.2min


KeyboardInterrupt: 

# Get output from model

In [None]:
X_test = build_feature_matrix(test_df["seq"], features, idf)


In [None]:

arr = test_df['id']

'A0A0C5B5G6'

In [None]:
batch_size = 32768   # hoặc 10, 32 tùy RAM
threshold = 0.02  # THRESHOLD BẠN MUỐN

all_probs = []

for i in range(0, X_test.shape[0], batch_size):
    print(f"Predict for [{i}, {i+batch_size}]")
    X_batch = X_test[i:i+batch_size]

    # predict proba
    prob_batch = clf.predict_proba(X_batch)  # shape (batch, n_classes)

    # lọc theo threshold
    for j, prob_vec in enumerate(prob_batch):
        cur_id = arr[i+j]
        filtered = []
        for class_idx, p in enumerate(prob_vec):
            if p >= threshold:
                label = mlb.classes_[class_idx]
                filtered.append((label, float(p)))   # (GO_label, prob)


        all_probs.append({"id": cur_id, "preds": filtered})

Predict for [0, 32768]


KeyboardInterrupt: 

In [None]:
all_probs[:20]

[{'id': 'A0A0C5B5G6', 'preds': []},
 {'id': 'A0A1B0GTW7', 'preds': [('GO:0005515', 0.5318793590021064)]},
 {'id': 'A0JNW5',
  'preds': [('GO:0005515', 0.5373972077343487),
   ('GO:0005634', 0.3366663596828987),
   ('GO:0005737', 0.20013504702171248),
   ('GO:0005829', 0.20207630022082895)]},
 {'id': 'A0JP26',
  'preds': [('GO:0005515', 0.5490279628115253),
   ('GO:0005634', 0.26516550266480643)]},
 {'id': 'A0PK11',
  'preds': [('GO:0005515', 0.2009410212778832),
   ('GO:0005886', 0.3099333448417127)]},
 {'id': 'A1A4S6',
  'preds': [('GO:0005515', 0.6489025341207314),
   ('GO:0005829', 0.22149986596519217)]},
 {'id': 'A1A519',
  'preds': [('GO:0005515', 0.7235551952847804),
   ('GO:0005634', 0.26682648261967745)]},
 {'id': 'A1L190',
  'preds': [('GO:0005515', 0.5523456479797285),
   ('GO:0005829', 0.20431734189759046)]},
 {'id': 'A1L3X0', 'preds': [('GO:0005515', 0.2324389204897821)]},
 {'id': 'A1X283', 'preds': [('GO:0005515', 0.5949344346076535)]},
 {'id': 'A2A2Y4', 'preds': [('GO:000

In [None]:
# all_probs[:20]

output_file = "submission.tsv"

with open(output_file, "w") as f:
    for mp in all_probs:
        # mp['id'], mp['pred']
        for pred, prob in mp['preds']:
            f.write(mp['id']+"\t"+pred+"\t"+str(round(prob, 3))+"\n")

# with open(output_file, "w", newline='', encoding="utf-8") as f:
#     writer = csv.writer(f)
#     writer.writerow(["id", "label", "probability"])  # header

#     for sample_idx, sample_id in enumerate(ids):
#         for label_idx, label in enumerate(labels):
#             prob = probs[sample_idx][label_idx]
#             if prob >= threshold:
#                 writer.writerow([sample_id, label, float(prob)])