In [None]:
# Biopython provides Bio.SeqIO, which the loading cell uses to parse the FASTA files.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q biopython

In [None]:
import os, gc, re, csv
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm
import numpy as np

from collections import Counter
import math

from Bio import SeqIO

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Where the competition files live: the Kaggle input mount when running on
# Kaggle, otherwise the current working directory.
is_kaggle = True
ROOT_DIR = '/kaggle/input/cafa-6-protein-function-prediction/' if is_kaggle else './'

In [None]:
# Load the (EntryID, term) annotation pairs; only the two columns used below are read.
# The path components are passed to os.path.join separately so the result is
# correct whether or not ROOT_DIR ends with a path separator.
terms_df = pd.read_csv(
    os.path.join(ROOT_DIR, "Train/train_terms.tsv"),
    sep="\t",
    usecols=["EntryID", "term"],
)

# EntryID -> list of every term annotated on that entry.
train_annotations = terms_df.groupby("EntryID")["term"].apply(list).to_dict()

# One dict per usable training record: id, taxon, sequence and its list of terms.
train_sq = []
# Not used by the cells shown here; kept so any other cell that reads them still runs.
train_answer = []
cnt = 0

# EntryID -> list of terms. This is the same mapping as train_annotations, so it is
# reused instead of running the identical groupby a second time.
terms_to_answer = train_annotations

# Records left out of the training set, counted by reason so nothing disappears silently.
skipped_no_taxon = 0
skipped_no_id = 0
skipped_no_terms = 0

train_fasta_path = os.path.join(ROOT_DIR, "Train/train_sequences.fasta")
for record in SeqIO.parse(train_fasta_path, "fasta"):
    # Ids written as "xx|ACCESSION|..." carry the accession in the second field;
    # any other id is used unchanged.
    id_fields = record.id.split("|")
    clean_id = id_fields[1] if len(id_fields) > 1 else record.id

    # The taxon is the token that follows "OX=" in the header line.
    ox_parts = record.description.split("OX=")
    if len(ox_parts) < 2:
        skipped_no_taxon += 1
        continue
    tax = ox_parts[1].split(" ")[0]

    if not clean_id:
        skipped_no_id += 1
        continue

    # A sequence without any annotation cannot be used as a training example;
    # skip it instead of failing the whole cell with a KeyError.
    answer = terms_to_answer.get(clean_id)
    if answer is None:
        skipped_no_terms += 1
        continue

    train_sq.append({
        "id": clean_id,
        "tax": tax,
        "seq": str(record.seq),
        "answer": answer
    })

print(
    f"Loaded {len(train_sq)} training sequences "
    f"(skipped: {skipped_no_taxon} without OX=, "
    f"{skipped_no_id} without id, {skipped_no_terms} without terms)"
)

# One dict per test record: id, taxon and sequence.
test_sq = []

test_fasta_path = os.path.join(ROOT_DIR, "Test/testsuperset.fasta")
for record in SeqIO.parse(test_fasta_path, "fasta"):
    # The second space-separated field of the header line is stored as the taxon.
    # A header without that field keeps the record (it still needs predictions)
    # and stores None as its taxon.
    header_fields = record.description.split(" ")
    tax = header_fields[1] if len(header_fields) > 1 else None
    test_sq.append({
        "id": record.id,
        "tax": tax,
        "seq": str(record.seq)
    })

# Explicit column lists keep the expected columns present even if a FASTA file
# produced no records, so later cells fail on the data rather than on a missing column.
test_df = pd.DataFrame(test_sq, columns=["id", "tax", "seq"])
train_df = pd.DataFrame(train_sq, columns=["id", "tax", "seq", "answer"])

In [None]:
# Sanity check: row counts first, then the first ten rows of each frame.
print(len(train_df), len(test_df))
for preview_frame in (terms_df, train_df, test_df):
    print(preview_frame.head(10))

In [None]:
def get_bigrams(seq):
    """Return every overlapping window of two consecutive items of ``seq``.

    Windows are produced left to right with ``seq[start:start + 2]``, so a
    string of length ``n`` yields ``n - 1`` two-character strings. Inputs
    shorter than two items yield an empty list.
    """
    windows = []
    for start in range(len(seq) - 1):
        windows.append(seq[start:start + 2])
    return windows

def build_idf(train_seqs, vocab):
    """Compute a smoothed inverse document frequency for every bigram in ``vocab``.

    Each sequence is one document. A bigram's document frequency is the number
    of sequences that contain it at least once.

    Args:
        train_seqs: iterable of sequences (a list, a pandas Series, a generator, ...).
        vocab: iterable of bigrams to compute weights for.

    Returns:
        dict mapping each bigram of ``vocab`` (in ``vocab`` order) to
        ``log((1 + n_docs) / (1 + doc_freq)) + 1``.
    """
    # Set membership is O(1); testing against the vocab list was a linear scan
    # for every distinct bigram of every sequence.
    vocab_set = set(vocab)
    doc_freq = Counter()
    # Counted while iterating so inputs without len() are accepted as well.
    total_docs = 0

    for seq in train_seqs:
        total_docs += 1
        # The intersection is a set, so each bigram counts once per sequence.
        doc_freq.update(vocab_set.intersection(get_bigrams(seq)))

    return {
        b: math.log((1 + total_docs) / (1 + doc_freq[b])) + 1
        for b in vocab
    }

def seq_to_tfidf_vector(seq, vocab, idf, norm=True):
    """Encode one sequence as a TF-IDF vector over ``vocab``.

    Args:
        seq: the sequence to encode.
        vocab: ordered bigrams; position ``k`` of the result belongs to ``vocab[k]``.
        idf: mapping bigram -> IDF weight; must contain every entry of ``vocab``.
        norm: when true, scale the vector to unit Euclidean length
            (an all-zero vector is returned unchanged).

    Returns:
        1-D float array of length ``len(vocab)``.
    """
    bigram_counts = Counter(get_bigrams(seq))

    # Raw count of each vocabulary bigram in the sequence times its IDF weight.
    weighted = [bigram_counts[token] * idf[token] for token in vocab]
    vec = np.array(weighted, dtype=float)

    if not norm:
        return vec

    length = np.linalg.norm(vec)
    return vec / length if length > 0 else vec

def build_feature_matrix(seqs, vocab, idf):
    """Stack the TF-IDF vectors of ``seqs`` into a 2-D feature matrix.

    Args:
        seqs: iterable of sequences.
        vocab: ordered bigrams defining the columns.
        idf: mapping bigram -> IDF weight.

    Returns:
        Float array of shape ``(number of sequences, len(vocab))``. Empty input
        gives shape ``(0, len(vocab))`` so the result is always 2-D.
    """
    rows = [seq_to_tfidf_vector(seq, vocab, idf) for seq in seqs]
    if not rows:
        # np.array([]) would be 1-D with shape (0,), losing the column count.
        return np.zeros((0, len(vocab)), dtype=float)
    return np.array(rows)



In [None]:
# Fixed vocabulary of two-letter strings (bigrams). Each entry becomes one column of
# the TF-IDF feature matrices, in exactly this order.
# NOTE(review): how this subset was selected is not shown in the notebook -
# presumably by frequency in the training sequences; confirm before changing it.
features = ['LL', 'SS', 'SL', 'LS', 'AA', 'LA', 'AL', 'EE', 'LE', 'VL', 'EL', 'LV', 'LG', 'GL', 'LK', 'AS', 'GS', 'LR', 'SG', 'KL', 'GG', 'LP', 'TL', 'SA', 'IL', 'RL', 'LT', 'DL', 'LD', 'EK', 'EA', 'AV', 'KK', 'VS', 'AG', 'SV', 'PS', 'SP', 'VA', 'LQ', 'LI', 'KE', 'ST', 'TS', 'PL', 'SE', 'AE', 'GA', 'VV', 'PP', 'SK', 'IS', 'QL', 'FL', 'ES', 'SR', 'ED', 'KS', 'NL', 'KA', 'SI', 'GV', 'DS', 'PA', 'VE', 'TA', 'AT', 'EV', 'SD', 'PG', 'RS', 'RR', 'GK',
            'DE', 'GE', 'TV', 'AK', 'VG', 'PE', 'TG', 'EG', 'AR', 'ER', 'AP', 'GT', 'TT', 'RE', 'EI', 'IA', 'NS', 'DG', 'GR', 'RK', 'FS', 'PV', 'DV', 'VD', 'SN', 'VI', 'ET', 'VP', 'TP', 'RG', 'GD', 'SQ', 'AD', 'DD', 'SF', 'KI', 'KT', 'GP', 'RV', 'IE', 'QQ', 'IG', 'VR', 'QA', 'TI', 'IP', 'KP', 'DK', 'KN', 'RI', 'DP', 'GF', 'GN', 'IR', 'EP', 'PD', 'NV', 'RP', 'QK', 'NE', 'HL', 'FV', 'GQ', 'DR', 'DF', 'IF', 'SY', 'RF', 'GY', 'TQ', 'FP', 'HP', 'VC', 'GC', 'GW']

# IDF weights are estimated from the training sequences, which are then encoded
# with those weights.
idf = build_idf(train_seqs=train_df["seq"], vocab=features)
X_train = build_feature_matrix(seqs=train_df["seq"], vocab=features, idf=idf)


In [None]:
# Turn each protein's list of terms into one row of a sparse 0/1 indicator matrix,
# with one column per distinct term (column order is stored in mlb.classes_).
mlb = MultiLabelBinarizer(sparse_output=True)
y_train = mlb.fit_transform(y=train_df["answer"])
print(y_train.shape)

In [None]:
# One independent binary logistic regression per term column of y_train.
# Parallelism is requested only on the one-vs-rest wrapper, which fits the per-term
# models concurrently. Each inner problem is binary, so also passing n_jobs to
# LogisticRegression would only nest a second level of workers inside each job.
clf = OneVsRestClassifier(
    LogisticRegression(max_iter=500, random_state=42),
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

In [None]:
# Encode the test sequences with the vocabulary and IDF weights fitted on the training set.
X_test = build_feature_matrix(seqs=test_df["seq"], vocab=features, idf=idf)


In [None]:

# Test protein ids, in the same row order as X_test.
arr = test_df["id"]

In [None]:
batch_size = 32768  # rows of X_test scored per predict_proba call
threshold = 0.02    # minimum probability for a term to be kept

# One {"id": ..., "preds": [(GO_label, prob), ...]} record per test protein.
all_probs = []

# Plain arrays give positional lookups, so the id of row i + j never depends on
# the index labels of the frame the ids came from.
test_ids = np.asarray(arr)
go_labels = mlb.classes_
n_test = X_test.shape[0]

for i in range(0, n_test, batch_size):
    # Clip the upper bound so the message for the final batch reports real rows.
    batch_end = min(i + batch_size, n_test)
    print(f"Predict for [{i}, {batch_end}]")
    X_batch = X_test[i:batch_end]

    prob_batch = clf.predict_proba(X_batch)  # shape (batch, n_classes)

    for j, prob_vec in enumerate(prob_batch):
        # Select the qualifying class indices in one vectorised step instead of
        # testing every class probability in a Python loop. Indices come back in
        # ascending order, so the predictions keep the class order.
        kept_idx = np.flatnonzero(prob_vec >= threshold)
        filtered = [
            (go_labels[class_idx], float(prob_vec[class_idx]))   # (GO_label, prob)
            for class_idx in kept_idx
        ]

        all_probs.append({"id": test_ids[i + j], "preds": filtered})

In [None]:
# Preview the first 20 prediction records; each is {"id": ..., "preds": [(label, prob), ...]}.
all_probs[:20]

In [None]:

# Write one tab-separated line per kept prediction: protein id, term, probability
# rounded to three decimals.
output_file = "submission.tsv"

with open(output_file, "w") as f:
    f.writelines(
        f"{entry['id']}\t{term}\t{round(score, 3)}\n"
        for entry in all_probs
        for term, score in entry['preds']
    )
