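This notebook ranks the features (embedding dimensions) of each embedding type (T5, ESM2, ProtBERT) by RandomForest feature importance, averaged over the per-label one-vs-rest classifiers, and saves the train and test embeddings with their columns reordered from most to least important.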

In [1]:
import os
import sys
import gc
import numpy as np
import pandas as pd

In [2]:
# Move to the project root so that the relative paths used below ('src',
# './input/...') resolve. The guard makes the cell idempotent: re-running it,
# or starting the kernel from the project root, no longer climbs one
# directory too far.
if not os.path.isdir('src'):
    os.chdir('..')

In [3]:
# Make modules in the current directory and in 'src' importable.
sys.path.extend(['.', 'src'])

In [4]:
from cafa_utils import *
from data_utils import *

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [6]:
# Number of labels requested from prepare_dataframes (passed as n_labels).
# One binary classifier is fitted per label in the cells below, so this is
# presumably capped to keep the fitting time manageable — TODO confirm.
num_of_labels = 750 # don't use all labels

## Rank T5 features

In [7]:
# Prepare the data frames for the T5 embeddings (emb_type='t5').
# The result is unpacked into seven names; only train_df, test_df and
# labels_df are used by the cells of this section.
(
    train_terms_updated,
    train_protein_ids,
    test_protein_ids,
    train_df,
    test_df,
    labels_to_consider,
    labels_df,
) = prepare_dataframes(emb_type='t5', n_labels=num_of_labels)

Reading data and preparing stuff...
Preparations done


In [8]:
# Hold out 80% of the rows so the forests are fitted on only 20% of the data,
# which saves time. random_state pins the split.
# X_test / y_test are not used by the cells of this section.
split_options = {'test_size': 0.8, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(
    train_df.values, labels_df.values, **split_options)

In [9]:
# Small, shallow forest: it is only used to score features, not to predict.
forest = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=123)

# One binary forest per label, each wrapped in a single-step pipeline
# (the importance cell reads it back through named_steps).
classifier = OneVsRestClassifier(
    make_pipeline(forest),
    n_jobs=-1,  # use all processors
    verbose=1,
)
classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  9.8min finished


In [10]:
# Calculate the importance-based ordering of the feature indices.
# Each per-label estimator is a single-step pipeline wrapping a random forest;
# collect every forest's feature_importances_ vector.
# OneVsRestClassifier stores a constant predictor (which has no pipeline
# steps) for any label with a single class in the training subset, so skip
# those instead of failing on the missing `named_steps` attribute.
importances = [
    estimator.named_steps['randomforestclassifier'].feature_importances_
    for estimator in classifier.estimators_
    if hasattr(estimator, 'named_steps')
]
if not importances:
    raise ValueError('no fitted random forest found in classifier.estimators_')

# Average over labels: one score per feature.
importances = np.mean(importances, axis=0)
# Feature indices ordered from most to least important.
importance_indices = np.argsort(importances)[::-1]

In [11]:
# Reorder the embedding columns from most to least important and write the
# resulting arrays to disk (train first, then test).
ranked_train_columns = train_df.columns[importance_indices]
np.save('./input/t5_train_embeds_ranked.npy',
        train_df.loc[:, ranked_train_columns].values)

ranked_test_columns = test_df.columns[importance_indices]
np.save('./input/t5_test_embeds_ranked.npy',
        test_df.loc[:, ranked_test_columns].values)

## Rank ESM2 features

In [12]:
# Prepare the data frames for the ESM2 embeddings (emb_type='esm2_3b').
# The result is unpacked into seven names; only train_df, test_df and
# labels_df are used by the cells of this section.
(
    train_terms_updated,
    train_protein_ids,
    test_protein_ids,
    train_df,
    test_df,
    labels_to_consider,
    labels_df,
) = prepare_dataframes(emb_type='esm2_3b', n_labels=num_of_labels)

Reading data and preparing stuff...
Preparations done


In [13]:
# Hold out 80% of the rows so the forests are fitted on only 20% of the data,
# which saves time. random_state pins the split.
# X_test / y_test are not used by the cells of this section.
split_options = {'test_size': 0.8, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(
    train_df.values, labels_df.values, **split_options)

In [14]:
# Small, shallow forest: it is only used to score features, not to predict.
forest = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=123)

# One binary forest per label, each wrapped in a single-step pipeline
# (the importance cell reads it back through named_steps).
classifier = OneVsRestClassifier(
    make_pipeline(forest),
    n_jobs=-1,  # use all processors
    verbose=1,
)
classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed: 15.2min finished


In [15]:
# Calculate the importance-based ordering of the feature indices.
# Each per-label estimator is a single-step pipeline wrapping a random forest;
# collect every forest's feature_importances_ vector.
# OneVsRestClassifier stores a constant predictor (which has no pipeline
# steps) for any label with a single class in the training subset, so skip
# those instead of failing on the missing `named_steps` attribute.
importances = [
    estimator.named_steps['randomforestclassifier'].feature_importances_
    for estimator in classifier.estimators_
    if hasattr(estimator, 'named_steps')
]
if not importances:
    raise ValueError('no fitted random forest found in classifier.estimators_')

# Average over labels: one score per feature.
importances = np.mean(importances, axis=0)
# Feature indices ordered from most to least important.
importance_indices = np.argsort(importances)[::-1]

In [16]:
# Reorder the embedding columns from most to least important and write the
# resulting arrays to disk (train first, then test).
ranked_train_columns = train_df.columns[importance_indices]
np.save('./input/esm2_train_embeds_ranked.npy',
        train_df.loc[:, ranked_train_columns].values)

ranked_test_columns = test_df.columns[importance_indices]
np.save('./input/esm2_test_embeds_ranked.npy',
        test_df.loc[:, ranked_test_columns].values)

## Rank ProtBERT features

In [17]:
# Prepare the data frames for the ProtBERT embeddings (emb_type='protbert').
# The result is unpacked into seven names; only train_df, test_df and
# labels_df are used by the cells of this section.
(
    train_terms_updated,
    train_protein_ids,
    test_protein_ids,
    train_df,
    test_df,
    labels_to_consider,
    labels_df,
) = prepare_dataframes(emb_type='protbert', n_labels=num_of_labels)

Reading data and preparing stuff...
Preparations done


In [18]:
# Hold out 80% of the rows so the forests are fitted on only 20% of the data,
# which saves time. random_state pins the split.
# X_test / y_test are not used by the cells of this section.
split_options = {'test_size': 0.8, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(
    train_df.values, labels_df.values, **split_options)

In [19]:
# Small, shallow forest: it is only used to score features, not to predict.
forest = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=123)

# One binary forest per label, each wrapped in a single-step pipeline
# (the importance cell reads it back through named_steps).
classifier = OneVsRestClassifier(
    make_pipeline(forest),
    n_jobs=-1,  # use all processors
    verbose=1,
)
classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  9.7min finished


In [20]:
# Calculate the importance-based ordering of the feature indices.
# Each per-label estimator is a single-step pipeline wrapping a random forest;
# collect every forest's feature_importances_ vector.
# OneVsRestClassifier stores a constant predictor (which has no pipeline
# steps) for any label with a single class in the training subset, so skip
# those instead of failing on the missing `named_steps` attribute.
importances = [
    estimator.named_steps['randomforestclassifier'].feature_importances_
    for estimator in classifier.estimators_
    if hasattr(estimator, 'named_steps')
]
if not importances:
    raise ValueError('no fitted random forest found in classifier.estimators_')

# Average over labels: one score per feature.
importances = np.mean(importances, axis=0)
# Feature indices ordered from most to least important.
importance_indices = np.argsort(importances)[::-1]

In [21]:
# Reorder the embedding columns from most to least important and write the
# resulting arrays to disk (train first, then test).
ranked_train_columns = train_df.columns[importance_indices]
np.save('./input/protbert_train_embeds_ranked.npy',
        train_df.loc[:, ranked_train_columns].values)

ranked_test_columns = test_df.columns[importance_indices]
np.save('./input/protbert_test_embeds_ranked.npy',
        test_df.loc[:, ranked_test_columns].values)