In [None]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from scripts.helper_scripts.f1_max_score import count_f1_max

In [None]:
train_folder = ''
test_folder = ''
valid_folder = ''

files_train = os.listdir(train_folder)
files_test = os.listdir(test_folder)
files_valid = os.listdir(valid_folder)

In [None]:
features_train = []
labels_mf_train = []
labels_cc_train = []
labels_bp_train = []

for file in tqdm(files_train):
    with open(os.path.join(train_folder, file), 'rb') as f:
        data = pickle.load(f)
        for key in data.keys():
            features_train.append(data[key]['embeddings'].mean(axis=0))
            labels_mf_train.append(data[key]['label_MF'])
            labels_cc_train.append(data[key]['label_CC'])
            labels_bp_train.append(data[key]['label_BP'])

  0%|          | 0/26225 [00:00<?, ?it/s]

In [None]:
features_test = []
labels_mf_test = []
labels_cc_test = []
labels_bp_test = []
for file in tqdm(files_test):
    with open(os.path.join(test_folder, file), 'rb') as f:
        data = pickle.load(f)
        for key in data.keys():
            features_test.append(data[key]['embeddings'].mean(axis=0))
            labels_mf_test.append(data[key]['label_MF'])
            labels_cc_test.append(data[key]['label_CC'])
            labels_bp_test.append(data[key]['label_BP'])

y_test_mf = np.array(labels_mf_test)
y_test_cc = np.array(labels_cc_test)
y_test_bp = np.array(labels_bp_test)

features_valid = []
labels_mf_valid = []
labels_cc_valid = []
labels_bp_valid = []
for file in tqdm(files_valid):
    with open(os.path.join(valid_folder, file), 'rb') as f:
        data = pickle.load(f)
        for key in data.keys():
            features_valid.append(data[key]['averaged_embedding'])
            labels_mf_valid.append(data[key]['label_MF'])
            labels_cc_valid.append(data[key]['label_CC'])
            labels_bp_valid.append(data[key]['label_BP'])

y_valid_mf = np.array(labels_mf_valid)
y_valid_cc = np.array(labels_cc_valid)
y_valid_bp = np.array(labels_bp_valid)

  0%|          | 0/3350 [00:00<?, ?it/s]

# KNN

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

### On train

In [10]:
len(features_test)

3350

In [11]:
clf_mf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3, weights='distance')).fit(features_train, labels_mf_train)
preds_mf = clf_mf.predict_proba(features_test)
print(f'F1 score MF, max: {count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(3350) for x in preds_mf]).T, y_test_mf)}')

F1 score MF, max: 0.599583089351654


In [12]:
clf_cc = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3, weights='distance')).fit(features_train, labels_cc_train)
preds_cc = clf_cc.predict_proba(features_test)
print(f'F1 score CC, max: {count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(3350) for x in preds_cc]).T, y_test_cc)}')

F1 score CC, max: 0.481135755777359


In [13]:
clf_bp = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3)).fit(features_train, labels_bp_train)
preds_bp = clf_bp.predict_proba(features_test)
print(f'F1 score BP, max: {count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(3350) for x in preds_bp]).T, y_test_bp)}')

F1 score BP, max: 0.47102034091949463


### K-fold cross-val

In [15]:
from sklearn.model_selection import KFold

In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=92)

f1_scores_mf = []
f1_scores_cc = []
f1_scores_bp = []

for train_index, test_index in kf.split(features_valid):
    X_train = [features_valid[i] for i in train_index]
    X_test = [features_valid[i] for i in test_index]
    y_mf = y_valid_mf[train_index]
    y_mf_t = y_valid_mf[test_index]
    y_cc = y_valid_cc[train_index]
    y_cc_t = y_valid_cc[test_index]
    y_bp = y_valid_bp[train_index]
    y_bp_t = y_valid_bp[test_index]

    n_samples = len(y_mf_t)

    clf_mf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3)).fit(X_train, y_mf)
    preds_mf = clf_mf.predict_proba(X_test)
    f1_scores_mf.append(count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(n_samples) for x in preds_mf]).T, y_mf_t))

    clf_cc = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3)).fit(X_train, y_cc)
    preds_cc = clf_cc.predict_proba(X_test)
    f1_scores_cc.append(count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(n_samples) for x in preds_cc]).T, y_cc_t))

    clf_bp = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=3)).fit(X_train, y_bp)
    preds_bp = clf_bp.predict_proba(X_test)
    f1_scores_bp.append(count_f1_max(np.array([x[:, 1] if x.shape[1] == 2 else np.zeros(n_samples) for x in preds_bp]).T, y_bp_t))

print(f'F1 score MF, max: {np.mean(f1_scores_mf)}, std: {np.std(f1_scores_mf)}')
print(f'F1 score CC, max: {np.mean(f1_scores_cc)}, std: {np.std(f1_scores_cc)}')
print(f'F1 score BP, max: {np.mean(f1_scores_bp)}, std: {np.std(f1_scores_bp)}')

F1 score MF, max: 0.2773600876331329, std: 0.01475842601081803
F1 score CC, max: 0.18349820375442505, std: 0.018393149493804224
F1 score BP, max: 0.25529301166534424, std: 0.014278667036727534


# NN

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
class Baseline(nn.Module):
    def __init__(self, input_size, output_size):
        super(Baseline, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(512, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

In [14]:
def train_epoch(model, train_loader, optimizer, criterion, cuda=True):
    model.train()
    train_loss = 0
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        if cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.cpu().item()
    return train_loss / len(train_loader)

def evaluate(model, test_loader, criterion, cuda=True, length=1261):
    model.eval()
    test_loss = 0
    preds = []
    labels_gt = []
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            if cuda:
                inputs = inputs.cuda()
            outputs = model(inputs)
            preds.extend(F.sigmoid(outputs).cpu().numpy())
            labels_gt.extend(labels.cpu().numpy())
    return count_f1_max(np.array(preds).reshape(-1, length), np.array(labels_gt).reshape(-1, length))

def train(model, train_loader, test_loader, optimizer, criterion, epochs, cuda=True, scheduler=None):
    for epoch in tqdm(range(epochs)):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, cuda)
        valid_f1 = evaluate(model, test_loader, criterion, cuda)
        if scheduler is not None:
            scheduler.step(valid_f1)
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Valid F1: {valid_f1:.4f}')

### Regular train

In [21]:
files_valid = os.listdir('/media/ssd-3t-2/amiftakhova/tda_proteins/avg_embeddings_valid')
features_valid = []
labels_mf_valid = []
labels_cc_valid = []
labels_bp_valid = []
for file in tqdm(files_valid):
    with open('/media/ssd-3t-2/amiftakhova/tda_proteins/avg_embeddings_valid/'+file, 'rb') as f:
        data = pickle.load(f)
        for key in data.keys():
            features_valid.append(data[key]['averaged_embedding'])
            labels_mf_valid.append(data[key]['label_MF'])
            labels_cc_valid.append(data[key]['label_CC'])
            labels_bp_valid.append(data[key]['label_BP'])

  0%|          | 0/1261 [00:00<?, ?it/s]

In [30]:
LR = 1e-3
BATCH_SIZE = 64

train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_train), torch.Tensor(labels_mf_train)), batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_valid), torch.Tensor(labels_mf_valid)), batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_test), torch.Tensor(labels_mf_test)), batch_size=BATCH_SIZE, shuffle=False)

out_features = len(y_test_mf[0])
model_mf = Baseline(1280, out_features).cuda()
optimizer = torch.optim.Adam(model_mf.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

train(model_mf, train_loader, valid_loader, optimizer, criterion, epochs=50)

f1_test = evaluate(model_mf, test_loader, criterion, cuda=True, length=len(y_test_mf))

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1/50, Train Loss: 0.1566, Valid F1: 0.3343
Epoch 2/50, Train Loss: 0.0297, Valid F1: 0.5074
Epoch 3/50, Train Loss: 0.0236, Valid F1: 0.5798
Epoch 4/50, Train Loss: 0.0198, Valid F1: 0.6234
Epoch 5/50, Train Loss: 0.0175, Valid F1: 0.6698
Epoch 6/50, Train Loss: 0.0160, Valid F1: 0.6832
Epoch 7/50, Train Loss: 0.0147, Valid F1: 0.7056
Epoch 8/50, Train Loss: 0.0139, Valid F1: 0.7166
Epoch 9/50, Train Loss: 0.0131, Valid F1: 0.7277
Epoch 10/50, Train Loss: 0.0124, Valid F1: 0.7389
Epoch 11/50, Train Loss: 0.0118, Valid F1: 0.7363
Epoch 12/50, Train Loss: 0.0112, Valid F1: 0.7516
Epoch 13/50, Train Loss: 0.0107, Valid F1: 0.7557
Epoch 14/50, Train Loss: 0.0103, Valid F1: 0.7695
Epoch 15/50, Train Loss: 0.0098, Valid F1: 0.7696
Epoch 16/50, Train Loss: 0.0095, Valid F1: 0.7749
Epoch 17/50, Train Loss: 0.0093, Valid F1: 0.7743
Epoch 18/50, Train Loss: 0.0090, Valid F1: 0.7734
Epoch 19/50, Train Loss: 0.0087, Valid F1: 0.7832
Epoch 20/50, Train Loss: 0.0085, Valid F1: 0.7813
Epoch 21/

In [32]:
f1_test

0.6438495516777039

In [34]:
LR = 1e-3
BATCH_SIZE = 64

train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_train), torch.Tensor(labels_cc_train)), batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_valid), torch.Tensor(labels_cc_valid)), batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_test), torch.Tensor(labels_cc_test)), batch_size=BATCH_SIZE, shuffle=False)

out_features = len(y_test_cc[0])
model_cc = Baseline(1280, out_features).cuda()
optimizer = torch.optim.Adam(model_cc.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

train(model_cc, train_loader, valid_loader, optimizer, criterion, epochs=50)

f1_test = evaluate(model_cc, test_loader, criterion, cuda=True, length=len(y_test_cc))

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1/50, Train Loss: 0.1477, Valid F1: 0.3634
Epoch 2/50, Train Loss: 0.0235, Valid F1: 0.4046
Epoch 3/50, Train Loss: 0.0212, Valid F1: 0.4563
Epoch 4/50, Train Loss: 0.0193, Valid F1: 0.4678
Epoch 5/50, Train Loss: 0.0185, Valid F1: 0.4849
Epoch 6/50, Train Loss: 0.0170, Valid F1: 0.5005
Epoch 7/50, Train Loss: 0.0163, Valid F1: 0.5233
Epoch 8/50, Train Loss: 0.0155, Valid F1: 0.5283
Epoch 9/50, Train Loss: 0.0149, Valid F1: 0.5203
Epoch 10/50, Train Loss: 0.0144, Valid F1: 0.5495
Epoch 11/50, Train Loss: 0.0139, Valid F1: 0.5368
Epoch 12/50, Train Loss: 0.0133, Valid F1: 0.5602
Epoch 13/50, Train Loss: 0.0128, Valid F1: 0.5699
Epoch 14/50, Train Loss: 0.0124, Valid F1: 0.5789
Epoch 15/50, Train Loss: 0.0121, Valid F1: 0.5930
Epoch 16/50, Train Loss: 0.0116, Valid F1: 0.5766
Epoch 17/50, Train Loss: 0.0113, Valid F1: 0.5774
Epoch 18/50, Train Loss: 0.0109, Valid F1: 0.5865
Epoch 19/50, Train Loss: 0.0106, Valid F1: 0.5934
Epoch 20/50, Train Loss: 0.0103, Valid F1: 0.6054
Epoch 21/

In [35]:
f1_test

0.48056766390800476

In [36]:
LR = 1e-3
BATCH_SIZE = 64

train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_train), torch.Tensor(labels_bp_train)), batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_valid), torch.Tensor(labels_bp_valid)), batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.Tensor(features_test), torch.Tensor(labels_bp_test)), batch_size=BATCH_SIZE, shuffle=False)

out_features = len(y_test_bp[0])
model_bp = Baseline(1280, out_features).cuda()
optimizer = torch.optim.Adam(model_bp.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

train(model_bp, train_loader, valid_loader, optimizer, criterion, epochs=50)

f1_test = evaluate(model_bp, test_loader, criterion, cuda=True, length=len(y_test_bp))

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1/50, Train Loss: 0.1517, Valid F1: 0.2473
Epoch 2/50, Train Loss: 0.0277, Valid F1: 0.3073
Epoch 3/50, Train Loss: 0.0252, Valid F1: 0.3459
Epoch 4/50, Train Loss: 0.0237, Valid F1: 0.3838
Epoch 5/50, Train Loss: 0.0221, Valid F1: 0.4101
Epoch 6/50, Train Loss: 0.0209, Valid F1: 0.4329
Epoch 7/50, Train Loss: 0.0201, Valid F1: 0.4520
Epoch 8/50, Train Loss: 0.0191, Valid F1: 0.4715
Epoch 9/50, Train Loss: 0.0182, Valid F1: 0.4786
Epoch 10/50, Train Loss: 0.0175, Valid F1: 0.4977
Epoch 11/50, Train Loss: 0.0170, Valid F1: 0.4948
Epoch 12/50, Train Loss: 0.0163, Valid F1: 0.5209
Epoch 13/50, Train Loss: 0.0159, Valid F1: 0.5306
Epoch 14/50, Train Loss: 0.0153, Valid F1: 0.5383
Epoch 15/50, Train Loss: 0.0149, Valid F1: 0.5395
Epoch 16/50, Train Loss: 0.0144, Valid F1: 0.5481
Epoch 17/50, Train Loss: 0.0141, Valid F1: 0.5585
Epoch 18/50, Train Loss: 0.0137, Valid F1: 0.5617
Epoch 19/50, Train Loss: 0.0134, Valid F1: 0.5659
Epoch 20/50, Train Loss: 0.0130, Valid F1: 0.5676
Epoch 21/

In [37]:
f1_test

0.45557740330696106