In [36]:
import pandas as pd
from os import path
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.utils.fixes import loguniform    
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import LeaveOneGroupOut
import pickle
import numpy as np
import os
import json
import csv

In [37]:
DATA_DIR = "../../data"

In [38]:
EVAL_DIR = "2021-12-03"

In [39]:
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
chen_train.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [40]:
chen_valid = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
chen_test = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"))
chen_valid = pd.concat([chen_valid, chen_test])
chen_valid.head()

Unnamed: 0.1,Antibody_ID,heavy,light,Y,Unnamed: 0
2169,6ct7,EVQLVESGGGLVEPGGSLRLSCAVSGFDFEKAWMSWVRQAPGQGLQ...,SYELTQPPSVSVSPGQTARITCSGEALPMQFAHWYQQRPGKAPVIV...,0,
1342,4nzu,AVSLVESGGGTVEPGSTLRLSCAASGFTFGSYAFHWVRQAPGDGLE...,DIEMTQSPSSLSASTGDKVTITCQASQDIAKFLDWYQQRPGKTPKL...,0,
1728,5i8c,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,DIQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNL...,1,
1729,5i8e,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,IQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNLL...,0,
2114,6bb4,QVQLQQSDAELVKPGASVKISCKASGYTFTDRTIHWVKQRPEQGLE...,DVQMIQSPSSLSASLGDIVTMTCQASQDTSINLNWFQQKPGKAPKL...,0,


In [41]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


In [42]:
chen_similarity = pd.read_csv(path.join(DATA_DIR, "chen/distances/deduplicated_anarci_similarity.csv"), index_col=0)
chen_similarity.head()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,...,2398,2400,2401,2402,2403,2404,2405,2406,2407,2408
0,1.0,0.777,0.7035,0.69995,0.65085,0.6735,0.66915,0.66915,0.761,0.718,...,0.5925,0.63185,0.6515,0.6185,0.7007,0.6055,0.54015,0.7046,0.6102,0.6284
1,0.777,1.0,0.75,0.735,0.69315,0.7024,0.6557,0.6557,0.74785,0.79775,...,0.596,0.65265,0.672,0.63085,0.6935,0.6075,0.5531,0.71665,0.6392,0.63185
2,0.7035,0.75,1.0,0.761,0.69315,0.6897,0.6455,0.6455,0.7085,0.7207,...,0.55975,0.605,0.6505,0.6212,0.6935,0.58815,0.55555,0.69965,0.5758,0.60875
3,0.69995,0.735,0.761,1.0,0.6616,0.687,0.655,0.655,0.68315,0.71335,...,0.578,0.63665,0.659,0.619,0.707,0.593,0.5495,0.67185,0.582,0.6137
4,0.65085,0.69315,0.69315,0.6616,1.0,0.63025,0.64,0.64,0.64965,0.68375,...,0.62485,0.6418,0.60255,0.6001,0.67985,0.6199,0.5434,0.7625,0.62135,0.6357


## Clustering

In [43]:
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data_w_clusters.csv"))

In [44]:
chen_train["cluster_merged"].head()

0    3
1    3
2    3
3    4
4    4
Name: cluster_merged, dtype: int64

# Models

In [45]:
# different usage from the other models
def knn(n):
    model = NearestNeighbors(metric="precomputed")
    parameters = {'n_neighbors': [1,3,5]}
    return model, parameters, "kNN"

In [46]:
def logistic_regression(n):
    lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    parameters = {'C':loguniform(0.001, 1000), 'penalty': ["l2"], "solver": ["lbfgs", "sag"]}
    return lr, parameters, "logistic_regression"

In [47]:
def random_forest(n):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    parameters = {'n_estimators': np.arange(1, 200, 10), 'max_depth': np.arange(1, min(50,n), 2), 
                  'max_features': np.arange(0.1, 0.75, 0.05)}
    return rf, parameters, "random_forest"

In [48]:
def multilayer_perceptron(n):
    mlp = MLPClassifier(random_state=42, max_iter=int(1000))
    parameters = {'hidden_layer_sizes': [(100,), (50,), (100, 100)], "activation": ["relu", "logistic"]}
    return mlp, parameters, "multilayer_perceptron"

In [49]:
def svm(n):
    svc = SVC(max_iter=8000, probability=True, class_weight='balanced')
    parameters = {'C': loguniform(0.001, 100), 'kernel':["linear", "rbf"], 'gamma': loguniform(1e-3, 1e0)}
    return svc, parameters, "SVM"

In [50]:
def gradient_boosting(n):
    gb = GradientBoostingClassifier(random_state=42, n_iter_no_change=70)
    parameters = {'learning_rate': loguniform(0.01, 0.5), 
                  'n_estimators': np.arange(1, 200, 10), 
                  'max_depth': np.arange(1, min(20,n), 2), 
                  'max_features': np.arange(0.1, 0.6, 0.1)}
    return gb, parameters, "gradient_boosting"

In [51]:
def output_evaluation(model_type, params, best_params, metrics, data, outpath, preprocessing=None):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = os.path.join(DATA_DIR, "evaluations", outpath, f"{model_type}_{data}{prepro}.json")
    out_dict = {
        "model_type": model_type,
        "data": data
    }
    out_dict["params"] = {}
    out_dict["best_params"] = {}
    for key, value in params.items():
        out_dict["params"][key] = str(value)
    for key, value in best_params.items():
        out_dict["best_params"][key] = str(value)
    out_dict["metrics"] = metrics
    out_dict["preprocessing"] = "none" if preprocessing is None else preprocessing
    
    json.dump(out_dict, open(filename, "w"))
    
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/all.csv")
    #"../evaluations/all.csv"
    line = [model_type, data, out_dict["preprocessing"], metrics["f1"], metrics["mcc"], metrics["acc"],metrics["precision"],metrics["recall"],metrics["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [52]:
def train_and_eval(model_name, classifier, parameters, X_train, y_train, X_valid, y_valid, groups,
                   data_name, outpath, preprocessing=None):
    splitter = LeaveOneGroupOut()
    split = splitter.split(X_train, y_train, groups=groups)
    grid = RandomizedSearchCV(classifier, parameters, verbose=1, scoring="f1", cv=split)
    grid.fit(X_train, y_train)
    estimator = grid.best_estimator_
    best_params = grid.best_params_
    y_pred = estimator.predict(X_valid)
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'wb') as f:
        pickle.dump(estimator, f)
    #pickle.dump(estimator, open(filename, "w"))
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    
    print(f"{model_name}, {data_name}")
    print(f"F1: {metric_dict['f1']}")
    print(f"MCC: {metric_dict['mcc']}")
    print(f"Accuracy: {metric_dict['acc']}")
    print(f"Precision: {metric_dict['precision']}")
    print(f"Recall: {metric_dict['recall']}")
    print(f"AUC: {metric_dict['auc']}")
    print(f"-----")
    
    output_evaluation(model_name, parameters, best_params, metric_dict, data_name, outpath, preprocessing=preprocessing)

In [53]:
def try_all(X_train, y_train, X_valid, y_valid, groups, data_name, outpath, preprocessing=None):
    n = len(y_train)
    for model_creator in [logistic_regression, random_forest, gradient_boosting, svm, multilayer_perceptron]:
        classifier, params, model_label = model_creator(n)
        print("\n")
        print(f'Training model {model_label} on data {data_name} \n')
        train_and_eval(model_label, classifier, params, X_train, y_train, X_valid, y_valid, groups, 
                   data_name, outpath, preprocessing=preprocessing)

In [54]:
def test_on_tap(model_name, x_test, y_test,
                   data_name, outpath, preprocessing=None):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'rb') as f:
        estimator = pickle.load(f)
    y_pred = estimator.predict(x_test)
    metric_dict = {
        "f1": float(metrics.f1_score(y_test, y_pred)),
        "acc": float(metrics.accuracy_score(y_test, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_test, y_pred)),
        "auc": float(metrics.roc_auc_score(y_test, y_pred)),
        "precision": float(metrics.precision_score(y_test, y_pred)),
        "recall": float(metrics.recall_score(y_test, y_pred))
    }
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/tap.csv")
    line = [model_name, data_name, prepro, metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [55]:
def test_all(x_test, y_test, data_name, outpath, preprocessing=None):
    for model in ["logistic_regression", "random_forest", "gradient_boosting", "SVM", "multilayer_perceptron"]:
        print(f"Testing model {model} on {data_name}...")
        test_on_tap(model, x_test, y_test, data_name, outpath, preprocessing=preprocessing)

# PyBioMed

In [56]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,12e8,7.5,3.333,2.5,5.833,1.667,5.833,5.833,9.167,1.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15c8,9.244,0.0,3.361,5.882,1.681,4.202,6.723,8.403,2.521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1a0q,5.882,1.681,3.361,5.042,1.681,5.882,5.042,9.244,1.681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1a14,6.667,2.5,4.167,5.0,1.667,2.5,7.5,11.667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1a2y,3.448,5.172,4.31,6.034,1.724,2.586,6.034,10.345,0.862,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
scaler = StandardScaler()
scaler.fit(x_chen_train.drop("Ab_ID", axis=1))
x_chen_train_tr = scaler.transform(x_chen_train.drop(["Ab_ID", "Y", "cluster_merged"], axis=1))
x_chen_valid_tr = scaler.transform(x_chen_valid.drop(["Ab_ID", "Y"], axis=1))

In [58]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))
x_tap.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,10.084,3.361,2.521,3.361,1.681,3.361,5.882,11.765,0.84,1.681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.627,4.237,1.695,3.39,1.695,5.085,5.932,10.169,0.847,2.542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.932,1.695,0.0,5.932,1.695,5.085,6.78,10.169,0.847,1.695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.738,6.557,4.098,5.738,1.639,3.279,4.918,11.475,0.82,3.279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.091,4.959,3.306,5.785,1.653,4.132,4.132,9.091,1.653,2.479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
#scaler = StandardScaler()
#scaler.fit(x_tap)
x_tap = scaler.transform(x_tap)

In [None]:
try_all(x_chen_train_tr, x_chen_train["Y"], x_chen_valid_tr, x_chen_valid["Y"], x_chen_train["cluster_merged"], "pybiomed", EVAL_DIR, preprocessing="StandardScaler")



Training model logistic_regression on data pybiomed 

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [None]:
test_all(x_tap, tap_data["Y"], "pybiomed", EVAL_DIR, preprocessing="StandardScaler")

# Protparam

In [None]:
x_chen = pd.read_csv(path.join(DATA_DIR, "chen/protparam/protparam_features.csv"))
x_chen.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

In [None]:
scaler = StandardScaler()
scaler.fit(x_chen_train.drop(["Ab_ID", "name", "cluster_merged", "Y"], axis=1))
x_chen_train = scaler.transform(x_chen_train.drop(["Ab_ID", "name", "cluster_merged", "Y"], axis=1))
x_chen_valid = scaler.transform(x_chen_valid.drop(["Ab_ID", "name", "Y"], axis=1))

In [None]:
try_all(x_chen_train, x_chen_train["Y"], x_chen_valid, x_chen_valid["Y"], x_chen_train["cluster_merged"], "protparam", EVAL_DIR, preprocessing="StandardScaler")

In [None]:
x_tap = pd.read_csv(path.join(DATA_DIR, "tap/protparam/protparam_features_tap.csv"))
x_tap.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
x_tap.head()

In [None]:
x_tap = scaler.transform(x_tap.drop("Ab_ID", axis=1))

In [None]:
test_all(x_tap, tap_data["Y"], "protparam", EVAL_DIR, preprocessing="StandardScaler")

# BERT Embeddings

In [35]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/bert/bert_chen_embeddings.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
2073,6aod,0.020196,-0.020793,-0.010549,-0.038375,0.066936,-0.022921,0.017874,-0.088493,-0.018926,...,0.001973,-0.003466,-0.014579,0.007682,-0.027888,0.026192,-0.040531,-0.010679,0.007133,-0.027332
1517,4yny,-0.007447,-0.058638,0.013139,-0.029136,0.074901,-0.025573,0.035983,-0.105274,-0.01449,...,0.008387,0.007312,-0.01375,-0.003369,0.007132,0.04241,-0.044542,-0.034221,-0.023036,-0.013044
2025,5xcv,-0.005372,-0.056795,0.012577,-0.031566,0.071131,-0.023377,0.037339,-0.102565,-0.00934,...,0.008306,0.003619,-0.015327,-0.003542,0.00372,0.043285,-0.046977,-0.033989,-0.025775,-0.012205
2070,6and,-0.004229,-0.043975,0.003673,-0.031739,0.070902,-0.022174,0.038274,-0.111665,-0.030382,...,0.006203,-0.016125,-0.021334,-0.005837,-0.025217,0.024086,-0.032259,-0.02348,-0.013043,-0.035664
666,2xqy,0.003754,-0.018278,-0.001506,-0.037899,0.050809,-0.026173,0.003026,-0.090757,0.005515,...,-0.00372,0.004341,-0.036547,-0.009119,-0.038372,0.023391,-0.030063,-0.003554,0.000142,-0.041985


In [36]:
scaler = StandardScaler()
scaler.fit(x_chen_train.drop("Ab_ID", axis=1))
x_chen_train_tr = scaler.transform(x_chen_train.drop(["Ab_ID", "Y", "cluster_merged"], axis=1))
x_chen_valid_tr = scaler.transform(x_chen_valid.drop(["Ab_ID", "Y"], axis=1))

In [37]:
try_all(x_chen_train_tr, x_chen_train["Y"], x_chen_valid_tr, x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "bert", EVAL_DIR, preprocessing="StandardScaler")



Training model logistic_regression on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


logistic_regression, bert
F1: 0.38095238095238093
MCC: 0.18215562599463012
Accuracy: 0.6736401673640168
Precision: 0.3116883116883117
Recall: 0.4897959183673469
AUC: 0.6054242749731472
-----


Training model random_forest on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, bert
F1: 0.3703703703703704
MCC: 0.14183612246217095
Accuracy: 0.5732217573221757
Precision: 0.26548672566371684
Recall: 0.6122448979591837
AUC: 0.5877013963480129
-----


Training model gradient_boosting on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
gradient_boosting, bert
F1: 0.1702127659574468
MCC: -0.032500171896566385
Accuracy: 0.6736401673640168
Precision: 0.17777777777777778
Recall: 0.16326530612244897
AUC: 0.4842642320085929
-----


Training model SVM on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, bert
F1: 0.37313432835820903
MCC: 0.16395849043023675
Accuracy: 0.6485355648535565
Precision: 0.29411764705882354
Recall: 0.5102040816326531
AUC: 0.5972073039742213
-----


Training model multilayer_perceptron on data bert 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, bert
F1: 0.3404255319148936
MCC: 0.17958286109399652
Accuracy: 0.7405857740585774
Precision: 0.35555555555555557
Recall: 0.32653061224489793
AUC: 0.5869495166487647
-----


In [38]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/bert/bert_tap_embeddings.ftr"))
x_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abagovomab,-0.004248,-0.024501,-0.01133,-0.02717,0.062747,-0.024793,0.009313,-0.083316,-0.005339,...,-0.008674,-0.002387,-0.021563,0.001087,-0.020986,0.047104,-0.038736,-0.02178,-0.022153,-0.024539
1,Abituzumab,0.006593,-0.013591,-0.008454,-0.043601,0.065095,-0.016896,-0.001596,-0.090935,-0.00294,...,0.000486,-0.013063,-0.021852,-0.003531,-0.02462,0.027387,-0.041001,-0.025708,-0.016437,-0.034342
2,Abrilumab,0.019445,-0.002642,-0.011395,-0.058757,0.060623,-0.015046,0.006317,-0.083772,-0.006775,...,0.005834,-0.017189,-0.014557,0.003359,-0.035368,0.020287,-0.033941,-0.023549,-0.00872,-0.044038
3,Actoxumab,-0.006365,-0.043729,0.005978,-0.025613,0.067748,-0.009542,0.017723,-0.096801,-0.007752,...,-2.4e-05,-0.012986,-0.019753,-0.004326,-0.044124,0.019544,-0.039559,-0.015679,-0.008837,-0.043877
4,Adalimumab,-0.012995,-0.035269,0.014127,-0.042136,0.080592,-0.012831,0.031889,-0.091308,-0.018166,...,-0.002067,-0.022957,-0.021145,-0.002762,-0.058199,0.026781,-0.046815,-0.01011,-0.008785,-0.041761


In [39]:
x_tap = scaler.transform(x_tap.drop("Ab_ID", axis=1))

In [40]:
test_all(x_tap, tap_data["Y"], "bert", EVAL_DIR, preprocessing="StandardScaler")

Testing model logistic_regression on bert...
Testing model random_forest on bert...
Testing model gradient_boosting on bert...
Testing model SVM on bert...
Testing model multilayer_perceptron on bert...


# SeqVec Embeddings

In [41]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/seqvec/seqvec_chen_embeddings.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
2073,6aod,0.03164,-0.191203,-0.268825,-0.134525,0.097435,0.00177,0.019496,0.118835,-0.136518,...,0.1044,-0.013639,0.13558,0.059035,-0.006357,-0.085073,0.009538,-0.000886,-0.01659,-0.071586
1517,4yny,0.063575,-0.219689,-0.285444,-0.134554,0.089054,-0.023122,0.004502,0.162193,-0.047376,...,-0.005196,-0.113235,0.048879,0.06119,-0.005911,-0.104957,0.026941,0.016444,-0.065021,-0.105957
2025,5xcv,0.06084,-0.227532,-0.280555,-0.136953,0.091411,-0.028528,0.00144,0.166315,-0.045701,...,-0.005788,-0.112651,0.049541,0.063261,-0.005133,-0.104734,0.02734,0.01749,-0.064584,-0.105592
2070,6and,0.062686,-0.220758,-0.278071,-0.137741,0.086655,-0.016114,-0.027874,0.158912,-0.119345,...,0.04129,-0.032623,0.144864,0.008824,-0.014455,-0.080329,-0.049089,0.033029,-0.017042,-0.044416
666,2xqy,-0.000458,-0.200469,-0.269982,-0.138054,0.112663,-0.007346,0.03744,0.075526,-0.178788,...,-0.020374,-0.059074,0.133179,0.018044,0.066013,-0.061189,-0.029644,-0.050942,-0.035482,-0.058019


In [42]:
scaler = StandardScaler()
scaler.fit(x_chen_train.drop("Ab_ID", axis=1))
x_chen_train_tr = scaler.transform(x_chen_train.drop(["Ab_ID", "Y", "cluster_merged"], axis=1))
x_chen_valid_tr = scaler.transform(x_chen_valid.drop(["Ab_ID", "Y"], axis=1))

In [43]:
try_all(x_chen_train_tr, x_chen_train["Y"], x_chen_valid_tr, x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "seqvec", EVAL_DIR, preprocessing="StandardScaler")



Training model logistic_regression on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




logistic_regression, seqvec
F1: 0.3909774436090226
MCC: 0.1905571071905387
Accuracy: 0.6610878661087866
Precision: 0.30952380952380953
Recall: 0.5306122448979592
AUC: 0.6126745435016112
-----


Training model random_forest on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, seqvec
F1: 0.42519685039370075
MCC: 0.24332445999849997
Accuracy: 0.694560669456067
Precision: 0.34615384615384615
Recall: 0.5510204081632653
AUC: 0.6412996777658432
-----


Training model gradient_boosting on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
gradient_boosting, seqvec
F1: 0.09090909090909091
MCC: -0.01956961135502369
Accuracy: 0.7489539748953975
Precision: 0.17647058823529413
Recall: 0.061224489795918366
AUC: 0.4937701396348013
-----


Training model SVM on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, seqvec
F1: 0.31067961165048547
MCC: 0.12214837841247742
Accuracy: 0.702928870292887
Precision: 0.2962962962962963
Recall: 0.32653061224489793
AUC: 0.563265306122449
-----


Training model multilayer_perceptron on data seqvec 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, seqvec
F1: 0.2916666666666667
MCC: 0.1137915268000298
Accuracy: 0.7154811715481172
Precision: 0.2978723404255319
Recall: 0.2857142857142857
AUC: 0.5560150375939849
-----


In [44]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/seqvec/seqvec_tap_embeddings.ftr"))
x_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abagovomab,-0.004248,-0.024501,-0.01133,-0.02717,0.062747,-0.024793,0.009313,-0.083316,-0.005339,...,-0.008674,-0.002387,-0.021563,0.001087,-0.020986,0.047104,-0.038736,-0.02178,-0.022153,-0.024539
1,Abituzumab,0.006593,-0.013591,-0.008454,-0.043601,0.065095,-0.016896,-0.001596,-0.090935,-0.00294,...,0.000486,-0.013063,-0.021852,-0.003531,-0.02462,0.027387,-0.041001,-0.025708,-0.016437,-0.034342
2,Abrilumab,0.019445,-0.002642,-0.011395,-0.058757,0.060623,-0.015046,0.006317,-0.083772,-0.006775,...,0.005834,-0.017189,-0.014557,0.003359,-0.035368,0.020287,-0.033941,-0.023549,-0.00872,-0.044038
3,Actoxumab,-0.006365,-0.043729,0.005978,-0.025613,0.067748,-0.009542,0.017723,-0.096801,-0.007752,...,-2.4e-05,-0.012986,-0.019753,-0.004326,-0.044124,0.019544,-0.039559,-0.015679,-0.008837,-0.043877
4,Adalimumab,-0.012995,-0.035269,0.014127,-0.042136,0.080592,-0.012831,0.031889,-0.091308,-0.018166,...,-0.002067,-0.022957,-0.021145,-0.002762,-0.058199,0.026781,-0.046815,-0.01011,-0.008785,-0.041761


In [45]:
x_tap = scaler.transform(x_tap.drop("Ab_ID", axis=1))

In [46]:
test_all(x_tap, tap_data["Y"], "seqvec", EVAL_DIR, preprocessing="StandardScaler")

Testing model logistic_regression on seqvec...
Testing model random_forest on seqvec...
Testing model gradient_boosting on seqvec...
Testing model SVM on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Testing model multilayer_perceptron on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))


# One-hot Encoding

In [47]:
x_heavy = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_heavy_one_hot.ftr")).set_index("index")
# rows 1921 and 2097 could not be encoded
x_light = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_light_one_hot.ftr")).set_index("Id")
x_chen = x_heavy.merge(x_light, left_index=True, right_index=True, suffixes=["_h", "_l"])
x_chen.index = x_heavy.index
train_idx = list(chen_train.index)
train_idx.remove(1921)
train_idx.remove(2097)
x_chen_train = x_chen.loc[train_idx]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

Unnamed: 0_level_0,Ab_ID_h,A_1_h,C_1_h,D_1_h,E_1_h,F_1_h,G_1_h,H_1_h,I_1_h,K_1_h,...,O_138_l,P_138_l,Q_138_l,R_138_l,S_138_l,T_138_l,U_138_l,V_138_l,W_138_l,Y_138_l
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2073,6aod,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1517,4yny,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025,5xcv,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2070,6and,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
666,2xqy,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
train_labels = chen_train.loc[train_idx]["Y"]

In [50]:
scaler = StandardScaler()
scaler.fit(x_chen_train.drop(["Ab_ID_h", "Ab_ID_l"], axis=1))
x_chen_train = scaler.transform(x_chen_train.drop(["Ab_ID_h", "Ab_ID_l"], axis=1))
x_chen_valid = scaler.transform(x_chen_valid.drop(["Ab_ID_h", "Ab_ID_l"], axis=1))

In [53]:
# add groups for data split
try_all(x_chen_train, train_labels, x_chen_valid, chen_valid["Y"], "onehot", EVAL_DIR, preprocessing="StandardScaler")



Training model logistic_regression on data onehot 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistic_regression, onehot
F1: 0.375
MCC: 0.19015409084172083
Accuracy: 0.7071129707112971
Precision: 0.3333333333333333
Recall: 0.42857142857142855
AUC: 0.6037593984962406
-----


Training model random_forest on data onehot 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, onehot
F1: 0.3943661971830986
MCC: 0.18989133017294033
Accuracy: 0.6401673640167364
Precision: 0.3010752688172043
Recall: 0.5714285714285714
AUC: 0.6146616541353385
-----


Training model gradient_boosting on data onehot 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
gradient_boosting, onehot
F1: 0.22988505747126436
MCC: 0.06261368369902223
Accuracy: 0.7196652719665272
Precision: 0.2631578947368421
Recall: 0.20408163265306123
AUC: 0.5283566058002148
-----


Training model SVM on data onehot 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
SVM, onehot
F1: 0.25490196078431376
MCC: 0.05323544989970825
Accuracy: 0.6820083682008368
Precision: 0.24528301886792453




multilayer_perceptron, onehot
F1: 0.3035714285714286
MCC: 0.09606127984550701
Accuracy: 0.6736401673640168
Precision: 0.2698412698412698
Recall: 0.3469387755102041
AUC: 0.5524167561761547
-----


In [58]:
x_tap_heavy = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_heavy_one_hot.ftr"))
x_tap_light = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_light_one_hot.ftr"))
x_tap = x_tap_heavy.merge(x_tap_light, left_index=True, right_index=True, suffixes=["_h", "_l"])

In [59]:
x_tap = scaler.transform(x_tap.drop(["Ab_ID_h", "Ab_ID_l"], axis=1))

Feature names unseen at fit time:
- A_121
- A_122
- A_123
- A_124
- A_125
- ...
Feature names seen at fit time, yet now missing:
- A_121_h
- A_121_l
- A_122_h
- A_122_l
- A_123_h
- ...



ValueError: X has 5634 features, but StandardScaler is expecting 7524 features as input.

In [60]:
test_all(x_tap, tap_data["Y"], "onehot", EVAL_DIR, preprocessing="StandardScaler")

Testing model logistic_regression on onehot...


  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: could not convert string to float: 'Abagovomab'