In [2]:
import pandas as pd
from os import path
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.utils.fixes import loguniform    
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle
import numpy as np
import os
import json
import csv

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [4]:
DATA_DIR = "../../data"

In [5]:
EVAL_DIR = "2021-11-28"

In [6]:
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
chen_train.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [7]:
len(chen_train)

1338

In [8]:
chen_valid = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
chen_test = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"))
chen_valid = pd.concat([chen_valid, chen_test])
chen_valid.head()

Unnamed: 0.1,Antibody_ID,heavy,light,Y,Unnamed: 0
2169,6ct7,EVQLVESGGGLVEPGGSLRLSCAVSGFDFEKAWMSWVRQAPGQGLQ...,SYELTQPPSVSVSPGQTARITCSGEALPMQFAHWYQQRPGKAPVIV...,0,
1342,4nzu,AVSLVESGGGTVEPGSTLRLSCAASGFTFGSYAFHWVRQAPGDGLE...,DIEMTQSPSSLSASTGDKVTITCQASQDIAKFLDWYQQRPGKTPKL...,0,
1728,5i8c,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,DIQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNL...,1,
1729,5i8e,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,IQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNLL...,0,
2114,6bb4,QVQLQQSDAELVKPGASVKISCKASGYTFTDRTIHWVKQRPEQGLE...,DVQMIQSPSSLSASLGDIVTMTCQASQDTSINLNWFQQKPGKAPKL...,0,


In [9]:
len(chen_valid)

239

In [10]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


# Models

In [11]:
def logistic_regression(n):
    lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    parameters = {'C':loguniform(0.001, 1000), 'penalty': ["l2"], "solver": ["lbfgs", "sag"]}
    return lr, parameters, "logistic_regression"

In [12]:
def random_forest(n):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    parameters = {'n_estimators': np.arange(1, 200, 10), 'max_depth': np.arange(1, min(50,n), 2), 
                  'max_features': np.arange(0.1, 0.75, 0.05)}
    return rf, parameters, "random_forest"

In [13]:
def multilayer_perceptron(n):
    mlp = MLPClassifier(random_state=42, max_iter=int(1000))
    parameters = {'hidden_layer_sizes': [(100,), (50,), (100, 100)], "activation": ["relu", "logistic"]}
    return mlp, parameters, "multilayer_perceptron"

In [14]:
def svm(n):
    svc = SVC(max_iter=8000, probability=True, class_weight='balanced')
    parameters = {'C': loguniform(0.001, 100), 'kernel':["linear", "rbf"], 'gamma': loguniform(1e-3, 1e0)}
    return svc, parameters, "SVM"

In [15]:
def gradient_boosting(n):
    gb = GradientBoostingClassifier(random_state=42, n_iter_no_change=70)
    parameters = {'learning_rate': loguniform(0.01, 0.5), 
                  'n_estimators': np.arange(1, 200, 10), 
                  'max_depth': np.arange(1, min(20,n), 2), 
                  'max_features': np.arange(0.1, 0.6, 0.1)}
    return gb, parameters, "gradient_boosting"

In [16]:
def output_evaluation(model_type, params, best_params, metrics, data, outpath, preprocessing=None, table="all"):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = os.path.join(DATA_DIR, "evaluations", outpath, f"{model_type}_{data}{prepro}.json")
    out_dict = {
        "model_type": model_type,
        "data": data
    }
    out_dict["params"] = {}
    out_dict["best_params"] = {}
    for key, value in params.items():
        out_dict["params"][key] = str(value)
    for key, value in best_params.items():
        out_dict["best_params"][key] = str(value)
    out_dict["metrics"] = metrics
    out_dict["preprocessing"] = "none" if preprocessing is None else preprocessing
    
    json.dump(out_dict, open(filename, "w"))
    
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/{table}.csv")
    #"../evaluations/all.csv"
    line = [model_type, data, out_dict["preprocessing"], metrics["f1"], metrics["mcc"], metrics["acc"],metrics["precision"],metrics["recall"],metrics["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [17]:
def train_and_eval(model_name, classifier, parameters, X_train, y_train, X_valid, y_valid, 
                   data_name, outpath, preprocessing=None):
    grid = RandomizedSearchCV(classifier, parameters, verbose=1, scoring="f1")
    grid.fit(X_train, y_train)
    estimator = grid.best_estimator_
    best_params = grid.best_params_
    y_pred = estimator.predict(X_valid)
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'wb') as f:
        pickle.dump(estimator, f)
    #pickle.dump(estimator, open(filename, "w"))
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    
    print(f"{model_name}, {data_name}")
    print(f"F1: {metric_dict['f1']}")
    print(f"MCC: {metric_dict['mcc']}")
    print(f"Accuracy: {metric_dict['acc']}")
    print(f"Precision: {metric_dict['precision']}")
    print(f"Recall: {metric_dict['recall']}")
    print(f"AUC: {metric_dict['auc']}")
    print(f"-----")
    
    output_evaluation(model_name, parameters, best_params, metric_dict, data_name, outpath, preprocessing=preprocessing)

In [18]:
def try_all(X_train, y_train, X_valid, y_valid, data_name, outpath, preprocessing=None):
    n = len(y_train)
    for model_creator in [logistic_regression, random_forest, gradient_boosting, svm, multilayer_perceptron]:
        classifier, params, model_label = model_creator(n)
        print("\n")
        print(f'Training model {model_label} on data {data_name} \n')
        train_and_eval(model_label, classifier, params, X_train, y_train, X_valid, y_valid, 
                   data_name, outpath, preprocessing=preprocessing)

In [19]:
def test_on_tap(model_name, x_test, y_test,
                   data_name, outpath, preprocessing=None):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'rb') as f:
        estimator = pickle.load(f)
    y_pred = estimator.predict(x_test)
    metric_dict = {
        "f1": float(metrics.f1_score(y_test, y_pred)),
        "acc": float(metrics.accuracy_score(y_test, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_test, y_pred)),
        "auc": float(metrics.roc_auc_score(y_test, y_pred)),
        "precision": float(metrics.precision_score(y_test, y_pred)),
        "recall": float(metrics.recall_score(y_test, y_pred))
    }
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/tap.csv")
    line = [model_name, data_name, prepro, metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [20]:
def test_all(x_test, y_test, data_name, outpath, preprocessing=None):
    for model in ["logistic_regression", "random_forest", "gradient_boosting", "SVM", "multilayer_perceptron"]:
        print(f"Testing model {model} on {data_name}...")
        test_on_tap(model, x_test, y_test, data_name, outpath, preprocessing=preprocessing)

# PyBioMed

In [21]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))
x_chen_train = x_chen.loc[chen_train.index]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
2073,6aod,7.692,5.983,0.855,5.983,1.709,4.274,5.983,10.256,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1517,4yny,6.838,5.128,1.709,5.128,1.709,3.419,4.274,11.111,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025,5xcv,6.838,5.128,1.709,5.128,2.564,3.419,4.274,11.111,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2070,6and,6.667,5.833,2.5,5.833,1.667,4.167,5.0,13.333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,2xqy,3.39,3.39,2.542,5.932,1.695,3.39,6.78,9.322,0.847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
print('Original dataset shape %s' % Counter(chen_train["Y"])) 

Original dataset shape Counter({0: 1057, 1: 281})


In [None]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [28]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))
x_tap.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,10.084,3.361,2.521,3.361,1.681,3.361,5.882,11.765,0.84,1.681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.627,4.237,1.695,3.39,1.695,5.085,5.932,10.169,0.847,2.542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.932,1.695,0.0,5.932,1.695,5.085,6.78,10.169,0.847,1.695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.738,6.557,4.098,5.738,1.639,3.279,4.918,11.475,0.82,3.279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.091,4.959,3.306,5.785,1.653,4.132,4.132,9.091,1.653,2.479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
try_all(x_chen_train.drop("Ab_ID", axis=1), y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "pybiomed", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistic_regression, pybiomed
F1: 0.30927835051546393
MCC: 0.13345994261998026
Accuracy: 0.7196652719665272
Precision: 0.3125
Recall: 0.30612244897959184
AUC: 0.5662191192266381
-----


Training model random_forest on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, pybiomed
F1: 0.23076923076923075
MCC: 0.09694811588700399
Accuracy: 0.7489539748953975
Precision: 0.3103448275862069
Recall: 0.1836734693877551
AUC: 0.5392051557465091
-----


Training model gradient_boosting on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, pybiomed
F1: 0.2597402597402597
MCC: 0.1372627495719451
Accuracy: 0.7615062761506276
Precision: 0.35714285714285715
Recall: 0.20408163265306123
AUC: 0.554672395273899
-----


Training model SVM on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, pybiomed
F1: 0.0625
MCC: -0.045950331000247154
Accuracy: 0.7489539748953975
Precision: 0.13333333333333333
Recall: 0.04081632653061224
AUC: 0.4861976369495167
-----


Training model multilayer_perceptron on data pybiomed 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, pybiomed
F1: 0.25
MCC: 0.15056856888478837
Accuracy: 0.7740585774058577
Precision: 0.391304347826087
Recall: 0.1836734693877551
AUC: 0.5549946294307196
-----


In [25]:
test_all(x_tap, tap_data["Y"], "pybiomed", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on pybiomed...
Testing model random_forest on pybiomed...
Testing model gradient_boosting on pybiomed...
Testing model SVM on pybiomed...
Testing model multilayer_perceptron on pybiomed...


In [23]:
x_chen_train = x_chen.loc[chen_train.index]

In [25]:
sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Ab_ID", axis=1), chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [26]:
try_all(x_chen_train, y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "pybiomed", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistic_regression, pybiomed
F1: 0.3653846153846154
MCC: 0.1901807693514209
Accuracy: 0.7238493723849372
Precision: 0.34545454545454546
Recall: 0.3877551020408163
AUC: 0.5991407089151449
-----


Training model random_forest on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, pybiomed
F1: 0.282051282051282
MCC: 0.1604292109472614
Accuracy: 0.7656903765690377
Precision: 0.3793103448275862
Recall: 0.22448979591836735
AUC: 0.5648764769065522
-----


Training model gradient_boosting on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, pybiomed
F1: 0.26666666666666666
MCC: 0.15542213783876196
Accuracy: 0.7698744769874477
Precision: 0.38461538461538464
Recall: 0.20408163265306123
AUC: 0.5599355531686359
-----


Training model SVM on data pybiomed 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, pybiomed
F1: 0.38095238095238093
MCC: 0.2084411088386525
Accuracy: 0.7280334728033473
Precision: 0.35714285714285715
Recall: 0.40816326530612246
AUC: 0.6093447905477981
-----


Training model multilayer_perceptron on data pybiomed 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, pybiomed
F1: 0.3870967741935484
MCC: 0.24011054483987018
Accuracy: 0.7615062761506276
Precision: 0.4090909090909091
Recall: 0.3673469387755102
AUC: 0.6152524167561761
-----


In [29]:
test_all(x_tap, tap_data["Y"], "pybiomed", EVAL_DIR, preprocessing="smote")

Testing model logistic_regression on pybiomed...
Testing model random_forest on pybiomed...
Testing model gradient_boosting on pybiomed...
Testing model SVM on pybiomed...
Testing model multilayer_perceptron on pybiomed...


# Protparam

In [30]:
x_chen = pd.read_csv(path.join(DATA_DIR, "chen/protparam/protparam_features.csv"))
x_chen.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
x_chen_train = x_chen.loc[chen_train.index]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

Unnamed: 0,Ab_ID,aa_percent0_x,aa_percent1_x,aa_percent2_x,aa_percent3_x,aa_percent4_x,aa_percent5_x,aa_percent6_x,aa_percent7_x,aa_percent8_x,...,flexibility_y,isoelectric_y,mol_extinct1_y,mol_extinct2_y,mw_y,gravy_y,ss_faction1_y,ss_faction2_y,ss_faction3_y,name
2073,1xiw,0.04918,0.016393,0.04918,0.040984,0.032787,0.090164,0.008197,0.016393,0.081967,...,1.000398,6.756409,19940,20065,11853.0516,-0.441121,0.28972,0.280374,0.168224,1xiw
1517,5b3j,0.057377,0.016393,0.032787,0.040984,0.02459,0.114754,0.0,0.02459,0.040984,...,1.001878,6.753845,17420,17545,11736.0293,-0.425472,0.301887,0.301887,0.160377,5b3j
2025,1j1x,0.026549,0.017699,0.070796,0.026549,0.00885,0.070796,0.0,0.035398,0.035398,...,1.002679,6.911682,16960,17085,11598.7277,-0.327103,0.280374,0.364486,0.158879,1j1x
2070,5f7e,0.067227,0.016807,0.05042,0.033613,0.033613,0.10084,0.008403,0.016807,0.042017,...,1.004044,4.57074,14440,14565,11255.3375,-0.405825,0.281553,0.300971,0.145631,5f7e
666,5b71,0.097561,0.01626,0.04065,0.03252,0.01626,0.113821,0.02439,0.01626,0.04065,...,1.00693,7.952576,12950,13075,11559.6424,-0.365455,0.245455,0.354545,0.154545,5b71


In [31]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [33]:
try_all(x_chen_train.drop(["Ab_ID", "name"], axis=1), y_chen_train, x_chen_valid.drop(["Ab_ID", "name"], axis=1), chen_valid["Y"], "protparam", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




logistic_regression, protparam
F1: 0.25675675675675674
MCC: -0.027290106983384053
Accuracy: 0.5397489539748954
Precision: 0.1919191919191919
Recall: 0.3877551020408163
AUC: 0.4833512352309345
-----


Training model random_forest on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




random_forest, protparam
F1: 0.1956521739130435
MCC: 0.004967244899887326
Accuracy: 0.6903765690376569
Precision: 0.20930232558139536
Recall: 0.1836734693877551
AUC: 0.5023630504833513
-----


Training model gradient_boosting on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, protparam
F1: 0.15730337078651685
MCC: -0.03333884288417728
Accuracy: 0.6861924686192469
Precision: 0.175
Recall: 0.14285714285714285
AUC: 0.4845864661654135
-----


Training model SVM on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, protparam
F1: 0.11904761904761904
MCC: -0.06377924843196299
Accuracy: 0.6903765690376569
Precision: 0.14285714285714285
Recall: 0.10204081632653061
AUC: 0.4720730397422126
-----


Training model multilayer_perceptron on data protparam 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, protparam
F1: 0.26027397260273977
MCC: -0.018721083534039826
Accuracy: 0.5481171548117155
Precision: 0.1958762886597938
Recall: 0.3877551020408163
AUC: 0.48861439312567134
-----


In [32]:
x_tap = pd.read_csv(path.join(DATA_DIR, "tap/protparam/protparam_features_tap.csv"))
x_tap.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
x_tap.head()

Unnamed: 0,Ab_ID,aa_percent0_x,aa_percent1_x,aa_percent2_x,aa_percent3_x,aa_percent4_x,aa_percent5_x,aa_percent6_x,aa_percent7_x,aa_percent8_x,...,instability_y,flexibility_y,isoelectric_y,mol_extinct1_y,mol_extinct2_y,mw_y,gravy_y,ss_faction1_y,ss_faction2_y,ss_faction3_y
0,Abagovomab,0.10084,0.016807,0.033613,0.033613,0.02521,0.117647,0.008403,0.016807,0.067227,...,53.693458,0.999564,7.973724,14440,14565,11556.8384,-0.257009,0.299065,0.299065,0.196262
1,Abituzumab,0.076271,0.016949,0.033898,0.050847,0.033898,0.101695,0.008475,0.025424,0.033898,...,42.514019,1.001379,8.586625,17420,17545,11762.9686,-0.452336,0.280374,0.299065,0.121495
2,Abrilumab,0.059322,0.016949,0.059322,0.050847,0.025424,0.101695,0.008475,0.016949,0.059322,...,40.151402,1.002818,7.970307,22460,22585,11548.7104,-0.335514,0.271028,0.336449,0.158879
3,Actoxumab,0.057377,0.016393,0.057377,0.032787,0.032787,0.114754,0.008197,0.032787,0.02459,...,51.517757,1.000328,8.682102,22460,22585,11530.7383,-0.260748,0.271028,0.317757,0.158879
4,Adalimumab,0.090909,0.016529,0.057851,0.041322,0.024793,0.090909,0.016529,0.024793,0.024793,...,46.585047,1.000522,9.428646,15930,16055,11664.9632,-0.402804,0.271028,0.28972,0.168224


In [34]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "protparam", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on protparam...
Testing model random_forest on protparam...
Testing model gradient_boosting on protparam...
Testing model SVM on protparam...
Testing model multilayer_perceptron on protparam...


In [35]:
x_chen_train = x_chen.loc[chen_train.index]

In [37]:
sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop(["Ab_ID", "name"], axis=1), chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [38]:
try_all(x_chen_train, y_chen_train, x_chen_valid.drop(["Ab_ID", "name"], axis=1), chen_valid["Y"], "protparam", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




logistic_regression, protparam
F1: 0.27272727272727276
MCC: -0.011009019878480176
Accuracy: 0.5313807531380753
Precision: 0.2
Recall: 0.42857142857142855
AUC: 0.4932330827067669
-----


Training model random_forest on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, protparam
F1: 0.23529411764705882
MCC: 0.0758927383717228
Accuracy: 0.7280334728033473
Precision: 0.2777777777777778
Recall: 0.20408163265306123
AUC: 0.5336197636949517
-----


Training model gradient_boosting on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, protparam
F1: 0.2
MCC: 0.050723030787746565
Accuracy: 0.7322175732217573
Precision: 0.25806451612903225
Recall: 0.16326530612244897
AUC: 0.5211063372717507
-----


Training model SVM on data protparam 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, protparam
F1: 0.2983425414364641
MCC: -0.0013080898147971149
Accuracy: 0.4686192468619247
Precision: 0.20454545454545456
Recall: 0.5510204081632653
AUC: 0.4991944146079484
-----


Training model multilayer_perceptron on data protparam 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, protparam
F1: 0.0
MCC: -0.06625479339924215
Accuracy: 0.7782426778242678
Precision: 0.0
Recall: 0.0
AUC: 0.48947368421052634
-----


In [39]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "protparam", EVAL_DIR, preprocessing="smote")

Testing model logistic_regression on protparam...
Testing model random_forest on protparam...
Testing model gradient_boosting on protparam...
Testing model SVM on protparam...
Testing model multilayer_perceptron on protparam...


# BERT Embeddings

In [40]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/bert/bert_chen_embeddings.ftr"))
x_chen_train = x_chen.loc[chen_train.index]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
2073,6aod,0.020196,-0.020793,-0.010549,-0.038375,0.066936,-0.022921,0.017874,-0.088493,-0.018926,...,0.001973,-0.003466,-0.014579,0.007682,-0.027888,0.026192,-0.040531,-0.010679,0.007133,-0.027332
1517,4yny,-0.007447,-0.058638,0.013139,-0.029136,0.074901,-0.025573,0.035983,-0.105274,-0.01449,...,0.008387,0.007312,-0.01375,-0.003369,0.007132,0.04241,-0.044542,-0.034221,-0.023036,-0.013044
2025,5xcv,-0.005372,-0.056795,0.012577,-0.031566,0.071131,-0.023377,0.037339,-0.102565,-0.00934,...,0.008306,0.003619,-0.015327,-0.003542,0.00372,0.043285,-0.046977,-0.033989,-0.025775,-0.012205
2070,6and,-0.004229,-0.043975,0.003673,-0.031739,0.070902,-0.022174,0.038274,-0.111665,-0.030382,...,0.006203,-0.016125,-0.021334,-0.005837,-0.025217,0.024086,-0.032259,-0.02348,-0.013043,-0.035664
666,2xqy,0.003754,-0.018278,-0.001506,-0.037899,0.050809,-0.026173,0.003026,-0.090757,0.005515,...,-0.00372,0.004341,-0.036547,-0.009119,-0.038372,0.023391,-0.030063,-0.003554,0.000142,-0.041985


In [41]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [42]:
try_all(x_chen_train.drop("Ab_ID", axis=1), y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "bert", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


logistic_regression, bert
F1: 0.34545454545454546
MCC: 0.15436255478528402
Accuracy: 0.698744769874477
Precision: 0.3114754098360656
Recall: 0.3877551020408163
AUC: 0.5833512352309345
-----


Training model random_forest on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


random_forest, bert
F1: 0.21621621621621617
MCC: 0.0973430625318641
Accuracy: 0.7573221757322176
Precision: 0.32
Recall: 0.16326530612244897
AUC: 0.5368958109559613
-----


Training model gradient_boosting on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, bert
F1: 0.16901408450704225
MCC: 0.05339910156767412
Accuracy: 0.7531380753138075
Precision: 0.2727272727272727
Recall: 0.12244897959183673
AUC: 0.5191192266380236
-----


Training model SVM on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
SVM, bert
F1: 0.40310077519379844
MCC: 0.21080214098455255
Accuracy: 0.6778242677824268
Precision: 0.325
Recall: 0.5306122448979592
AUC: 0.6232008592910849
-----


Training model multilayer_perceptron on data bert 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, bert
F1: 0.30000000000000004
MCC: 0.11494523488722636
Accuracy: 0.7071129707112971
Precision: 0.29411764705882354
Recall: 0.30612244897959184
AUC: 0.5583243823845329
-----


In [43]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/bert/bert_tap_embeddings.ftr"))
x_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abagovomab,-0.004248,-0.024501,-0.01133,-0.02717,0.062747,-0.024793,0.009313,-0.083316,-0.005339,...,-0.008674,-0.002387,-0.021563,0.001087,-0.020986,0.047104,-0.038736,-0.02178,-0.022153,-0.024539
1,Abituzumab,0.006593,-0.013591,-0.008454,-0.043601,0.065095,-0.016896,-0.001596,-0.090935,-0.00294,...,0.000486,-0.013063,-0.021852,-0.003531,-0.02462,0.027387,-0.041001,-0.025708,-0.016437,-0.034342
2,Abrilumab,0.019445,-0.002642,-0.011395,-0.058757,0.060623,-0.015046,0.006317,-0.083772,-0.006775,...,0.005834,-0.017189,-0.014557,0.003359,-0.035368,0.020287,-0.033941,-0.023549,-0.00872,-0.044038
3,Actoxumab,-0.006365,-0.043729,0.005978,-0.025613,0.067748,-0.009542,0.017723,-0.096801,-0.007752,...,-2.4e-05,-0.012986,-0.019753,-0.004326,-0.044124,0.019544,-0.039559,-0.015679,-0.008837,-0.043877
4,Adalimumab,-0.012995,-0.035269,0.014127,-0.042136,0.080592,-0.012831,0.031889,-0.091308,-0.018166,...,-0.002067,-0.022957,-0.021145,-0.002762,-0.058199,0.026781,-0.046815,-0.01011,-0.008785,-0.041761


In [45]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "bert", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on bert...
Testing model random_forest on bert...
Testing model gradient_boosting on bert...
Testing model SVM on bert...
Testing model multilayer_perceptron on bert...


In [47]:
x_chen_train = x_chen.loc[chen_train.index]

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Ab_ID", axis=1), chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [None]:
try_all(x_chen_train, y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "bert", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
logistic_regression, bert
F1: 0.32075471698113206
MCC: 0.12922774279890484
Accuracy: 0.698744769874477
Precision: 0.2982456140350877
Recall: 0.3469387755102041
AUC: 0.5682062298603652
-----


Training model random_forest on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "bert", EVAL_DIR, preprocessing="smote")

# SeqVec Embeddings

In [None]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/seqvec/seqvec_chen_embeddings.ftr"))
x_chen_train = x_chen.loc[chen_train.index]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

In [None]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train.drop("Ab_ID", axis=1), y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "seqvec", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/seqvec/seqvec_tap_embeddings.ftr"))
x_tap.head()

In [None]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "seqvec", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_chen_train = x_chen.loc[chen_train.index]

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Ab_ID", axis=1), chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train, y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "seqvec", EVAL_DIR, preprocessing="smote")

In [None]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "seqvec", EVAL_DIR, preprocessing="smote")

# One-hot Encoding

In [None]:
x_heavy = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_heavy_one_hot.ftr")).set_index("index")
# rows 1921 and 2097 could not be encoded
x_light = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_light_one_hot.ftr")).set_index("Id")
x_chen = x_heavy.merge(x_light, left_index=True, right_index=True, suffixes=["_h", "_l"])
x_chen.index = x_heavy.index
train_idx = list(chen_train.index)
train_idx.remove(1921)
train_idx.remove(2097)
x_chen_train = x_chen.loc[train_idx]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

In [None]:
train_labels = chen_train.loc[train_idx]["Y"]

In [None]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, train_labels) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train.drop(["Ab_ID_h", "Ab_ID_l"], axis=1), y_chen_train, x_chen_valid.drop(["Ab_ID_h", "Ab_ID_l"], axis=1), chen_valid["Y"], "onehot", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_tap_heavy = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_heavy_one_hot.ftr"))
x_tap_light = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_light_one_hot.ftr"))
x_tap = x_heavy.merge(x_light, left_index=True, right_index=True, suffixes=["_h", "_l"])

In [None]:
test_all(x_tap, tap_data["Y"], "onehot", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_chen_train = x_chen.loc[chen_train.index]

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train.drop("Ab_ID", axis=1), y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "onehot", EVAL_DIR, preprocessing="smote")

In [None]:
test_all(x_tap, tap_data["Y"], "onehot", EVAL_DIR, preprocessing="smote")