In [74]:
import pandas as pd
from os import path
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.utils.fixes import loguniform    
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import LeaveOneGroupOut
import pickle
import numpy as np
import os
import json
import csv

In [75]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [76]:
DATA_DIR = "../../data"

In [77]:
EVAL_DIR = "2021-12-03"

In [78]:
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
chen_train.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [79]:
len(chen_train)

1338

In [80]:
chen_valid = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
chen_test = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"))
chen_valid = pd.concat([chen_valid, chen_test])
chen_valid.head()

Unnamed: 0.1,Antibody_ID,heavy,light,Y,Unnamed: 0
2169,6ct7,EVQLVESGGGLVEPGGSLRLSCAVSGFDFEKAWMSWVRQAPGQGLQ...,SYELTQPPSVSVSPGQTARITCSGEALPMQFAHWYQQRPGKAPVIV...,0,
1342,4nzu,AVSLVESGGGTVEPGSTLRLSCAASGFTFGSYAFHWVRQAPGDGLE...,DIEMTQSPSSLSASTGDKVTITCQASQDIAKFLDWYQQRPGKTPKL...,0,
1728,5i8c,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,DIQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNL...,1,
1729,5i8e,QEVLVQSGAEVKKPGASVKVSCRAFGYTFTGNALHWVRQAPGQGLE...,IQLTQSPSFLSASVGDKVTITCRASQGVRNELAWYQQKPGKAPNLL...,0,
2114,6bb4,QVQLQQSDAELVKPGASVKISCKASGYTFTDRTIHWVKQRPEQGLE...,DVQMIQSPSSLSASLGDIVTMTCQASQDTSINLNWFQQKPGKAPKL...,0,


In [81]:
len(chen_valid)

239

In [82]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


## Data clustering

In [83]:
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data_w_clusters.csv"), index_col=0)
chen_train["cluster_merged"].value_counts()

0    276
3    189
7    147
5    137
1    132
4    129
6    120
8    106
9     59
2     43
Name: cluster_merged, dtype: int64

# Models

In [84]:
def logistic_regression(n):
    lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    parameters = {'C':loguniform(0.001, 1000), 'penalty': ["l2"], "solver": ["lbfgs", "sag"]}
    return lr, parameters, "logistic_regression"

In [85]:
def random_forest(n):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    parameters = {'n_estimators': np.arange(1, 200, 10), 'max_depth': np.arange(1, min(50,n), 2), 
                  'max_features': np.arange(0.1, 0.75, 0.05)}
    return rf, parameters, "random_forest"

In [86]:
def multilayer_perceptron(n):
    mlp = MLPClassifier(random_state=42, max_iter=int(1000))
    parameters = {'hidden_layer_sizes': [(100,), (50,), (100, 100)], "activation": ["relu", "logistic"]}
    return mlp, parameters, "multilayer_perceptron"

In [87]:
def svm(n):
    svc = SVC(max_iter=8000, probability=True, class_weight='balanced')
    parameters = {'C': loguniform(0.001, 100), 'kernel':["linear", "rbf"], 'gamma': loguniform(1e-3, 1e0)}
    return svc, parameters, "SVM"

In [88]:
def gradient_boosting(n):
    gb = GradientBoostingClassifier(random_state=42, n_iter_no_change=70)
    parameters = {'learning_rate': loguniform(0.01, 0.5), 
                  'n_estimators': np.arange(1, 200, 10), 
                  'max_depth': np.arange(1, min(20,n), 2), 
                  'max_features': np.arange(0.1, 0.6, 0.1)}
    return gb, parameters, "gradient_boosting"

In [89]:
def output_evaluation(model_type, params, best_params, metrics, data, outpath, preprocessing=None, table="all"):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = os.path.join(DATA_DIR, "evaluations", outpath, f"{model_type}_{data}{prepro}.json")
    out_dict = {
        "model_type": model_type,
        "data": data
    }
    out_dict["params"] = {}
    out_dict["best_params"] = {}
    for key, value in params.items():
        out_dict["params"][key] = str(value)
    for key, value in best_params.items():
        out_dict["best_params"][key] = str(value)
    out_dict["metrics"] = metrics
    out_dict["preprocessing"] = "none" if preprocessing is None else preprocessing
    
    json.dump(out_dict, open(filename, "w"))
    
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/{table}.csv")
    #"../evaluations/all.csv"
    line = [model_type, data, out_dict["preprocessing"], metrics["f1"], metrics["mcc"], metrics["acc"],metrics["precision"],metrics["recall"],metrics["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [90]:
def train_and_eval(model_name, classifier, parameters, X_train, y_train, X_valid, y_valid, groups,
                   data_name, outpath, preprocessing=None):
    splitter = LeaveOneGroupOut()
    split = splitter.split(X_train, y_train, groups=groups)
    grid = RandomizedSearchCV(classifier, parameters, verbose=1, scoring="f1", cv=split)
    grid.fit(X_train, y_train)
    estimator = grid.best_estimator_
    best_params = grid.best_params_
    y_pred = estimator.predict(X_valid)
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'wb') as f:
        pickle.dump(estimator, f)
    #pickle.dump(estimator, open(filename, "w"))
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    
    print(f"{model_name}, {data_name}")
    print(f"F1: {metric_dict['f1']}")
    print(f"MCC: {metric_dict['mcc']}")
    print(f"Accuracy: {metric_dict['acc']}")
    print(f"Precision: {metric_dict['precision']}")
    print(f"Recall: {metric_dict['recall']}")
    print(f"AUC: {metric_dict['auc']}")
    print(f"-----")
    
    output_evaluation(model_name, parameters, best_params, metric_dict, data_name, outpath, preprocessing=preprocessing)

In [91]:
def try_all(X_train, y_train, X_valid, y_valid, groups, data_name, outpath, preprocessing=None):
    n = len(y_train)
    for model_creator in [logistic_regression, random_forest, gradient_boosting, svm, multilayer_perceptron]:
        classifier, params, model_label = model_creator(n)
        print("\n")
        print(f'Training model {model_label} on data {data_name} \n')
        train_and_eval(model_label, classifier, params, X_train, y_train, X_valid, y_valid, groups,
                   data_name, outpath, preprocessing=preprocessing)

In [92]:
def test_on_tap(model_name, x_test, y_test,
                   data_name, outpath, preprocessing=None):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'rb') as f:
        estimator = pickle.load(f)
    y_pred = estimator.predict(x_test)
    metric_dict = {
        "f1": float(metrics.f1_score(y_test, y_pred)),
        "acc": float(metrics.accuracy_score(y_test, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_test, y_pred)),
        "auc": float(metrics.roc_auc_score(y_test, y_pred)),
        "precision": float(metrics.precision_score(y_test, y_pred)),
        "recall": float(metrics.recall_score(y_test, y_pred))
    }
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/tap.csv")
    line = [model_name, data_name, prepro, metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [93]:
def test_all(x_test, y_test, data_name, outpath, preprocessing=None):
    for model in ["logistic_regression", "random_forest", "gradient_boosting", "SVM", "multilayer_perceptron"]:
        print(f"Testing model {model} on {data_name}...")
        test_on_tap(model, x_test, y_test, data_name, outpath, preprocessing=preprocessing)

# PyBioMed

In [94]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
2073,6aod,7.692,5.983,0.855,5.983,1.709,4.274,5.983,10.256,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1517,4yny,6.838,5.128,1.709,5.128,1.709,3.419,4.274,11.111,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025,5xcv,6.838,5.128,1.709,5.128,2.564,3.419,4.274,11.111,0.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2070,6and,6.667,5.833,2.5,5.833,1.667,4.167,5.0,13.333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,2xqy,3.39,3.39,2.542,5.932,1.695,3.39,6.78,9.322,0.847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
print('Original dataset shape %s' % Counter(chen_train["Y"])) 

Original dataset shape Counter({0: 1057, 1: 281})


In [96]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Y", axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({0: 1057, 1: 1057})


In [None]:
try_all(x_chen_train.drop(["Ab_ID", "cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "pybiomed", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data pybiomed 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [98]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))
x_tap.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,10.084,3.361,2.521,3.361,1.681,3.361,5.882,11.765,0.84,1.681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.627,4.237,1.695,3.39,1.695,5.085,5.932,10.169,0.847,2.542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.932,1.695,0.0,5.932,1.695,5.085,6.78,10.169,0.847,1.695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.738,6.557,4.098,5.738,1.639,3.279,4.918,11.475,0.82,3.279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.091,4.959,3.306,5.785,1.653,4.132,4.132,9.091,1.653,2.479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
test_all(x_tap, tap_data["Y"], "pybiomed", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)

In [None]:
sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop(["Ab_ID", "name", "Y"], axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train.drop(["cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "pybiomed", EVAL_DIR, preprocessing="smote")

In [None]:
test_all(x_tap, tap_data["Y"], "pybiomed", EVAL_DIR, preprocessing="smote")

# Protparam

In [44]:
x_chen = pd.read_csv(path.join(DATA_DIR, "chen/protparam/protparam_features.csv"))
x_chen.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)

In [53]:
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,aa_percent0_x,aa_percent1_x,aa_percent2_x,aa_percent3_x,aa_percent4_x,aa_percent5_x,aa_percent6_x,aa_percent7_x,aa_percent8_x,...,mol_extinct1_y,mol_extinct2_y,mw_y,gravy_y,ss_faction1_y,ss_faction2_y,ss_faction3_y,name,Y,cluster_merged
0,3okd,0.081967,0.016393,0.040984,0.04918,0.040984,0.106557,0.008197,0.016393,0.040984,...,19940,20065,12297.7329,-0.427679,0.276786,0.294643,0.205357,3okd,1,1
1,5fb8,0.049587,0.016529,0.049587,0.041322,0.024793,0.07438,0.016529,0.033058,0.07438,...,15930,16055,12046.3419,-0.344144,0.279279,0.306306,0.225225,5fb8,1,0
2,4olz,0.056,0.032,0.048,0.04,0.032,0.088,0.008,0.032,0.04,...,21430,21555,11093.1436,-0.370297,0.306931,0.316832,0.158416,4olz,0,1
3,5i19,0.07563,0.016807,0.042017,0.042017,0.033613,0.134454,0.0,0.02521,0.033613,...,14440,14565,11531.7166,-0.316822,0.271028,0.327103,0.158879,5i19,1,0
4,2xqb,0.096,0.016,0.048,0.024,0.04,0.08,0.0,0.016,0.048,...,26930,27055,11672.8413,-0.599057,0.273585,0.330189,0.179245,2xqb,0,7


In [54]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Y", axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [57]:
try_all(x_chen_train.drop(["Ab_ID", "name", "cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "name", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "protparam", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data protparam 

Fitting 10 folds for each of 10 candidates, totalling 100 fits




KeyboardInterrupt: 

In [58]:
x_tap = pd.read_csv(path.join(DATA_DIR, "tap/protparam/protparam_features_tap.csv"))
x_tap.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
x_tap.head()

Unnamed: 0,Ab_ID,aa_percent0_x,aa_percent1_x,aa_percent2_x,aa_percent3_x,aa_percent4_x,aa_percent5_x,aa_percent6_x,aa_percent7_x,aa_percent8_x,...,instability_y,flexibility_y,isoelectric_y,mol_extinct1_y,mol_extinct2_y,mw_y,gravy_y,ss_faction1_y,ss_faction2_y,ss_faction3_y
0,Abagovomab,0.10084,0.016807,0.033613,0.033613,0.02521,0.117647,0.008403,0.016807,0.067227,...,53.693458,0.999564,7.973724,14440,14565,11556.8384,-0.257009,0.299065,0.299065,0.196262
1,Abituzumab,0.076271,0.016949,0.033898,0.050847,0.033898,0.101695,0.008475,0.025424,0.033898,...,42.514019,1.001379,8.586625,17420,17545,11762.9686,-0.452336,0.280374,0.299065,0.121495
2,Abrilumab,0.059322,0.016949,0.059322,0.050847,0.025424,0.101695,0.008475,0.016949,0.059322,...,40.151402,1.002818,7.970307,22460,22585,11548.7104,-0.335514,0.271028,0.336449,0.158879
3,Actoxumab,0.057377,0.016393,0.057377,0.032787,0.032787,0.114754,0.008197,0.032787,0.02459,...,51.517757,1.000328,8.682102,22460,22585,11530.7383,-0.260748,0.271028,0.317757,0.158879
4,Adalimumab,0.090909,0.016529,0.057851,0.041322,0.024793,0.090909,0.016529,0.024793,0.024793,...,46.585047,1.000522,9.428646,15930,16055,11664.9632,-0.402804,0.271028,0.28972,0.168224


In [59]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "protparam", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on protparam...


FileNotFoundError: [Errno 2] No such file or directory: '../../data/evaluations/2021-12-03/models/logistic_regression_protparam_over-sampling.pkl'

In [62]:
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)

In [63]:
sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop(["Ab_ID", "name", "Y"], axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [64]:
try_all(x_chen_train.drop(["cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "name", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "protparam", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data protparam 

Fitting 10 folds for each of 10 candidates, totalling 100 fits




KeyboardInterrupt: 

In [39]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "protparam", EVAL_DIR, preprocessing="smote")

Testing model logistic_regression on protparam...
Testing model random_forest on protparam...
Testing model gradient_boosting on protparam...
Testing model SVM on protparam...
Testing model multilayer_perceptron on protparam...


# BERT Embeddings

In [66]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/bert/bert_chen_embeddings.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2040,2041,2042,2043,2044,2045,2046,2047,Y,cluster_merged
0,1a0q,-0.017142,-0.008645,0.008613,-0.04673,0.061336,-0.017951,0.020446,-0.108641,0.002343,...,-0.037837,-0.009891,-0.018668,0.045293,-0.042225,-0.015849,-0.002904,-0.023377,1,2
1,1a14,-0.01677,-0.016028,0.005558,-0.022389,0.048435,-0.029777,0.005205,-0.091042,0.002346,...,-0.027305,0.012193,0.013407,0.019745,-0.029758,-0.021011,0.000632,-0.014885,0,4
2,1a2y,0.022052,-0.024678,0.013147,-0.041079,0.05456,-0.009407,-0.007112,-0.076221,0.010534,...,-0.031617,-0.013962,-0.021564,0.045667,-0.028034,-0.023921,-0.020638,-0.019502,0,1
3,1a3l,-0.004759,0.000985,-0.010184,-0.027024,0.073321,-0.015843,0.003232,-0.09023,0.003318,...,-0.016519,0.000595,-0.000173,0.037635,-0.030517,-0.020022,-0.004965,-0.012199,0,4
4,1a4j,-0.008561,-0.015271,0.014696,-0.026142,0.0753,-0.019657,0.03189,-0.106666,-0.001778,...,-0.020049,-0.007701,-0.031986,0.017794,-0.022749,-0.019076,-0.008999,-0.030474,0,7


In [67]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Y", axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train))

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [68]:
try_all(x_chen_train.drop(["Ab_ID", "name", "cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "name", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "bert", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data bert 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [69]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/bert/bert_tap_embeddings.ftr"))
x_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abagovomab,-0.004248,-0.024501,-0.01133,-0.02717,0.062747,-0.024793,0.009313,-0.083316,-0.005339,...,-0.008674,-0.002387,-0.021563,0.001087,-0.020986,0.047104,-0.038736,-0.02178,-0.022153,-0.024539
1,Abituzumab,0.006593,-0.013591,-0.008454,-0.043601,0.065095,-0.016896,-0.001596,-0.090935,-0.00294,...,0.000486,-0.013063,-0.021852,-0.003531,-0.02462,0.027387,-0.041001,-0.025708,-0.016437,-0.034342
2,Abrilumab,0.019445,-0.002642,-0.011395,-0.058757,0.060623,-0.015046,0.006317,-0.083772,-0.006775,...,0.005834,-0.017189,-0.014557,0.003359,-0.035368,0.020287,-0.033941,-0.023549,-0.00872,-0.044038
3,Actoxumab,-0.006365,-0.043729,0.005978,-0.025613,0.067748,-0.009542,0.017723,-0.096801,-0.007752,...,-2.4e-05,-0.012986,-0.019753,-0.004326,-0.044124,0.019544,-0.039559,-0.015679,-0.008837,-0.043877
4,Adalimumab,-0.012995,-0.035269,0.014127,-0.042136,0.080592,-0.012831,0.031889,-0.091308,-0.018166,...,-0.002067,-0.022957,-0.021145,-0.002762,-0.058199,0.026781,-0.046815,-0.01011,-0.008785,-0.041761


In [45]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "bert", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on bert...
Testing model random_forest on bert...
Testing model gradient_boosting on bert...
Testing model SVM on bert...
Testing model multilayer_perceptron on bert...


In [70]:
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop(["Ab_ID", "Y"], axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [48]:
try_all(x_chen_train.drop(["cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "bert", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
logistic_regression, bert
F1: 0.32075471698113206
MCC: 0.12922774279890484
Accuracy: 0.698744769874477
Precision: 0.2982456140350877
Recall: 0.3469387755102041
AUC: 0.5682062298603652
-----


Training model random_forest on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
random_forest, bert
F1: 0.24390243902439027
MCC: 0.09716594376718923
Accuracy: 0.7405857740585774
Precision: 0.30303030303030304
Recall: 0.20408163265306123
AUC: 0.541514500537057
-----


Training model gradient_boosting on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, bert
F1: 0.23076923076923075
MCC: 0.09694811588700399
Accuracy: 0.7489539748953975
Precision: 0.3103448275862069
Recall: 0.1836734693877551
AUC: 0.5392051557465091
-----


Training model SVM on data bert 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, bert
F1: 0.41818181818181815
MCC: 0.2494467057999306
Accuracy: 0.7322175732217573
Precision: 0.3770491803278688
Recall: 0.46938775510204084
AUC: 0.6346938775510205
-----


Training model multilayer_perceptron on data bert 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, bert
F1: 0.2727272727272727
MCC: 0.11230259609339714
Accuracy: 0.7322175732217573
Precision: 0.3076923076923077
Recall: 0.24489795918367346
AUC: 0.5513963480128894
-----


In [49]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "bert", EVAL_DIR, preprocessing="smote")

Testing model logistic_regression on bert...
Testing model random_forest on bert...
Testing model gradient_boosting on bert...
Testing model SVM on bert...
Testing model multilayer_perceptron on bert...


# SeqVec Embeddings

In [71]:
x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/seqvec/seqvec_chen_embeddings.ftr"))
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_valid = x_chen.merge(chen_valid[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
x_chen_train.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2040,2041,2042,2043,2044,2045,2046,2047,Y,cluster_merged
0,1a0q,0.073347,-0.186018,-0.233184,-0.165189,0.064193,-0.030149,0.043134,0.111651,-0.120727,...,0.138852,0.049886,-0.044254,-0.012993,-0.0321,-0.005154,-0.035896,-0.053616,1,2
1,1a14,0.066646,-0.216164,-0.275053,-0.124785,0.051032,-0.010701,0.042427,0.123428,-0.098271,...,0.141865,-0.114347,0.011103,-0.044319,-0.028007,0.020282,-0.050135,-0.095745,0,4
2,1a2y,0.08928,-0.191509,-0.165287,-0.087335,0.067743,-0.043121,0.027767,0.232575,-0.080467,...,0.216815,0.082564,0.016979,-0.003557,-0.081891,0.005624,-0.041633,-0.220336,0,1
3,1a3l,0.007294,-0.190514,-0.289291,-0.150268,0.056585,0.023869,-0.016217,0.094631,-0.208477,...,0.179691,-0.022877,0.00362,-0.055541,-0.035867,-0.030935,-0.006666,0.009036,0,4
4,1a4j,-0.016434,-0.144044,-0.292733,-0.124054,0.080313,-0.05148,0.000747,0.087235,-0.18232,...,0.267914,0.044216,0.020647,0.060805,-0.116175,0.111569,0.045578,-0.108075,0,7


In [72]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop("Y", axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [52]:
try_all(x_chen_train.drop(["Ab_ID", "cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop("Ab_ID", "Y", axis=1), x_chen_valid["Y"], x_chen_train["cluster_merged"], 
        "seqvec", EVAL_DIR, preprocessing="over-sampling")



Training model logistic_regression on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistic_regression, seqvec
F1: 0.32989690721649484
MCC: 0.15932930700455064
Accuracy: 0.7280334728033473
Precision: 0.3333333333333333
Recall: 0.32653061224489793
AUC: 0.5790547798066595
-----


Training model random_forest on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




random_forest, seqvec
F1: 0.3
MCC: 0.17411035250043286
Accuracy: 0.7656903765690377
Precision: 0.3870967741935484
Recall: 0.24489795918367346
AUC: 0.5724489795918368
-----


Training model gradient_boosting on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, seqvec
F1: 0.18666666666666665
MCC: 0.05556759229181543
Accuracy: 0.7447698744769874
Precision: 0.2692307692307692
Recall: 0.14285714285714285
AUC: 0.5214285714285714
-----


Training model SVM on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




SVM, seqvec
F1: 0.31683168316831684
MCC: 0.13410759024069596
Accuracy: 0.7112970711297071
Precision: 0.3076923076923077
Recall: 0.32653061224489793
AUC: 0.5685284640171857
-----


Training model multilayer_perceptron on data seqvec 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, seqvec
F1: 0.4036697247706422
MCC: 0.23181228637677556
Accuracy: 0.7280334728033473
Precision: 0.36666666666666664
Recall: 0.4489795918367347
AUC: 0.6244897959183674
-----


In [53]:
x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/seqvec/seqvec_tap_embeddings.ftr"))
x_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abagovomab,-0.004248,-0.024501,-0.01133,-0.02717,0.062747,-0.024793,0.009313,-0.083316,-0.005339,...,-0.008674,-0.002387,-0.021563,0.001087,-0.020986,0.047104,-0.038736,-0.02178,-0.022153,-0.024539
1,Abituzumab,0.006593,-0.013591,-0.008454,-0.043601,0.065095,-0.016896,-0.001596,-0.090935,-0.00294,...,0.000486,-0.013063,-0.021852,-0.003531,-0.02462,0.027387,-0.041001,-0.025708,-0.016437,-0.034342
2,Abrilumab,0.019445,-0.002642,-0.011395,-0.058757,0.060623,-0.015046,0.006317,-0.083772,-0.006775,...,0.005834,-0.017189,-0.014557,0.003359,-0.035368,0.020287,-0.033941,-0.023549,-0.00872,-0.044038
3,Actoxumab,-0.006365,-0.043729,0.005978,-0.025613,0.067748,-0.009542,0.017723,-0.096801,-0.007752,...,-2.4e-05,-0.012986,-0.019753,-0.004326,-0.044124,0.019544,-0.039559,-0.015679,-0.008837,-0.043877
4,Adalimumab,-0.012995,-0.035269,0.014127,-0.042136,0.080592,-0.012831,0.031889,-0.091308,-0.018166,...,-0.002067,-0.022957,-0.021145,-0.002762,-0.058199,0.026781,-0.046815,-0.01011,-0.008785,-0.041761


In [54]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "seqvec", EVAL_DIR, preprocessing="over-sampling")

Testing model logistic_regression on seqvec...
Testing model random_forest on seqvec...
Testing model gradient_boosting on seqvec...
Testing model SVM on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Testing model multilayer_perceptron on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
x_chen_train = x_chen.merge(chen_train[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train.drop(["Ab_ID", "Y"], axis=1), x_chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

Resampled dataset shape Counter({1: 1057, 0: 1057})


In [56]:
try_all(x_chen_train.drop(["Ab_ID", "cluster_merged"], axis=1), y_chen_train, 
        x_chen_valid.drop(["Ab_ID", "Y"], axis=1), x_chen_valid["Y"], 
        x_chen_train["cluster_merged"], "seqvec", EVAL_DIR, preprocessing="smote")



Training model logistic_regression on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




logistic_regression, seqvec
F1: 0.32653061224489793
MCC: 0.15284640171858216
Accuracy: 0.7238493723849372
Precision: 0.32653061224489793
Recall: 0.32653061224489793
AUC: 0.5764232008592911
-----


Training model random_forest on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


random_forest, seqvec
F1: 0.3146067415730337
MCC: 0.16100221685529514
Accuracy: 0.7447698744769874
Precision: 0.35
Recall: 0.2857142857142857
AUC: 0.5744360902255639
-----


Training model gradient_boosting on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, seqvec
F1: 0.2988505747126437
MCC: 0.14764022008576264
Accuracy: 0.7447698744769874
Precision: 0.34210526315789475
Recall: 0.2653061224489796
AUC: 0.5668635875402793
-----


Training model SVM on data seqvec 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
SVM, seqvec
F1: 0.36697247706422015
MCC: 0.18400975277535248
Accuracy: 0.7112970711297071
Precision: 0.3333333333333333
Recall: 0.40816326530612246
AUC: 0.5988184747583244
-----


Training model multilayer_perceptron on data seqvec 

Fitting 5 folds for each of 6 candidates, totalling 30 fits




multilayer_perceptron, seqvec
F1: 0.34615384615384615
MCC: 0.16555823204102568
Accuracy: 0.7154811715481172
Precision: 0.32727272727272727
Recall: 0.3673469387755102
AUC: 0.5863050483351236
-----


In [57]:
test_all(x_tap.drop("Ab_ID", axis=1), tap_data["Y"], "seqvec", EVAL_DIR, preprocessing="smote")

Testing model logistic_regression on seqvec...
Testing model random_forest on seqvec...
Testing model gradient_boosting on seqvec...
Testing model SVM on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Testing model multilayer_perceptron on seqvec...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# One-hot Encoding

In [None]:
x_heavy = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_heavy_one_hot.ftr")).set_index("index")
# rows 1921 and 2097 could not be encoded
x_light = pd.read_feather(path.join(DATA_DIR, "chen/abnumber/chen_light_one_hot.ftr")).set_index("Id")
x_chen = x_heavy.merge(x_light, left_index=True, right_index=True, suffixes=["_h", "_l"])
x_chen.index = x_heavy.index
train_idx = list(chen_train.index)
train_idx.remove(1921)
train_idx.remove(2097)
x_chen_train = x_chen.loc[train_idx]
x_chen_valid = x_chen.loc[chen_valid.index]
x_chen_train.head()

In [None]:
train_labels = chen_train.loc[train_idx]["Y"]

In [None]:
sampler = RandomOverSampler(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, train_labels) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
# add groups!!
try_all(x_chen_train.drop(["Ab_ID_h", "Ab_ID_l"], axis=1), y_chen_train, x_chen_valid.drop(["Ab_ID_h", "Ab_ID_l"], axis=1), chen_valid["Y"], "onehot", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_tap_heavy = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_heavy_one_hot.ftr"))
x_tap_light = pd.read_feather(path.join(DATA_DIR, "tap/abnumber/tap_light_one_hot.ftr"))
x_tap = x_heavy.merge(x_light, left_index=True, right_index=True, suffixes=["_h", "_l"])

In [None]:
test_all(x_tap, tap_data["Y"], "onehot", EVAL_DIR, preprocessing="over-sampling")

In [None]:
x_chen_train = x_chen.loc[chen_train.index]

sampler = SMOTE(random_state=42)

x_chen_train, y_chen_train = sampler.fit_resample(x_chen_train, chen_train["Y"]) 
print('Resampled dataset shape %s' % Counter(y_chen_train)) 

In [None]:
try_all(x_chen_train.drop("Ab_ID", axis=1), y_chen_train, x_chen_valid.drop("Ab_ID", axis=1), chen_valid["Y"], "onehot", EVAL_DIR, preprocessing="smote")

In [None]:
test_all(x_tap, tap_data["Y"], "onehot", EVAL_DIR, preprocessing="smote")