In [1]:
import pandas as pd
from os import path
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.utils.fixes import loguniform    
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneGroupOut
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle
import numpy as np
import os
import json
import csv

In [2]:
DATA_DIR = "../../data"

# Load Data

In [5]:
chen_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train.csv"), index_col=0)
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))

In [4]:
clusters = pd.read_csv(path.join(DATA_DIR, "chen/clustering.csv"), index_col=0)

In [5]:
cluster_assignment = {}
for i, cl in enumerate(clusters["0"]):
    cluster_assignment[cl] = cluster_assignment.get(cl, []) + [i]

In [6]:
seeds = [2, 13, 19, 27, 38, 42, 56, 63, 6, 78]

In [7]:
for seed in seeds:
    train_indices, test_indices = train_test_split(range(int(clusters.max())), test_size=0.2, random_state=seed)
    train_set = []
    for idx in train_indices:
        train_set += cluster_assignment[idx + 1]
    chen_train = chen_data.iloc[train_set]
    chen_train.to_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_train_{seed}.csv"))
    
    test_set = []
    for idx in test_indices:
        test_set += cluster_assignment[idx + 1]
    chen_test = chen_data.iloc[test_set]
    chen_test.to_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_test_{seed}.csv"))
    

# Training functions

In [8]:
def merge_clusters(train_df, cluster_df):
    df = train_df.merge(cluster_df, left_index=True, right_index=True).rename({"0": "cluster"}, axis=1)
    df["cluster_merged"] = df["cluster"]
    df["cluster_merged"][df["cluster"] < 300] = df["cluster"][df["cluster"] < 300] // 30
    df["cluster_merged"][df["cluster"] >= 300] = df["cluster"][df["cluster"] >= 300] // 100
    print(f'Unique clusters after merge: {df["cluster_merged"].nunique()}')
    return df

In [9]:
def knn(n):
    model = KNeighborsClassifier()  # default metric is Euclidean
    parameters = {'n_neighbors': [1,3,5]}
    return model, parameters, "kNN"

def logistic_regression(n):
    lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    parameters = {'C':loguniform(0.001, 1000), 'penalty': ["l2"], "solver": ["lbfgs", "sag"]}
    return lr, parameters, "logistic_regression"

def random_forest(n):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    parameters = {'n_estimators': np.arange(1, 200, 10), 'max_depth': np.arange(1, min(50,n), 2), 
                  'max_features': np.arange(0.1, 0.75, 0.05)}
    return rf, parameters, "random_forest"

def multilayer_perceptron(n):
    mlp = MLPClassifier(random_state=42, max_iter=int(1000))
    parameters = {'hidden_layer_sizes': [(100,), (50,), (50, 50), (100, 100)], "activation": ["relu", "logistic"]}
    return mlp, parameters, "multilayer_perceptron"

def svm(n):
    svc = SVC(max_iter=8000, probability=True, class_weight='balanced')
    parameters = {'C': loguniform(0.001, 100), 'kernel':["linear", "rbf"], 'gamma': loguniform(1e-3, 1e0)}
    return svc, parameters, "SVM"

def gradient_boosting(n):
    gb = GradientBoostingClassifier(random_state=42, n_iter_no_change=70)
    parameters = {'learning_rate': loguniform(0.01, 0.5), 
                  'n_estimators': np.arange(1, 200, 10), 
                  'max_depth': np.arange(1, min(20,n), 2), 
                  'max_features': np.arange(0.1, 0.6, 0.1)}
    return gb, parameters, "gradient_boosting"

In [10]:
def output_evaluation(model_type, params, best_params, metrics, data, outpath, preprocessing):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = os.path.join(DATA_DIR, "evaluations", outpath, f"{model_type}_{data}{prepro}.json")
    out_dict = {
        "model_type": model_type,
        "data": data
    }
    out_dict["params"] = {}
    out_dict["best_params"] = {}
    for key, value in params.items():
        out_dict["params"][key] = str(value)
    for key, value in best_params.items():
        out_dict["best_params"][key] = str(value)
    out_dict["metrics"] = metrics
    out_dict["preprocessing"] = "none" if preprocessing is None else preprocessing
    
    json.dump(out_dict, open(filename, "w"))
    
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/all.csv")
    line = [model_type, data, out_dict["preprocessing"], metrics["f1"], metrics["mcc"], metrics["acc"],metrics["precision"],metrics["recall"],metrics["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [11]:
def train_and_eval(model_name, classifier, parameters, X_train, y_train, X_valid, y_valid, groups,
                   data_name, outpath, preprocessing):
    splitter = LeaveOneGroupOut()
    split = splitter.split(X_train, y_train, groups=groups)
    grid = RandomizedSearchCV(classifier, parameters, verbose=0, scoring="f1", cv=split)
    grid.fit(X_train, y_train)
    estimator = grid.best_estimator_
    best_params = grid.best_params_
    y_pred = estimator.predict(X_valid)
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}_{preprocessing}.pkl")
    with open(filename, 'wb') as f:
        pickle.dump(estimator, f)
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    
    print(f"{model_name}, {data_name}, {preprocessing}")
    print(f"F1: {metric_dict['f1']}")
    print(f"MCC: {metric_dict['mcc']}")
    print(f"Accuracy: {metric_dict['acc']}")
    print(f"Precision: {metric_dict['precision']}")
    print(f"Recall: {metric_dict['recall']}")
    print(f"AUC: {metric_dict['auc']}")
    print(f"-----")
    
    output_evaluation(model_name, parameters, best_params, metric_dict, data_name, outpath, preprocessing)

In [12]:
def try_all(X_train, y_train, X_valid, y_valid, groups, data_name, outpath, preprocessing, models):
    n = len(y_train)
    for model_creator in models:
    #for model_creator in [logistic_regression, random_forest]:

        classifier, params, model_label = model_creator(n)
        print("\n")
        print(f'Training model {model_label} on data {data_name} with preprocessing {preprocessing} \n')
        train_and_eval(model_label, classifier, params, X_train, y_train, X_valid, y_valid, groups,
                   data_name, outpath, preprocessing)

In [13]:
def test_on_tap(model_name, x_test, y_test,
                   data_name, outpath, preprocessing=None):
    prepro = "_"+preprocessing if preprocessing is not None else ""
    filename = path.join(DATA_DIR, "evaluations", outpath, "models", f"{model_name}_{data_name}{prepro}.pkl")
    with open(filename, 'rb') as f:
        estimator = pickle.load(f)
    y_pred = estimator.predict(x_test)
    metric_dict = {
        "f1": float(metrics.f1_score(y_test, y_pred)),
        "acc": float(metrics.accuracy_score(y_test, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_test, y_pred)),
        "auc": float(metrics.roc_auc_score(y_test, y_pred)),
        "precision": float(metrics.precision_score(y_test, y_pred)),
        "recall": float(metrics.recall_score(y_test, y_pred))
    }
    filename_sum = os.path.join(DATA_DIR, f"evaluations/{outpath}/tap.csv")
    line = [model_name, data_name, prepro, metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"], filename]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [14]:
def test_all(x_test, y_test, data_name, outpath, preprocessing):
    for model in ["logistic_regression", "random_forest", "gradient_boosting", "SVM", "multilayer_perceptron"]:
        print(f"Testing model {model} on {data_name} with preprocessing {preprocessing}...")
        test_on_tap(model, x_test, y_test, data_name, outpath, preprocessing=preprocessing)

## Loading data representations

In [15]:
def integer_encoded(train_df, test_df):
    x_chen = pd.read_csv(path.join(DATA_DIR, "chen/integer_encoding/chen_integer_encoded.csv"), index_col=0)
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_tap = pd.read_csv(path.join(DATA_DIR, "tap/integer_encoding/tap_integer_encoded.csv"))
    x_tap.drop("Ab_ID", axis=1, inplace=True)
    return x_chen_train, x_chen_test, x_tap

In [16]:
def pybiomed(train_df, test_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))
    return x_chen_train, x_chen_test, x_tap

In [60]:
def protparam(train_df, test_df):
    x_chen = pd.read_csv(path.join(DATA_DIR, "chen/protparam/protparam_features.csv"))
    x_chen.rename({"Unnamed: 0": "Ab_ID"}, axis=1, inplace=True)
    x_chen = x_chen.drop("name", axis=1)
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    
    x_tap = pd.read_csv(path.join(DATA_DIR, "tap/protparam/protparam_features_tap.csv"))
    x_tap = x_tap.drop("Unnamed: 0", axis=1)
    return x_chen_train.sort_index(), x_chen_test.sort_index(), x_tap.sort_index()

In [30]:
def bert(train_df, test_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/bert/bert_chen_embeddings.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/bert/bert_tap_embeddings.ftr"))
    x_tap = x_tap.drop("Ab_ID", axis=1)
    return x_chen_train, x_chen_test, x_tap

In [19]:
def seqvec(train_df, test_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/seqvec/seqvec_chen_embeddings.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y", "cluster_merged"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]], left_on="Ab_ID", right_on="Antibody_ID").drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/seqvec/seqvec_tap_embeddings.ftr"))
    x_tap = x_tap.drop("Ab_ID", axis=1)
    return x_chen_train, x_chen_test, x_tap

In [1]:
def sapiens(train_df, test_df):
    x_chen = pd.read_csv(path.join(DATA_DIR, "chen/embeddings/sapiens/sapiens_chen_embeddings.csv"), index_col=0).drop("Y", axis=1)
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_tap = pd.read_csv(path.join(DATA_DIR, "tap/embeddings/sapiens/sapiens_tap_embeddings.csv"), index_col=0)
    x_tap = x_tap.drop(["Ab_ID", "Y"], axis=1)
    return x_chen_train, x_chen_test, x_tap

In [3]:
train_4 = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/crossval/chen_4_a.csv"), index_col=0).sort_index()
test_4 = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/crossval/chen_4_b.csv"), index_col=0).sort_index()
train_4.head()

Unnamed: 0,Antibody_ID,heavy,light,Y,cluster
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1,102
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0,59
5,1a3l,EVQLEESGPELVRPGTSVKISCKASGYTFTNYWLGWVKQRPGHGFE...,DIVLTQAAFSNPVTLGASASISCRSSKSLLNSNGIIHMYWYLQKPG...,0,477
7,1a4j,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLVHSNGNTYLHWYLQKPG...,0,230
8,1a4k,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,0,230


In [9]:
len(test_4)

654

In [6]:
tr, te, tap = onehot(train_4, test_4)

In [7]:
te

Unnamed: 0_level_0,Ab_ID,1_-_x,1_A_x,1_C_x,1_D_x,1_E_x,1_F_x,1_G_x,1_H_x,1_I_x,...,127_N_y,127_P_y,127_Q_y,127_R_y,127_S_y,127_T_y,127_V_y,127_W_y,127_Y_y,Y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12e8,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1a14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,1a6u,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,1a6v,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,1ad0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2388,6phc,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2389,6phg,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2391,6pxg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2404,6s5a,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
len(te)

653

In [13]:
sapiens_chen = pd.read_csv(path.join(DATA_DIR, "chen/embeddings/sapiens/sapiens_chen_embeddings.csv"), index_col=0).drop("Y", axis=1)


In [24]:
#sapiens_chen.merge(test_4[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
test_4.reset_index().merge(sapiens_chen,indicator = True, how='left', right_on="Ab_ID", left_on="Antibody_ID").loc[lambda x : x['_merge']!='both']

Unnamed: 0,index,Antibody_ID,heavy,light,Y,cluster,Ab_ID,0,1,2,...,119.1,120.1,121.1,122.1,123.1,124.1,125.1,126.1,127.1,_merge
37,1237,4k3e,EVQLRESGPSLVQPSQTLSLTCTASGFSLSDKAVGWVRQAPGKALE...,EAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,1,715,,,,,...,,,,,,,,,,left_only
40,1236,4k3d,EVQLRESGPSLVKPSQTLSLTCTASGFSLSDKAVGWVRQAPGKALE...,EAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,0,893,,,,,...,,,,,,,,,,left_only
212,1631,5e99,QVQLRESGPSLVKPSQTLSLTCTASGFSLSDKAVGWVRQAPGKALE...,QAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,1,330,,,,,...,,,,,,,,,,left_only
254,1356,4odh,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPK...,0,446,,,,,...,,,,,,,,,,left_only
264,1354,4od1,VQLVESGGGVVQPGKSLRLSCAASRFSFNRYGMHWVRQAPGKGLEW...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNFVSWYQQRPGTAPK...,0,69,,,,,...,,,,,,,,,,left_only
265,1355,4od3,EVQLVESGGGVVQPGRSLRLSCVGSQFSFNRYGMHWVRQAPGKGLE...,QAVLTQPPSVSAAPGQNVTISCSGSGSNIGNNFVSWYQQRPGTAPK...,0,69,,,,,...,,,,,,,,,,left_only
471,1749,5ijv,QVQLRESGPSLVKPSQTLSLTCTASGFSLSDKAVGWVRQAPGKALE...,QAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,1,403,,,,,...,,,,,,,,,,left_only
500,2216,6e9g,QVQLRESGPSLVKPSQTLSLTCTASGFSLSDKAVGWVRQAPGKALE...,EAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,1,863,,,,,...,,,,,,,,,,left_only
600,1752,5ilt,QVQLRESGPSLVKPSQTLSLTCTASGFSLSDKAVGWVRQAPEKALE...,QAVLNQPSSVSGSLGQRVSITCSGSSSNVGNGYVSWYQLIPGSAPR...,1,455,,,,,...,,,,,,,,,,left_only


In [28]:
test_4.reset_index().merge(sapiens_chen,indicator = True, how='left', right_on="Ab_ID", left_on="Antibody_ID").loc[lambda x : x['_merge']=='left_only']["index"]

37     1237
40     1236
212    1631
254    1356
264    1354
265    1355
471    1749
500    2216
600    1752
Name: index, dtype: int64

In [11]:
# onehot
test_4.reset_index().merge(te,indicator = True, how='left', right_on="Ab_ID", left_on="Antibody_ID").loc[lambda x : x['_merge']!='both']

Unnamed: 0,index,Antibody_ID,heavy,light,Y_x,cluster,Ab_ID,1_-_x,1_A_x,1_C_x,...,127_P_y,127_Q_y,127_R_y,127_S_y,127_T_y,127_V_y,127_W_y,127_Y_y,Y_y,_merge
507,1921,5ubz,QVQLQESGPRLVKPSDTLSLTCTVSGGSITSDSHYWGWVRQSPGKG...,QAVVTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGLAPK...,0,823,,,,,...,,,,,,,,,,left_only


In [5]:
def onehot(train_df, test_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/onehot/chen_onehot_short.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/onehot/tap_onehot_short.ftr"))
    x_tap = x_tap.drop(["Ab_ID"], axis=1)
    return x_chen_train, x_chen_test, x_tap

## Preprocessing

In [20]:
def no_prepro(train_df, test_df, tap_df):
    return train_df.drop("Y", axis=1), train_df["Y"], test_df, tap_df

In [21]:
def scaling(train_df, test_df, tap_df):
    scaler = StandardScaler()
    scaler.fit(train_df.drop(["Ab_ID", "Y", "cluster_merged"], axis=1))
    x_train_tr = scaler.transform(train_df.drop(["Ab_ID", "Y", "cluster_merged"], axis=1))
    x_train_df = pd.DataFrame(data=train_df,  index=train_df.index, columns=train_df.drop(["Ab_ID", "Y", "cluster_merged"], axis=1).columns)
    x_train_df["cluster_merged"] = train_df["cluster_merged"]
    x_train_df["Ab_ID"] = train_df["Ab_ID"]
    
    x_test_tr = scaler.transform(test_df.drop(["Ab_ID", "Y"], axis=1))
    x_test_df = pd.DataFrame(data=test_df,  index=test_df.index, columns=test_df.drop(["Ab_ID", "Y"], axis=1).columns)
    x_test_df["Y"] = test_df["Y"]
    x_test_df["Ab_ID"] = test_df["Ab_ID"]
    
    x_tap_tr = scaler.transform(tap_df)
    x_tap_df = pd.DataFrame(data=tap_df,  index=tap_df.index, columns=tap_df.columns)

    return x_train_df, train_df["Y"], x_test_df, x_tap_df

In [22]:
def oversampling(train_df, test_df, tap_df):
    sampler = RandomOverSampler(random_state=42)
    x_train, y_train = sampler.fit_resample(train_df.drop("Y", axis=1), train_df["Y"])
    return x_train, y_train, test_df, tap_df

In [23]:
def smote_os(train_df, test_df, tap_df):
    sampler = SMOTE(random_state=42)
    x_train_tr, y_train = sampler.fit_resample(train_df.drop(["Ab_ID", "Y"], axis=1), train_df["Y"])
    x_train_tr["Ab_ID"] = ""
    return x_train_tr, y_train, test_df, tap_df

In [24]:
def undersampling(train_df, test_df, tap_df):
    sampler = RandomUnderSampler(random_state=42)
    x_train, y_train = sampler.fit_resample(train_df.drop("Y", axis=1), train_df["Y"]) 
    return x_train, y_train, test_df, tap_df

## All together

In [25]:
data_loaders = [integer_encoded, pybiomed, protparam, bert, seqvec, sapiens, onehot]
model_creators = [knn, logistic_regression, random_forest, multilayer_perceptron, svm, gradient_boosting]
preprocessing = [no_prepro, scaling, oversampling, smote_os, undersampling]

In [26]:
def crossval_round(train_df, test_df, eval_dir):
    for prepro in preprocessing:
        prepro_name = prepro.__name__
        for data_rep in data_loaders:
            data_name = data_rep.__name__
            x_train, x_test, x_tap = data_rep(train_df, test_df)
            x_train_tr, y_train_tr, x_test_tr, tap_tr = prepro(x_train, x_test, x_tap)
            
            try_all(x_train_tr.drop(["Ab_ID", "cluster_merged"], axis=1), y_train_tr, 
                x_test_tr.drop(["Ab_ID", "Y"], axis=1), x_test_tr["Y"], 
                x_train_tr["cluster_merged"], data_name, eval_dir, prepro_name, model_creators)
            
            test_all(tap_tr, tap_data["Y"], data_name, eval_dir, prepro_name)

In [None]:
chen_filtered = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/filtered_chen_data.csv"), index_col=0)
for seed in seeds:
    chen_train = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_train_{seed}.csv"), index_col=0)
    chen_train = merge_clusters(chen_train, clusters)
    chen_train = chen_train.merge(chen_filtered[["Antibody_ID"]], on="Antibody_ID")
    
    chen_test = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_test_{seed}.csv"), index_col=0)
    eval_dir = f"10-fold-cross-val/training_split_{seed}"
    try:
        os.mkdir(os.path.join(DATA_DIR, f"evaluations/{eval_dir}"))
        os.mkdir(os.path.join(DATA_DIR, f"evaluations/{eval_dir}/models"))
    except:
        pass
    crossval_round(chen_train, chen_test, eval_dir)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unique clusters after merge: 10


Training model kNN on data integer_encoded with preprocessing no_prepro 

kNN, integer_encoded, no_prepro
F1: 0.22222222222222224
MCC: -0.007061249691130693
Accuracy: 0.6455696202531646
Precision: 0.21621621621621623
Recall: 0.22857142857142856
AUC: 0.4963995354239257
-----


Training model logistic_regression on data integer_encoded with preprocessing no_prepro 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistic_regression, integer_encoded, no_prepro
F1: 0.391304347826087
MCC: 0.17053871945589077
Accuracy: 0.6455696202531646
Precision: 0.3157894736842105
Recall: 0.5142857142857142
AUC: 0.5986062717770034
-----


Training model random_forest on data integer_encoded with preprocessing no_prepro 

random_forest, integer_encoded, no_prepro
F1: 0.24203821656050956
MCC: -0.0046430622913821665
Accuracy: 0.6234177215189873
Precision: 0.21839080459770116
Recall: 0.2714285714285714
AUC: 0.49750290360046456
-----


Training model multilayer_perceptron on data integer_encoded with preprocessing no_prepro 





multilayer_perceptron, integer_encoded, no_prepro
F1: 0.2923076923076923
MCC: 0.11092370107563784
Accuracy: 0.7088607594936709
Precision: 0.31666666666666665
Recall: 0.2714285714285714
AUC: 0.5523809523809524
-----


Training model SVM on data integer_encoded with preprocessing no_prepro 



  "X does not have valid feature names, but"


SVM, integer_encoded, no_prepro
F1: 0.41530054644808745
MCC: 0.20619025693587661
Accuracy: 0.6613924050632911
Precision: 0.336283185840708
Recall: 0.5428571428571428
AUC: 0.6189895470383273
-----


Training model gradient_boosting on data integer_encoded with preprocessing no_prepro 



  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

gradient_boosting, integer_encoded, no_prepro
F1: 0.12500000000000003
MCC: 0.006669777491504802
Accuracy: 0.7341772151898734
Precision: 0.23076923076923078
Recall: 0.08571428571428572
AUC: 0.5022067363530778
-----
Testing model logistic_regression on integer_encoded with preprocessing no_prepro...
Testing model random_forest on integer_encoded with preprocessing no_prepro...
Testing model gradient_boosting on integer_encoded with preprocessing no_prepro...
Testing model SVM on integer_encoded with preprocessing no_prepro...
Testing model multilayer_perceptron on integer_encoded with preprocessing no_prepro...


Training model kNN on data pybiomed with preprocessing no_prepro 





kNN, pybiomed, no_prepro
F1: 0.3165467625899281
MCC: 0.12386704697272526
Accuracy: 0.6993670886075949
Precision: 0.3188405797101449
Recall: 0.3142857142857143
AUC: 0.5616144018583044
-----


Training model logistic_regression on data pybiomed with preprocessing no_prepro 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [10]:
chen_train = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_train_2.csv"), index_col=0)
chen_train = merge_clusters(chen_train, clusters)
#chen_train = chen_train.merge(chen_filtered[["Antibody_ID"]], on="Antibody_ID")
chen_test = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_test_2.csv"), index_col=0)

Unique clusters after merge: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
