In [42]:
import pandas as pd
import numpy as np
import json
import csv
import os
from sklearn import metrics

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from sklearn.linear_model import LogisticRegression
from sklearn.utils.fixes import loguniform    
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statistics import mean

In [11]:
DATA_DIR = "../../data"  

In [3]:
def logistic_regression(n):
    lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    parameters = {'C':loguniform(0.001, 1000), 'penalty': ["l2"], "solver": ["lbfgs", "sag"]}
    return lr, parameters, "logistic_regression"

In [4]:
def random_forest(n):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
    parameters = {'n_estimators': np.arange(1, 200, 10), 'max_depth': np.arange(1, min(50,n), 2), 
                  'max_features': np.arange(0.1, 0.75, 0.05)}
    return rf, parameters, "random_forest"

In [5]:
def multilayer_perceptron(n):
    mlp = MLPClassifier(random_state=42, max_iter=int(1000))
    parameters = {'hidden_layer_sizes': [(100,), (50,), (100, 100)], "activation": ["relu", "logistic"]}
    return mlp, parameters, "multilayer_perceptron"

In [44]:
def train_and_eval(model_name, classifier, parameters, X_train, y_train, X_valid, y_valid, 
                   data_name, preprocessing=None):
    grid = RandomizedSearchCV(classifier, parameters, verbose=1, scoring="f1")
    grid.fit(X_train, y_train)
    estimator = grid.best_estimator_
    best_params = grid.best_params_
    y_pred = estimator.predict(X_valid)
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    
    print(f"{model_name}, {data_name}")
    print(f"F1: {metric_dict['f1']}")
    print(f"MCC: {metric_dict['mcc']}")
    print(f"Accuracy: {metric_dict['acc']}")
    print(f"Precision: {metric_dict['precision']}")
    print(f"Recall: {metric_dict['recall']}")
    print(f"AUC: {metric_dict['auc']}")
    print(f"-----")
    
    return metric_dict["mcc"]

In [34]:
def test_data(X_data, y_data, data_name):
    X_train, X_valid, y_train, y_valid = train_test_split(X_data, y_data, test_size=0.3, random_state=42)
    n = len(y_train)
    classifier, params, model_label = logistic_regression(n)
    lr_mcc = train_and_eval("logistic regression", classifier, params, X_train, y_train, X_valid, y_valid, 
                   data_name)
    
    classifier, params, model_label = random_forest(n)
    rf_mcc = train_and_eval("random forest", classifier, params, X_train, y_train, X_valid, y_valid, 
                   data_name)
    
    classifier, params, model_label = multilayer_perceptron(n)
    mlp_mcc = train_and_eval("multilayer perceptron", classifier, params, X_train, y_train, X_valid, y_valid, 
                   data_name)
    
    max_mcc = max(lr_mcc, rf_mcc, mlp_mcc)
    avg_mcc = mean([lr_mcc, rf_mcc, mlp_mcc])
    
    print(f"Max MCC: {max_mcc}")
    print(f"Avg MCC: {avg_mcc}")

In [36]:
data = pd.read_csv(os.path.join(DATA_DIR, "chen/integer_encoding/chen_integer_encoded.csv"))
data.head()

Unnamed: 0.1,Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,...,271,272,273,274,275,276,277,278,279,280
0,0,12e8,4,18,14,10,14,14,16,6,...,0,0,0,0,0,0,0,0,0,0
1,1,15c8,4,18,14,10,14,14,16,6,...,0,0,0,0,0,0,0,0,0,0
2,2,1a0q,4,18,14,10,14,4,16,3,...,0,0,0,0,0,0,0,0,0,0
3,3,1a14,14,18,14,10,14,14,16,6,...,0,0,0,0,0,0,0,0,0,0
4,4,1a2y,14,18,14,10,14,4,16,6,...,0,0,0,0,0,0,0,0,0,0


In [37]:
data.drop("Unnamed: 0", axis=1, inplace=True)
data.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,271,272,273,274,275,276,277,278,279,280
0,12e8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
1,15c8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
2,1a0q,4,18,14,10,14,4,16,3,1,...,0,0,0,0,0,0,0,0,0,0
3,1a14,14,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
4,1a2y,14,18,14,10,14,4,16,6,13,...,0,0,0,0,0,0,0,0,0,0


In [17]:
chen_data = pd.read_csv(os.path.join(DATA_DIR, "chen/chen_data.csv"))
chen_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0


In [45]:
test_data(data.drop("Ab_ID", axis=1), chen_data["Y"], "Simple numbering")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
logistic regression, Simple numbering
F1: 0.5204081632653061
MCC: 0.36883134829018427
Accuracy: 0.739972337482711
Precision: 0.42857142857142855
Recall: 0.6623376623376623
AUC: 0.7116609225572319
-----
Fitting 5 folds for each of 10 candidates, totalling 50 fits
random forest, Simple numbering
F1: 0.5909090909090908
MCC: 0.5132724650605403
Accuracy: 0.8506224066390041
Precision: 0.7090909090909091
Recall: 0.5064935064935064
AUC: 0.7251272453381417
-----
Fitting 5 folds for each of 6 candidates, totalling 30 fits
multilayer perceptron, Simple numbering
F1: 0.5468164794007491
MCC: 0.4551980246229353
Accuracy: 0.8326417704011065
Precision: 0.6460176991150443
Recall: 0.474025974025974
AUC: 0.7018636021272224
-----
Max MCC: 0.5132724650605403
Avg MCC: 0.4457672793245533


In [24]:
len(chen_data)

2409

In [52]:
data = pd.read_csv(os.path.join(DATA_DIR, "chen/integer_encoding/chen_integers_from_anarci.csv"))
data.head()

Unnamed: 0.1,Unnamed: 0,Ab_ID,1,2,3,3A,4,4A,5,6,...,118.1,119.1,120.1,121.1,122.1,123.1,124.1,125.1,126.1,127.1
0,0,12e8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,...,5,6,1,6,17,9,10,4,10,9
1,1,15c8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,...,5,6,6,6,17,9,10,4,8,9
2,2,1a0q,4.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,...,5,6,6,6,17,9,10,4,8,9
3,3,1a14,14.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,...,5,6,6,6,17,21,21,21,21,21
4,4,1a2y,14.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,...,5,6,6,6,17,9,10,4,8,9


In [53]:
data.drop("Unnamed: 0", axis=1, inplace=True)
data.dropna(inplace=True)
data.head()

Unnamed: 0,Ab_ID,1,2,3,3A,4,4A,5,6,7,...,118.1,119.1,120.1,121.1,122.1,123.1,124.1,125.1,126.1,127.1
0,12e8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,1,6,17,9,10,4,10,9
1,15c8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,6,6,17,9,10,4,8,9
2,1a0q,4.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,16.0,...,5,6,6,6,17,9,10,4,8,9
3,1a14,14.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,6,6,17,21,21,21,21,21
4,1a2y,14.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,16.0,...,5,6,6,6,17,9,10,4,8,9


In [57]:
test_data(data.drop("Ab_ID", axis=1), chen_data.loc[data.index]["Y"], "ANARCI numbering")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
logistic regression, ANARCI numbering
F1: 0.5219512195121953
MCC: 0.37681271399049737
Accuracy: 0.7285318559556787
Precision: 0.4115384615384615
Recall: 0.7133333333333334
AUC: 0.7229254079254079
-----
Fitting 5 folds for each of 10 candidates, totalling 50 fits
random forest, ANARCI numbering
F1: 0.5376344086021506
MCC: 0.3986065840991702
Accuracy: 0.7617728531855956
Precision: 0.45045045045045046
Recall: 0.6666666666666666
AUC: 0.7266899766899766
-----
Fitting 5 folds for each of 6 candidates, totalling 30 fits
multilayer perceptron, ANARCI numbering
F1: 0.0
MCC: 0.0
Accuracy: 0.7922437673130194
Precision: 0.0
Recall: 0.0
AUC: 0.5
-----
Max MCC: 0.3986065840991702
Avg MCC: 0.25847309936322255


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [58]:
data = pd.read_csv(os.path.join(DATA_DIR, "chen/integer_encoding/chen_integer_encoded_separate.csv"))
data.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,280,281,282,283,284,285,286,287,288,289
0,12e8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
1,15c8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
2,1a0q,4,18,14,10,14,4,16,3,1,...,0,0,0,0,0,0,0,0,0,0
3,1a14,14,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
4,1a2y,14,18,14,10,14,4,16,6,13,...,0,0,0,0,0,0,0,0,0,0


In [59]:
test_data(data.drop("Ab_ID", axis=1), chen_data["Y"], "H and L separate numbering")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
logistic regression, H and L separate numbering
F1: 0.5444743935309972
MCC: 0.40376474790820144
Accuracy: 0.7662517289073306
Precision: 0.46543778801843316
Recall: 0.6558441558441559
AUC: 0.725988861753361
-----
Fitting 5 folds for each of 10 candidates, totalling 50 fits
random forest, H and L separate numbering
F1: 0.5970149253731343
MCC: 0.5164817993091685
Accuracy: 0.8506224066390041
Precision: 0.7017543859649122
Recall: 0.5194805194805194
AUC: 0.7298632825873599
-----
Fitting 5 folds for each of 6 candidates, totalling 30 fits
multilayer perceptron, H and L separate numbering
F1: 0.5203252032520326
MCC: 0.45012595917001846
Accuracy: 0.8367911479944675
Precision: 0.6956521739130435
Recall: 0.4155844155844156
AUC: 0.6831876383721727
-----
Max MCC: 0.5164817993091685
Avg MCC: 0.4567908354624628
