# Import packages

In [1]:
import pandas as pd

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, GridSearchCV, \
    RepeatedStratifiedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    r2_score, make_scorer, recall_score, accuracy_score, f1_score, \
    precision_score, balanced_accuracy_score, roc_curve, auc

from collinearity import SelectNonCollinear

# Custom functions

In [2]:
from pickle_managment import save_pickle

# Regression

In [None]:
# Load the regression training set.
# The relative path uses forward slashes so that it resolves on Windows,
# Linux and macOS alike; a backslash-separated path only works on Windows.
regression_df_expanded_cleaned = pd.read_csv(
    'datasets/train_datasets/regression_df_expanded_cleaned_train.csv.zip'
)
regression_df_expanded_cleaned

In [None]:
# Data pre-processing

# Features: every column except the SMILES string and the logBB target.
regression_X = regression_df_expanded_cleaned.drop(
    columns=['SMILES', 'logBB'],
    errors='ignore'
)

# Target: the logBB column.
regression_y = regression_df_expanded_cleaned['logBB']

# One-step pipeline that standardises the features; set_output makes the
# scaler hand back a DataFrame instead of a bare array.
data_processing_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler().set_output(transform='pandas')),
    ]
)

# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split performed in the next cell, so the held-out rows
# contribute to the scaling statistics. Confirm this is intended.
regression_X_processed = data_processing_pipeline.fit_transform(regression_X)
regression_X_processed

In [None]:


# Hold out 20 % of the rows; the fixed random_state makes the shuffled
# split reproducible from run to run.
(
    regression_X_train,
    regression_X_test,
    regression_y_train,
    regression_y_test,
) = train_test_split(
    regression_X_processed,
    regression_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

## SVM

### Model training

In [None]:
# Wall-clock timing of the whole search.
start_time = datetime.now()

svr_model = SVR()

# Hyper-parameter grid: 1 kernel x 2 gamma values x 2 C values = 4 candidates.
svr_regressor_param_grid = {
    'kernel': ['rbf'],  # Only rbf for now; poly and sigmoid are still to be tried
    'gamma': ['auto', 3],  # Kernel coefficient, relevant for the rbf kernel
    'C': [1, 5],  # Regularization parameter
}

# scikit-learn exposes the two error metrics only as negated scorers
# (so that "higher is better" holds); R2 is used as it is.
svr_regressor_scoring = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'R2': 'r2',
}

svr_regressor_grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=svr_regressor_param_grid,
    cv=8,  # Number of cross-validation folds (author's note: should be 8 or 10)
    scoring=svr_regressor_scoring,
    refit='R2',  # The final estimator is refitted using the best R2 candidate
    # n_jobs=-1 would use all processors but, per the author's note, no
    # training messages are printed then; 1 keeps the messages visible.
    n_jobs=1,
    verbose=10,  # Detailed progress messages
)

svr_regressor_grid_search.fit(regression_X_train, regression_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

In [None]:
# Turn the grid-search results into a DataFrame.
svr_regressor_results_df = pd.DataFrame(svr_regressor_grid_search.cv_results_)

# Discard the timing, per-split and standard-deviation columns (they aren't
# very interesting), then order the candidates by their R2 rank.
svr_regressor_results_df = (
    svr_regressor_results_df
    .drop(
        columns=svr_regressor_results_df.filter(regex='time|split|std').columns
    )
    .sort_values(by='rank_test_R2')
)
svr_regressor_results_df

In [None]:
# Persist the ranked grid-search table (without the DataFrame index).
# Forward slashes keep the relative path valid on every operating system;
# the backslash form only resolves on Windows.
svr_regressor_results_df.to_csv(
    'model_grid_search/svm_regressor_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted for the best candidate and
# pickle it. Forward slashes keep the relative path portable across
# operating systems (the backslash form only resolves on Windows).
best_svr_regressor = svr_regressor_grid_search.best_estimator_
save_pickle(
    best_svr_regressor,
    'model_pickles/best_svm_regressor.pkl'
)
# To load this best model again, use
# load_pickle('model_pickles/best_svm_regressor.pkl')
# NOTE(review): load_pickle is not imported in this notebook; presumably it
# lives next to save_pickle in pickle_managment - confirm.

### Model interpretation

# Classification

## SVM

### Model training

#### Balanced by centroid method

In [3]:
# Load the classification set that was balanced by the centroid method.
classification_centroid_df = pd.read_csv(
    filepath_or_buffer="datasets/balanced_datasets/BBB_classification_balanced_centroid.csv.zip"
)
classification_centroid_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@H](NC(=...,BBB+,9.095732,9.095732,0.282774,-0.017526,0.586761,11.860043,358.488,332.280,...,0.333333,1.0,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,CCCCC[C@H](N)C(O)(c1ccccc1)c1ccccc1,BBB+,15.143070,15.143070,0.021937,-1.615734,0.147925,26.905882,1202.635,1090.747,...,1.000000,1.0,1.000000,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,Cc1nnc(SCC2=C(C(=O)O)N3C(=O)[C@@H](NC(=O)[C@H]...,BBB+,15.022344,15.022344,0.036460,-1.947471,0.109538,42.500000,1085.156,1008.548,...,1.000000,0.0,1.000000,1.0,0.0,1.0,1.0,1.0,1.0,0.0
3,C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C(Cl)=...,BBB+,14.543654,14.543654,0.006968,-2.468650,0.116173,36.726027,1030.303,942.607,...,1.000000,1.0,1.000000,1.0,1.0,0.0,1.0,1.0,1.0,0.0
4,CC(C)=CCN1Cc2cc(Cl)cc3[nH]c(=S)n(c23)C[C@@H]1C,BBB+,14.995286,14.995286,0.035518,-1.944947,0.109538,42.500000,1085.156,1008.548,...,1.000000,0.0,1.000000,1.0,0.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127,CC1CC2C(C(O)CC3(C)C2CCC3(O)C(=O)CO)C2(C)C=CC(=...,BBB-,12.428253,12.428253,0.083608,-1.721468,0.122643,47.948718,569.609,526.265,...,1.000000,1.0,1.000000,1.0,1.0,0.0,1.0,1.0,1.0,0.0
5128,CC[C@H]1CC(=O)[C@@H]2Oc3c(OC)ccc4c3[C@@]23CCN(...,BBB-,12.559895,12.559895,0.048057,-3.487690,0.443295,22.448276,440.503,420.343,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5129,CN(C)C=Nc1c(I)cc(I)c(CCC(=O)O)c1I,BBB-,12.935121,12.935121,0.008292,-0.854188,0.192994,15.076923,551.769,498.345,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5130,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,BBB-,13.042813,13.042813,0.190845,-3.798036,0.233814,31.275000,594.668,564.428,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [4]:
# Data pre-processing

# Features: every column except the SMILES string and the class label.
classification_X = classification_centroid_df.drop(
    columns=['SMILES', 'BBB+/BBB-'],
    errors='ignore'
)

# Target: the 'BBB+' / 'BBB-' class label.
classification_y = classification_centroid_df['BBB+/BBB-']

# Standardise the features, then reduce them with PCA.
data_processing_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        # With svd_solver='full', a fractional n_components (between 0 and 1)
        # is the share of variance the kept components must explain.
        ('pca', PCA(n_components=0.95, svd_solver='full')),
    ]
)

# NOTE(review): the pipeline is fitted on every row, i.e. before the
# train/test split performed further down, so held-out rows influence the
# scaling statistics and the principal components. Confirm this is intended.
# Pipeline.fit returns the pipeline itself, so pickle_pipe is the same fitted
# object as data_processing_pipeline.
pickle_pipe = data_processing_pipeline.fit(classification_X)
classification_X_processed = pickle_pipe.transform(classification_X)
print(classification_X_processed.shape)

(5132, 1529)


In [5]:
# Pickle the fitted scaler + PCA pipeline of the centroid-balanced data.
# Forward slashes keep the relative paths valid on every operating system;
# the backslash form only resolves on Windows.
save_pickle(
    pickle_pipe,
    'model_outputs/svm_classifier/centroid_pipeline.pkl'
)

# First two rows of the fitted PCA's components_ array, i.e. the first two
# principal axes.
top_two_components = data_processing_pipeline['pca'].components_[0:2]

save_pickle(
    top_two_components,
    'model_outputs/svm_classifier/top_two_centroid.pkl'
)

0

In [None]:
# Hold out 20 % of the rows with a reproducible, shuffled split.
(
    classification_X_train,
    classification_X_test,
    classification_y_train,
    classification_y_test,
) = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    # Stratifying on the label keeps the ratio of the 2 categories the same
    # in the train set and the test set.
    stratify=classification_y,
)

In [None]:
# Pickle the held-out features and labels of the centroid-balanced split.
# Forward slashes keep the relative paths valid on every operating system;
# the backslash form only resolves on Windows.
save_pickle(
    classification_X_test,
    'model_outputs/svm_classifier/centroid_x_test.pkl'
)

save_pickle(
    classification_y_test,
    'model_outputs/svm_classifier/centroid_y_test.pkl'
)

In [None]:
# Wall-clock timing of the whole search.
start_time = datetime.now()

svc_centroid = SVC(random_state=1, probability=True)

# Hyper-parameter grid: 2 C values x 1 kernel x 2 gamma values = 4 candidates.
svc_centroid_param_grid = {
    'C': [1, 5],  # Regularization parameter
    'kernel': ['rbf'],  # Algorithm kernel
    'gamma': ['auto', 3],  # Kernel coefficient
}

# Recall, precision and F1 need an explicit pos_label: the default is 1,
# which would throw an error because the labels are "BBB+" / "BBB-".
svc_centroid_scoring = {
    metric_name: make_scorer(
        metric_function,
        pos_label='BBB+',
        average='binary'
    )
    for metric_name, metric_function in (
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )
}
# These built-in scorers are referenced by name and need no pos_label.
svc_centroid_scoring.update({
    'Accuracy': 'accuracy',
    'Balanced accuracy': 'balanced_accuracy',
    'AUROC': 'roc_auc',
})

svc_centroid_grid_search = GridSearchCV(
    estimator=svc_centroid,
    param_grid=svc_centroid_param_grid,
    # 8 stratified folds repeated twice (a different split each repeat),
    # i.e. 16 fits per candidate.
    cv=RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=1),
    scoring=svc_centroid_scoring,
    refit='AUROC',  # The final estimator is refitted using the best AUROC candidate
    n_jobs=1,
    verbose=10,
)

svc_centroid_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

In [None]:
# Turn the grid-search results into a DataFrame.
svc_centroid_results_df = pd.DataFrame(svc_centroid_grid_search.cv_results_)

# Discard the timing, per-split and standard-deviation columns (they aren't
# very interesting), then order the candidates by their AUROC rank.
svc_centroid_results_df = (
    svc_centroid_results_df
    .drop(
        columns=svc_centroid_results_df.filter(regex='time|split|std').columns
    )
    .sort_values(by='rank_test_AUROC')
)

svc_centroid_results_df

In [None]:
# Persist the ranked grid-search table (without the DataFrame index).
# Forward slashes keep the relative path valid on every operating system;
# the backslash form only resolves on Windows.
svc_centroid_results_df.to_csv(
    'model_grid_search/svm_classifier_centroid_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted for the best candidate and
# pickle it. Forward slashes keep the relative path portable across
# operating systems (the backslash form only resolves on Windows).
best_svc_centroid_classifier = svc_centroid_grid_search.best_estimator_
save_pickle(
    best_svc_centroid_classifier,
    'model_pickles/best_svm_classifier_centroid.pkl'
)
# To load this best model again, use
# load_pickle('model_pickles/best_svm_classifier_centroid.pkl')
# (the same path that is written just above).
# NOTE(review): load_pickle is not imported in this notebook; presumably it
# lives next to save_pickle in pickle_managment - confirm.

#### Balanced by SMOTEENN method

In [6]:
# Load the classification set that was balanced by the SMOTEENN method.
classification_smoteenn_df = pd.read_csv(
    filepath_or_buffer="datasets/balanced_datasets/BBB_classification_balanced_smoteenn.csv.zip"
)
classification_smoteenn_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@H](NC(=...,BBB+,16.987088,16.987088,0.036070,-2.333658,0.607687,52.700000,444.902000,417.686000,...,1.000000,0.000000,1.000000,1.000000,0.0,0.0,1.0,1.000000,1.0,0.0
1,CCCCC[C@H](N)C(O)(c1ccccc1)c1ccccc1,BBB+,11.850791,11.850791,0.333611,-1.232222,0.556573,20.437500,224.260000,208.132000,...,0.000000,1.000000,1.000000,1.000000,1.0,0.0,1.0,1.000000,1.0,0.0
2,Cc1nnc(SCC2=C(C(=O)O)N3C(=O)[C@@H](NC(=O)[C@H]...,BBB+,13.647107,13.647107,0.012185,-1.422510,0.564246,42.285714,488.621000,448.301000,...,1.000000,0.000000,1.000000,1.000000,0.0,0.0,1.0,1.000000,1.0,0.0
3,C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C(Cl)=...,BBB+,11.649464,11.649464,0.198757,-0.198757,0.793104,26.576923,350.422000,328.246000,...,0.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
4,CC(C)=CCN1Cc2cc(Cl)cc3[nH]c(=S)n(c23)C[C@@H]1C,BBB+,11.786832,11.786832,0.063339,-0.683465,0.912447,33.200000,295.766000,277.622000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6132,O=c1[nH]cnc2c1ncn2[C@@H]1CC[C@@H](CO)O1,BBB-,13.634735,13.634735,0.027420,-2.184699,0.298412,28.342105,527.526000,498.294000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
6133,CCCCCCCCCCCCCCCCOP(=O)(O)OCC[N+](C)(C)C,BBB-,11.010582,11.010582,0.070069,-1.291404,0.195738,49.951014,448.391665,410.237000,...,1.000000,1.000000,1.000000,1.000000,1.0,0.0,1.0,1.000000,1.0,0.0
6134,O=C1CN2CCO[C@]2(c2ccccc2F)c2cc(Cl)ccc2N1CCO,BBB-,7.454496,7.454496,0.509827,0.401000,0.748302,12.771272,205.245974,191.543994,...,0.318647,0.681353,0.318647,0.318647,1.0,1.0,1.0,0.318647,1.0,0.0
6135,CC(C)(CO)C(O)C(=O)NCCCC(=O)O,BBB-,12.530398,12.530398,0.147649,-0.918220,0.208973,18.593750,471.492000,446.292000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,0.0,1.000000,1.0,0.0


In [7]:
# Data pre-processing

# Features: every column except the SMILES string and the class label.
classification_X = classification_smoteenn_df.drop(
    columns=['SMILES', 'BBB+/BBB-'],
    errors='ignore'
)

# Target: the 'BBB+' / 'BBB-' class label.
classification_y = classification_smoteenn_df['BBB+/BBB-']

# Standardise the features, then reduce them with PCA.
data_processing_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        # With svd_solver='full', a fractional n_components (between 0 and 1)
        # is the share of variance the kept components must explain.
        ('pca', PCA(n_components=0.95, svd_solver='full')),
    ]
)

# NOTE(review): the pipeline is fitted on every row, i.e. before the
# train/test split performed further down, so held-out rows influence the
# scaling statistics and the principal components. Confirm this is intended.
# Pipeline.fit returns the pipeline itself, so pickle_pipe is the same fitted
# object as data_processing_pipeline.
pickle_pipe = data_processing_pipeline.fit(classification_X)
classification_X_processed = pickle_pipe.transform(classification_X)
print(classification_X_processed.shape)

(6137, 1161)


In [8]:
# Pickle the fitted scaler + PCA pipeline of the SMOTEENN-balanced data.
# Forward slashes keep the relative paths valid on every operating system;
# the backslash form only resolves on Windows.
save_pickle(
    pickle_pipe,
    'model_outputs/svm_classifier/smoteenn_pipeline.pkl'
)

# First two rows of the fitted PCA's components_ array, i.e. the first two
# principal axes.
top_two_components = data_processing_pipeline['pca'].components_[0:2]

save_pickle(
    top_two_components,
    'model_outputs/svm_classifier/top_two_smoteenn.pkl'
)

0

In [None]:
# Hold out 20 % of the rows with a reproducible, shuffled split.
(
    classification_X_train,
    classification_X_test,
    classification_y_train,
    classification_y_test,
) = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    # Stratifying on the label keeps the ratio of the 2 categories the same
    # in the train set and the test set.
    stratify=classification_y,
)

In [None]:
# Pickle the held-out features and labels of the SMOTEENN-balanced split.
# Forward slashes keep the relative paths valid on every operating system;
# the backslash form only resolves on Windows.
save_pickle(
    classification_X_test,
    'model_outputs/svm_classifier/smoteenn_x_test.pkl'
)

save_pickle(
    classification_y_test,
    'model_outputs/svm_classifier/smoteenn_y_test.pkl'
)

In [None]:
# Wall-clock timing of the whole search.
start_time = datetime.now()

svc_smoteenn = SVC(random_state=1, probability=True)

# Hyper-parameter grid: 2 C values x 1 kernel x 2 gamma values = 4 candidates.
svc_smoteenn_param_grid = {
    'C': [1, 5],  # Regularization parameter
    'kernel': ['rbf'],  # Algorithm kernel; poly and sigmoid should be added
    'gamma': ['auto', 3],  # Kernel coefficient
}

# Recall, precision and F1 need an explicit pos_label: the default is 1,
# which would throw an error because the labels are "BBB+" / "BBB-".
svc_smoteenn_scoring = {
    metric_name: make_scorer(
        metric_function,
        pos_label='BBB+',
        average='binary'
    )
    for metric_name, metric_function in (
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )
}
# These built-in scorers are referenced by name and need no pos_label.
svc_smoteenn_scoring.update({
    'Accuracy': 'accuracy',
    'Balanced accuracy': 'balanced_accuracy',
    'AUROC': 'roc_auc',
})

svc_smoteenn_grid_search = GridSearchCV(
    estimator=svc_smoteenn,
    param_grid=svc_smoteenn_param_grid,
    # 8 stratified folds repeated twice (a different split each repeat),
    # i.e. 16 fits per candidate.
    cv=RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=1),
    scoring=svc_smoteenn_scoring,
    refit='AUROC',  # The final estimator is refitted using the best AUROC candidate
    n_jobs=1,
    verbose=10,
)

svc_smoteenn_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

In [None]:
# Turn the grid-search results into a DataFrame.
svc_smoteenn_results_df = pd.DataFrame(svc_smoteenn_grid_search.cv_results_)

# Discard the timing, per-split and standard-deviation columns (they aren't
# very interesting), then order the candidates by their AUROC rank.
svc_smoteenn_results_df = (
    svc_smoteenn_results_df
    .drop(
        columns=svc_smoteenn_results_df.filter(regex='time|split|std').columns
    )
    .sort_values(by='rank_test_AUROC')
)

svc_smoteenn_results_df

In [None]:
# Persist the ranked grid-search table (without the DataFrame index).
# Forward slashes keep the relative path valid on every operating system;
# the backslash form only resolves on Windows.
svc_smoteenn_results_df.to_csv(
    'model_grid_search/svm_classifier_smoteenn_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted for the best candidate and
# pickle it. Forward slashes keep the relative path portable across
# operating systems (the backslash form only resolves on Windows).
best_svc_smoteenn_classifier = svc_smoteenn_grid_search.best_estimator_
save_pickle(
    best_svc_smoteenn_classifier,
    'model_pickles/best_svm_classifier_smoteenn.pkl'
)
# To load this best model again, use
# load_pickle('model_pickles/best_svm_classifier_smoteenn.pkl')
# (the same path that is written just above).
# NOTE(review): load_pickle is not imported in this notebook; presumably it
# lives next to save_pickle in pickle_managment - confirm.