# Import packages

In [1]:
import pandas as pd

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, GridSearchCV, \
    RepeatedStratifiedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    r2_score, make_scorer, recall_score, accuracy_score, f1_score, \
    precision_score, balanced_accuracy_score, roc_curve, auc

from collinearity import SelectNonCollinear

# Custom functions

In [2]:
from pickle_managment import save_pickle

# Regression

In [3]:
# Load the cleaned, expanded regression training set (descriptor columns plus
# the logBB column). Forward slashes keep the relative path valid on Windows,
# Linux and macOS alike; backslash separators only resolve on Windows.
regression_df_expanded_cleaned = pd.read_csv(
    'datasets/train_datasets/regression_df_expanded_cleaned_train.csv.zip'
)
regression_df_expanded_cleaned

Unnamed: 0,SMILES,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y,logBB
0,CC(C)(C)OC(=O)CCCC1=CC=C(C=C1)N(CCCl)CCCl,11.682268,11.682268,0.134704,-0.409691,0.474821,11.347826,360.325,333.109,359.141884,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.00
1,Clc1ccc(N2)c(C(C)=NC(O)C2=O)c1,11.306112,11.306112,0.532016,-1.350763,0.700249,20.066667,224.647,215.575,224.035255,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.61
2,CC1=C2OC3=C(C=CC=C3)C3(O)CNCC3C2=CC=C1,11.238358,11.238358,0.028750,-0.893218,0.770818,27.000000,267.328,250.192,267.125929,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.39
3,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,13.643057,13.643057,0.017252,-2.240737,0.235263,29.846154,543.525,514.293,543.174061,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.83
4,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,11.975791,11.975791,0.293611,-1.310347,0.736884,18.529412,232.239,220.143,232.084792,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,C(=O)(N)N,9.000000,9.000000,0.833333,-0.833333,0.370507,6.000000,60.056,56.024,60.032363,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.14
941,C[C@@H](CC1=CC=CC=C1)NC,3.213399,3.213399,0.567037,0.567037,0.691109,12.909091,149.237,134.117,149.120449,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.96
942,CNC(=O)C1=C(N=C(N=C1OCC2CCN(CC2)C)C#N)NCC3CCC4...,12.730831,12.730831,0.023700,-0.298674,0.670950,21.531250,440.592,404.304,440.289974,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.37
943,CCC1=C2C(=CC=C1)C3=C(N2)C(OCC3)(CC)CC(=O)O,11.285827,11.285827,0.004742,-0.820305,0.905784,21.428571,287.359,266.191,287.152144,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.42


In [4]:
# Data pre-processing

# Every column except the SMILES string and the logBB target is a feature.
non_feature_columns = ['SMILES', 'logBB']
feature_mask = ~regression_df_expanded_cleaned.columns.isin(non_feature_columns)

regression_X = regression_df_expanded_cleaned.loc[:, feature_mask]
regression_y = regression_df_expanded_cleaned['logBB']

# Standardise the features; set_output keeps the result a DataFrame so the
# column names survive the transformation.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split done in a later cell, so held-out rows influence the scaling
# statistics. Consider fitting on the training portion only.
data_processing_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler().set_output(transform='pandas')),
])

regression_X_processed = data_processing_pipeline.fit_transform(regression_X)
regression_X_processed

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,0.417052,0.417240,-0.342126,0.166963,-0.675608,-0.788599,0.242281,0.219503,0.237570,0.161789,...,0.877304,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
1,0.309766,0.309953,0.397096,-0.486012,0.499964,0.214680,-0.590699,-0.554520,-0.592608,-0.665765,...,0.877304,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
2,0.290441,0.290628,-0.539258,-0.168538,0.867969,1.012500,-0.328663,-0.326549,-0.327833,-0.283817,...,0.877304,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
3,0.976308,0.976493,-0.560650,-1.103531,-1.924866,1.340008,1.367017,1.412695,1.368376,1.403120,...,0.877304,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
4,0.500771,0.500958,-0.046470,-0.457969,0.691009,0.037788,-0.544088,-0.524437,-0.543147,-0.506620,...,-1.139855,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,-0.347982,-0.347793,0.957713,-0.126987,-1.219586,-1.403974,-1.601186,-1.605247,-1.600342,-1.525148,...,-1.139855,0.470032,-1.192928,-1.627467,0.436092,-2.061553,-2.387913,0.511549,-2.874840,-0.072932
941,-1.998434,-1.998239,0.462254,0.844677,0.452298,-0.608944,-1.053670,-1.090964,-1.052931,-0.952226,...,-1.139855,0.470032,-1.192928,0.614452,0.436092,0.485071,0.418776,-1.954847,0.347845,-0.072932
942,0.716123,0.716310,-0.548655,0.243993,0.347174,0.383210,0.735071,0.688359,0.736193,0.862027,...,0.877304,0.470032,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932
943,0.303980,0.304168,-0.583926,-0.117947,1.571798,0.371395,-0.205685,-0.221187,-0.204779,-0.124672,...,0.877304,-2.127514,0.838274,0.614452,0.436092,0.485071,0.418776,0.511549,0.347845,-0.072932


In [5]:


# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
regression_split = train_test_split(
    regression_X_processed,
    regression_y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
(
    regression_X_train,
    regression_X_test,
    regression_y_train,
    regression_y_test,
) = regression_split

In [6]:
# Write the held-out regression split to disk with the project's save_pickle
# helper. Forward slashes keep the relative paths usable on any OS; backslash
# separators only resolve on Windows.
# NOTE(review): these regression artifacts are written into the
# `svm_classifier` output folder - confirm that is the intended location.
save_pickle(regression_X_test, 'model_outputs/svm_classifier/regression_x_test.pkl')

save_pickle(regression_y_test, 'model_outputs/svm_classifier/regression_y_test.pkl')

0

## SVM

### Model training

In [None]:
# Grid-search an RBF-kernel SVR and report how long the search took.
start_time = datetime.now()

svr_model = SVR()

# Hyper-parameter candidates. Only the rbf kernel is searched for now;
# poly and sigmoid should be tried as well.
svr_param_grid = {
    'kernel': ['rbf'],
    'gamma': ['auto', 3],  # Kernel coefficient, important for the rbf kernel
    'C': [1, 5],  # Regularization parameter
}

# The error metrics are only available in their negated ("neg_") form, so
# that a larger score is always the better one; R2 is used as-is.
svr_scoring = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'R2': 'r2',
}

svr_regressor_grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=svr_param_grid,
    scoring=svr_scoring,
    refit='R2',  # Refit the final model with the parameters ranked best on R2
    cv=8,  # Number of cross-validation folds. It should be 8 or 10
    # n_jobs=-1 would use all processors, but then no training messages are
    # printed; 1 keeps the progress output visible.
    n_jobs=1,
    verbose=10,  # Provide detailed messages
)

svr_regressor_grid_search.fit(regression_X_train, regression_y_train)

end_time = datetime.now()
print(f'GridSearchCV took {end_time - start_time}')

In [None]:
# Turn the grid-search results into a DataFrame, hide the timing, per-split
# and standard-deviation columns (not very interesting), and list the
# parameter combinations from best to worst R2 rank.
svr_cv_results = pd.DataFrame(svr_regressor_grid_search.cv_results_)
svr_hidden_columns = list(svr_cv_results.filter(regex='time|split|std').columns)

svr_regressor_results_df = (
    svr_cv_results
    .drop(columns=svr_hidden_columns)
    .sort_values(by='rank_test_R2')
)
svr_regressor_results_df

In [None]:
# Export the ranked grid-search summary without the index column. Forward
# slashes keep the relative path valid on every OS; backslash separators only
# resolve on Windows.
svr_regressor_results_df.to_csv(
    'model_grid_search/svm_regressor_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted with the best parameters and
# store it on disk. Forward slashes keep the path usable on any OS.
best_svr_regressor = svr_regressor_grid_search.best_estimator_
save_pickle(
    best_svr_regressor,
    'model_pickles/best_svm_regressor.pkl'
)
# To load this best model again, use load_pickle('model_pickles/best_svm_regressor.pkl')

### Model interpretation

# Classification

## SVM

### Model training

#### Balanced by centroid method

In [None]:
# Load the BBB classification dataset balanced by the centroid method.
centroid_dataset_path = "datasets/balanced_datasets/BBB_classification_balanced_centroid.csv.zip"
classification_centroid_df = pd.read_csv(centroid_dataset_path)

classification_centroid_df

In [None]:
# Data pre-processing

# Features are all columns other than the SMILES string and the class label.
label_column = 'BBB+/BBB-'
is_feature = ~classification_centroid_df.columns.isin(['SMILES', label_column])

classification_X = classification_centroid_df.loc[:, is_feature]
classification_y = classification_centroid_df[label_column]

# Standardise, then keep as many principal components as are needed to
# explain 95% of the variance. A fractional n_components (between 0 and 1)
# is interpreted that way when svd_solver='full'.
# NOTE(review): the pipeline is fitted on every row, i.e. before the
# train/test split done in a later cell, so held-out rows influence the
# scaler and the PCA. Consider fitting on the training portion only.
data_processing_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, svd_solver='full')),
])

# fit() returns the pipeline itself, so pickle_pipe is the fitted pipeline.
pickle_pipe = data_processing_pipeline.fit(classification_X)
classification_X_processed = pickle_pipe.transform(classification_X)
print(classification_X_processed.shape)

In [None]:
# Store the fitted scaler + PCA pipeline. Forward slashes keep the relative
# paths usable on any OS; backslash separators only resolve on Windows.
save_pickle(
    pickle_pipe,
    'model_outputs/svm_classifier/centroid_pipeline.pkl'
)

# First two rows of the fitted PCA's components_ matrix, i.e. the principal
# axes of the top two components.
top_two_components = data_processing_pipeline['pca'].components_[0:2]


save_pickle(
    top_two_components,
    'model_outputs/svm_classifier/top_two_centroid.pkl'
)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label ensures the
# train set and the test set have the same ratio for the 2 categories; the
# fixed seed makes the shuffled split reproducible.
centroid_split = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    shuffle=True,
    stratify=classification_y,
    random_state=1,
)
(
    classification_X_train,
    classification_X_test,
    classification_y_train,
    classification_y_test,
) = centroid_split

In [None]:
# Write the held-out centroid split to disk with the project's save_pickle
# helper. Forward slashes keep the relative paths usable on any OS; backslash
# separators only resolve on Windows.
save_pickle(classification_X_test, 'model_outputs/svm_classifier/centroid_x_test.pkl')

save_pickle(classification_y_test, 'model_outputs/svm_classifier/centroid_y_test.pkl')

In [None]:
# Grid-search an RBF-kernel SVC on the centroid-balanced data and report how
# long the search took.
start_time = datetime.now()

svc_centroid = SVC(probability=True, random_state=1)

centroid_param_grid = {
    'C': [1, 5],  # Regularization parameter
    'kernel': ['rbf'],  # Algorithm kernel
    'gamma': ['auto', 3],  # Kernel coefficient
}

# Each repeat uses a different split of the 8 stratified folds.
centroid_cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=1)

# Recall, precision and F1 need an explicit pos_label: it defaults to 1,
# which would throw an error because 1 is neither "BBB+" nor "BBB-".
centroid_scoring = {
    metric_name: make_scorer(metric_fn, pos_label='BBB+', average='binary')
    for metric_name, metric_fn in (
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )
}
# The string-named scorers below are used as-is, without a pos_label.
centroid_scoring['Accuracy'] = 'accuracy'
centroid_scoring['Balanced accuracy'] = 'balanced_accuracy'
centroid_scoring['AUROC'] = 'roc_auc'

svc_centroid_grid_search = GridSearchCV(
    estimator=svc_centroid,
    param_grid=centroid_param_grid,
    cv=centroid_cv,
    scoring=centroid_scoring,
    refit='AUROC',  # Refit the final model with the parameters ranked best on AUROC
    n_jobs=1,
    verbose=10,
)

svc_centroid_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print(f'GridSearchCV took {end_time - start_time}')

In [None]:
# Turn the grid-search results into a DataFrame, hide the timing, per-split
# and standard-deviation columns (not very interesting), and list the
# parameter combinations from best to worst AUROC rank.
centroid_cv_results = pd.DataFrame(svc_centroid_grid_search.cv_results_)
centroid_hidden_columns = list(
    centroid_cv_results.filter(regex='time|split|std').columns
)

svc_centroid_results_df = (
    centroid_cv_results
    .drop(columns=centroid_hidden_columns)
    .sort_values(by='rank_test_AUROC')
)

svc_centroid_results_df

In [None]:
# Export the ranked grid-search summary without the index column. Forward
# slashes keep the relative path valid on every OS; backslash separators only
# resolve on Windows.
svc_centroid_results_df.to_csv(
    'model_grid_search/svm_classifier_centroid_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted with the best parameters and
# store it on disk. Forward slashes keep the path usable on any OS.
best_svc_centroid_classifier = svc_centroid_grid_search.best_estimator_
save_pickle(
    best_svc_centroid_classifier,
    'model_pickles/best_svm_classifier_centroid.pkl'
)
# To load this best model again, use load_pickle('model_pickles/best_svm_classifier_centroid.pkl')

#### Balanced by SMOTEENN method

In [None]:
# Load the BBB classification dataset balanced by the SMOTEENN method.
smoteenn_dataset_path = "datasets/balanced_datasets/BBB_classification_balanced_smoteenn.csv.zip"
classification_smoteenn_df = pd.read_csv(smoteenn_dataset_path)

classification_smoteenn_df

In [None]:
# Data pre-processing

# Features are all columns other than the SMILES string and the class label.
label_column = 'BBB+/BBB-'
is_feature = ~classification_smoteenn_df.columns.isin(['SMILES', label_column])

classification_X = classification_smoteenn_df.loc[:, is_feature]
classification_y = classification_smoteenn_df[label_column]

# Standardise, then keep as many principal components as are needed to
# explain 95% of the variance. A fractional n_components (between 0 and 1)
# is interpreted that way when svd_solver='full'.
# NOTE(review): the pipeline is fitted on every row, i.e. before the
# train/test split done in a later cell, so held-out rows influence the
# scaler and the PCA. Consider fitting on the training portion only.
data_processing_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, svd_solver='full')),
])

# fit() returns the pipeline itself, so pickle_pipe is the fitted pipeline.
pickle_pipe = data_processing_pipeline.fit(classification_X)
classification_X_processed = pickle_pipe.transform(classification_X)
print(classification_X_processed.shape)

In [None]:
# Store the fitted scaler + PCA pipeline. Forward slashes keep the relative
# paths usable on any OS; backslash separators only resolve on Windows.
save_pickle(
    pickle_pipe,
    'model_outputs/svm_classifier/smoteenn_pipeline.pkl'
)

# First two rows of the fitted PCA's components_ matrix, i.e. the principal
# axes of the top two components.
top_two_components = data_processing_pipeline['pca'].components_[0:2]


save_pickle(
    top_two_components,
    'model_outputs/svm_classifier/top_two_smoteenn.pkl'
)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label ensures the
# train set and the test set have the same ratio for the 2 categories; the
# fixed seed makes the shuffled split reproducible.
smoteenn_split = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    shuffle=True,
    stratify=classification_y,
    random_state=1,
)
(
    classification_X_train,
    classification_X_test,
    classification_y_train,
    classification_y_test,
) = smoteenn_split

In [None]:
# Write the held-out SMOTEENN split to disk with the project's save_pickle
# helper. Forward slashes keep the relative paths usable on any OS; backslash
# separators only resolve on Windows.
save_pickle(classification_X_test, 'model_outputs/svm_classifier/smoteenn_x_test.pkl')

save_pickle(classification_y_test, 'model_outputs/svm_classifier/smoteenn_y_test.pkl')

In [None]:
# Grid-search an RBF-kernel SVC on the SMOTEENN-balanced data and report how
# long the search took.
start_time = datetime.now()

svc_smoteenn = SVC(probability=True, random_state=1)

smoteenn_param_grid = {
    'C': [1, 5],  # Regularization parameter
    'kernel': ['rbf'],  # Algorithm kernel. Should add poly and sigmoid
    'gamma': ['auto', 3],  # Kernel coefficient
}

# Each repeat uses a different split of the 8 stratified folds.
smoteenn_cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=2, random_state=1)

# Recall, precision and F1 need an explicit pos_label: it defaults to 1,
# which would throw an error because 1 is neither "BBB+" nor "BBB-".
smoteenn_scoring = {
    metric_name: make_scorer(metric_fn, pos_label='BBB+', average='binary')
    for metric_name, metric_fn in (
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )
}
# The string-named scorers below are used as-is, without a pos_label.
smoteenn_scoring['Accuracy'] = 'accuracy'
smoteenn_scoring['Balanced accuracy'] = 'balanced_accuracy'
smoteenn_scoring['AUROC'] = 'roc_auc'

svc_smoteenn_grid_search = GridSearchCV(
    estimator=svc_smoteenn,
    param_grid=smoteenn_param_grid,
    cv=smoteenn_cv,
    scoring=smoteenn_scoring,
    refit='AUROC',  # Refit the final model with the parameters ranked best on AUROC
    n_jobs=1,
    verbose=10,
)

svc_smoteenn_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print(f'GridSearchCV took {end_time - start_time}')

In [None]:
# Turn the grid-search results into a DataFrame, hide the timing, per-split
# and standard-deviation columns (not very interesting), and list the
# parameter combinations from best to worst AUROC rank.
smoteenn_cv_results = pd.DataFrame(svc_smoteenn_grid_search.cv_results_)
smoteenn_hidden_columns = list(
    smoteenn_cv_results.filter(regex='time|split|std').columns
)

svc_smoteenn_results_df = (
    smoteenn_cv_results
    .drop(columns=smoteenn_hidden_columns)
    .sort_values(by='rank_test_AUROC')
)

svc_smoteenn_results_df

In [None]:
# Export the ranked grid-search summary without the index column. Forward
# slashes keep the relative path valid on every OS; backslash separators only
# resolve on Windows.
svc_smoteenn_results_df.to_csv(
    'model_grid_search/svm_classifier_smoteenn_results.csv',
    index=False
)

In [None]:
# Keep the estimator that GridSearchCV refitted with the best parameters and
# store it on disk. Forward slashes keep the path usable on any OS.
best_svc_smoteenn_classifier = svc_smoteenn_grid_search.best_estimator_
save_pickle(
    best_svc_smoteenn_classifier,
    'model_pickles/best_svm_classifier_smoteenn.pkl'
)
# To load this best model again, use load_pickle('model_pickles/best_svm_classifier_smoteenn.pkl')