# Import packages

In [1]:
import pandas as pd

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, GridSearchCV, \
    RepeatedStratifiedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    r2_score, make_scorer, recall_score, accuracy_score, f1_score, \
    precision_score, balanced_accuracy_score, roc_curve, auc

# Custom functions

In [2]:
from pickle_managment import save_pickle

# Regression

In [3]:
regression_df_expanded_cleaned = pd.read_csv(
    r'datasets\train_datasets\regression_df_expanded_cleaned_train.csv.zip'
)
regression_df_expanded_cleaned

Unnamed: 0,SMILES,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y,logBB
0,CC(C)(C)OC(=O)CCCC1=CC=C(C=C1)N(CCCl)CCCl,11.682268,11.682268,0.134704,-0.409691,0.474821,11.347826,360.325,333.109,359.141884,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.00
1,Clc1ccc(N2)c(C(C)=NC(O)C2=O)c1,11.306112,11.306112,0.532016,-1.350763,0.700249,20.066667,224.647,215.575,224.035255,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.61
2,CC1=C2OC3=C(C=CC=C3)C3(O)CNCC3C2=CC=C1,11.238358,11.238358,0.028750,-0.893218,0.770818,27.000000,267.328,250.192,267.125929,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.39
3,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,13.643057,13.643057,0.017252,-2.240737,0.235263,29.846154,543.525,514.293,543.174061,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.83
4,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,11.975791,11.975791,0.293611,-1.310347,0.736884,18.529412,232.239,220.143,232.084792,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,C(=O)(N)N,9.000000,9.000000,0.833333,-0.833333,0.370507,6.000000,60.056,56.024,60.032363,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.14
941,C[C@@H](CC1=CC=CC=C1)NC,3.213399,3.213399,0.567037,0.567037,0.691109,12.909091,149.237,134.117,149.120449,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.96
942,CNC(=O)C1=C(N=C(N=C1OCC2CCN(CC2)C)C#N)NCC3CCC4...,12.730831,12.730831,0.023700,-0.298674,0.670950,21.531250,440.592,404.304,440.289974,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.37
943,CCC1=C2C(=CC=C1)C3=C(N2)C(OCC3)(CC)CC(=O)O,11.285827,11.285827,0.004742,-0.820305,0.905784,21.428571,287.359,266.191,287.152144,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.42


In [4]:
regression_X = regression_df_expanded_cleaned.loc[
               :,
               ~regression_df_expanded_cleaned.columns.isin(
                   ['SMILES', 'logBB'])
               ]

regression_y = regression_df_expanded_cleaned['logBB']

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X,
    regression_y,
    test_size=0.2,
    random_state=1,
    shuffle=True
)

## SVM

### Model training

In [5]:
start_time = datetime.now()

svr_model = SVR()

svr_regressor_grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid={
        'kernel': ['rbf'],  #Kernel to solve with model, should try rbf, poly, and sigmoid
        'gamma': [0.1, 0.5],  #Solver value important for rbf kernel
        #'degree': [2,3,4] #Used for polynomial kernel
        #'C': [0.001, 0.01, 0.1, 1, 10, 100] #Regularization parameter
    },
    cv=2,  #Number of fold for cross validation. It should be 8 or 10
    scoring={
        # All these are only viable in the negative option
        'MAE': 'neg_mean_absolute_error',
        'MSE': 'neg_mean_squared_error',
        'R2': 'r2'
    },
    refit='R2',

    n_jobs=1,
    # -1 means using all processors, but it won't give you any messages.
    # Only using 1 for my computer print out the training messages

    verbose=10  #Provide detailed messages
)

svr_regressor_grid_search.fit(regression_X_train, regression_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/2; 1/2] END gamma=0.1, kernel=rbf; MAE: (test=-0.586) MSE: (test=-0.575) R2: (test=0.065) total time=   0.7s
[CV 2/2; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/2; 1/2] END gamma=0.1, kernel=rbf; MAE: (test=-0.537) MSE: (test=-0.495) R2: (test=0.072) total time=   0.8s
[CV 1/2; 2/2] START gamma=0.5, kernel=rbf.......................................
[CV 1/2; 2/2] END gamma=0.5, kernel=rbf; MAE: (test=-0.586) MSE: (test=-0.575) R2: (test=0.065) total time=   0.9s
[CV 2/2; 2/2] START gamma=0.5, kernel=rbf.......................................
[CV 2/2; 2/2] END gamma=0.5, kernel=rbf; MAE: (test=-0.537) MSE: (test=-0.495) R2: (test=0.072) total time=   0.8s
GridSearchCV took 0:00:04.811224


In [6]:
svr_regressor_results_df = pd.DataFrame(svr_regressor_grid_search.cv_results_)
#Make the GridSearch results into a df
svr_regressor_results_df.drop(
    list(svr_regressor_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting

svr_regressor_results_df = svr_regressor_results_df.sort_values(
    by='rank_test_R2')
svr_regressor_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_MAE,rank_test_MAE,mean_test_MSE,rank_test_MSE,mean_test_R2,rank_test_R2
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",-0.561631,1,-0.534939,1,0.068754,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",-0.561638,2,-0.534952,2,0.06873,2


In [7]:
svr_regressor_results_df.to_csv(
    r'model_grid_search\svm_regressor_results.csv',
    index=False
)

In [8]:
best_svr_regressor = svr_regressor_grid_search.best_estimator_
save_pickle(
    best_svr_regressor,
    r'model_pickles\best_svm_regressor.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svr_regressor\best_rf_regressor.pkl')

0

### Model interpretation

# Classification

## SVM

### Model training

#### Balanced by centroid method

In [9]:
classification_centroid_df = pd.read_csv(
    r"datasets/balanced_datasets/BBB_classification_balanced_centroid.csv.zip"
)

classification_centroid_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@H](NC(=...,BBB+,9.095732,9.095732,0.282774,-0.017526,0.586761,11.860043,358.488,332.280,...,0.333333,1.0,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,CCCCC[C@H](N)C(O)(c1ccccc1)c1ccccc1,BBB+,15.143070,15.143070,0.021937,-1.615734,0.147925,26.905882,1202.635,1090.747,...,1.000000,1.0,1.000000,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,Cc1nnc(SCC2=C(C(=O)O)N3C(=O)[C@@H](NC(=O)[C@H]...,BBB+,15.022344,15.022344,0.036460,-1.947471,0.109538,42.500000,1085.156,1008.548,...,1.000000,0.0,1.000000,1.0,0.0,1.0,1.0,1.0,1.0,0.0
3,C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C(Cl)=...,BBB+,14.543654,14.543654,0.006968,-2.468650,0.116173,36.726027,1030.303,942.607,...,1.000000,1.0,1.000000,1.0,1.0,0.0,1.0,1.0,1.0,0.0
4,CC(C)=CCN1Cc2cc(Cl)cc3[nH]c(=S)n(c23)C[C@@H]1C,BBB+,14.995286,14.995286,0.035518,-1.944947,0.109538,42.500000,1085.156,1008.548,...,1.000000,0.0,1.000000,1.0,0.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127,CC1CC2C(C(O)CC3(C)C2CCC3(O)C(=O)CO)C2(C)C=CC(=...,BBB-,12.428253,12.428253,0.083608,-1.721468,0.122643,47.948718,569.609,526.265,...,1.000000,1.0,1.000000,1.0,1.0,0.0,1.0,1.0,1.0,0.0
5128,CC[C@H]1CC(=O)[C@@H]2Oc3c(OC)ccc4c3[C@@]23CCN(...,BBB-,12.559895,12.559895,0.048057,-3.487690,0.443295,22.448276,440.503,420.343,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5129,CN(C)C=Nc1c(I)cc(I)c(CCC(=O)O)c1I,BBB-,12.935121,12.935121,0.008292,-0.854188,0.192994,15.076923,551.769,498.345,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5130,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,BBB-,13.042813,13.042813,0.190845,-3.798036,0.233814,31.275000,594.668,564.428,...,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [10]:
# Data pre-processing

classification_X = classification_centroid_df.loc[
                   :,
                   ~classification_centroid_df.columns.isin(
                       ['SMILES', 'BBB+/BBB-'])
                   ]

classification_y = classification_centroid_df['BBB+/BBB-']


data_processing_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(
        n_components=0.95, #When using the svd_solver='full', n_components
        # can be between 0 and 1 to represent the percentage of variance
        # that you want to explain
        svd_solver='full'
    ))
])

classification_X_processed=data_processing_pipeline.fit_transform(classification_X)
classification_X_processed

array([[-5.21368953,  3.62439041,  3.83142447, ..., -0.28812032,
         1.01953803,  1.14219888],
       [26.17726014,  2.11795901,  7.38495038, ..., -0.06678346,
        -0.10646903,  0.11734033],
       [30.11439095, -6.29187033, 11.81692613, ...,  0.16951343,
         0.13249661, -0.20160932],
       ...,
       [ 4.95976758, -2.1562782 ,  4.54480143, ...,  0.19877777,
         0.478151  ,  0.5183374 ],
       [ 9.25061068,  9.77543853, -4.86349062, ...,  0.56208832,
        -0.43130308, -0.22326148],
       [-4.13634063,  2.39963095,  3.04815532, ..., -0.65332458,
         0.38452521,  0.0315058 ]])

In [11]:
classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=classification_y #Ensure train set and test set have the same
    # ratio for the 2 categories
)

In [12]:
start_time = datetime.now()

svc_centroid = SVC(
    random_state=1
)

svc_centroid_grid_search = GridSearchCV(
    estimator=svc_centroid,
    param_grid={
        # 'C': [0.001, 0.01, 0.1, 1, 10, 100], #Regularization parameter
        'kernel': ['rbf'], #Algorithm kernel. Should add poly and sigmoid
        #'degree': [2,3,4], #Polynomial degree
        'gamma': [0.1, 0.5] #Coefficient for algorithms
    },
    cv=RepeatedStratifiedKFold(
        n_splits=3,
        n_repeats=2, #Each time the split will be different
        random_state=1
    ),
    scoring={
        'Recall': make_scorer(
            recall_score, #Need pos_label
            pos_label='BBB+', #Without this, pos_label is default to be 1
            # and will through an error since 1 isn't "BBB+" or "BBB-"
            average='binary'
        ),
        'Precision': make_scorer(
            precision_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'F1': make_scorer(
            f1_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'Accuracy': 'accuracy', #accuracy_score doesn't need pos_label
        'Balanced accuracy': 'balanced_accuracy',
        'AUROC': 'roc_auc'
    },
    refit='AUROC',

    n_jobs=1,
    verbose=10
)

svc_centroid_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

Fitting 6 folds for each of 2 candidates, totalling 12 fits
[CV 1/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.816) Accuracy: (test=0.741) Balanced accuracy: (test=0.741) F1: (test=0.791) Precision: (test=0.664) Recall: (test=0.980) total time=  18.3s
[CV 2/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.826) Accuracy: (test=0.754) Balanced accuracy: (test=0.754) F1: (test=0.801) Precision: (test=0.673) Recall: (test=0.991) total time=  30.2s
[CV 3/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 3/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.812) Accuracy: (test=0.732) Balanced accuracy: (test=0.732) F1: (test=0.786) Precision: (test=0.654) Recall: (test=0.982) total time=  24.6s
[CV 4/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 4/6; 1/2] END gamma=0

In [13]:
svc_centroid_results_df = pd.DataFrame(svc_centroid_grid_search.cv_results_)
#Make the GridSearch results into a df

svc_centroid_results_df.drop(
    list(svc_centroid_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting
svc_centroid_results_df = svc_centroid_results_df.sort_values(
    by='rank_test_AUROC')

svc_centroid_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_Recall,rank_test_Recall,mean_test_Precision,rank_test_Precision,mean_test_F1,rank_test_F1,mean_test_Accuracy,rank_test_Accuracy,mean_test_Balanced accuracy,rank_test_Balanced accuracy,mean_test_AUROC,rank_test_AUROC
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",0.984657,1,0.665369,1,0.7941,1,0.74458,1,0.744521,1,0.816975,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",0.984657,1,0.664276,2,0.79332,2,0.743362,2,0.743303,2,0.800742,2


In [14]:
svc_centroid_results_df.to_csv(
    r'model_grid_search\svm_classifier_centroid_results.csv',
    index=False
)

In [15]:
best_svc_centroid_classifier = svc_centroid_grid_search.best_estimator_
save_pickle(
    best_svc_centroid_classifier,
    r'model_pickles\best_svm_classifier_centroid.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svc_centroid_classifier\best_svc_centroid_classifier.pkl')

0

#### Balanced by SMOTEENN method

In [16]:
classification_smoteenn_df = pd.read_csv(
    r"datasets/balanced_datasets/BBB_classification_balanced_smoteenn.csv.zip"
)

classification_smoteenn_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@H](NC(=...,BBB+,16.987088,16.987088,0.036070,-2.333658,0.607687,52.700000,444.902000,417.686000,...,1.000000,0.000000,1.000000,1.000000,0.0,0.0,1.0,1.000000,1.0,0.0
1,CCCCC[C@H](N)C(O)(c1ccccc1)c1ccccc1,BBB+,11.850791,11.850791,0.333611,-1.232222,0.556573,20.437500,224.260000,208.132000,...,0.000000,1.000000,1.000000,1.000000,1.0,0.0,1.0,1.000000,1.0,0.0
2,Cc1nnc(SCC2=C(C(=O)O)N3C(=O)[C@@H](NC(=O)[C@H]...,BBB+,13.647107,13.647107,0.012185,-1.422510,0.564246,42.285714,488.621000,448.301000,...,1.000000,0.000000,1.000000,1.000000,0.0,0.0,1.0,1.000000,1.0,0.0
3,C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C(Cl)=...,BBB+,11.649464,11.649464,0.198757,-0.198757,0.793104,26.576923,350.422000,328.246000,...,0.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
4,CC(C)=CCN1Cc2cc(Cl)cc3[nH]c(=S)n(c23)C[C@@H]1C,BBB+,11.786832,11.786832,0.063339,-0.683465,0.912447,33.200000,295.766000,277.622000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6132,O=c1[nH]cnc2c1ncn2[C@@H]1CC[C@@H](CO)O1,BBB-,13.634735,13.634735,0.027420,-2.184699,0.298412,28.342105,527.526000,498.294000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.0,0.0
6133,CCCCCCCCCCCCCCCCOP(=O)(O)OCC[N+](C)(C)C,BBB-,11.010582,11.010582,0.070069,-1.291404,0.195738,49.951014,448.391665,410.237000,...,1.000000,1.000000,1.000000,1.000000,1.0,0.0,1.0,1.000000,1.0,0.0
6134,O=C1CN2CCO[C@]2(c2ccccc2F)c2cc(Cl)ccc2N1CCO,BBB-,7.454496,7.454496,0.509827,0.401000,0.748302,12.771272,205.245974,191.543994,...,0.318647,0.681353,0.318647,0.318647,1.0,1.0,1.0,0.318647,1.0,0.0
6135,CC(C)(CO)C(O)C(=O)NCCCC(=O)O,BBB-,12.530398,12.530398,0.147649,-0.918220,0.208973,18.593750,471.492000,446.292000,...,1.000000,1.000000,1.000000,1.000000,1.0,1.0,0.0,1.000000,1.0,0.0


In [17]:
# Data pre-processing

classification_X = classification_smoteenn_df.loc[
                   :,
                   ~classification_smoteenn_df.columns.isin(
                       ['SMILES', 'BBB+/BBB-'])
                   ]

classification_y = classification_smoteenn_df['BBB+/BBB-']


data_processing_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(
        n_components=0.95, #When using the svd_solver='full', n_components
        # can be between 0 and 1 to represent the percentage of variance
        # that you want to explain
        svd_solver='full'
    ))
])

classification_X_processed=data_processing_pipeline.fit_transform(classification_X)
classification_X_processed

array([[  2.98706038, -15.52534234, -11.35322912, ...,   0.32526048,
          0.3844538 ,   0.11867296],
       [ -5.6583304 ,  -0.56109026,  -1.29849247, ...,  -0.35508283,
         -0.10207494,   0.05669757],
       [  4.93606644, -14.55570922, -10.44574843, ...,   0.76240043,
          0.24285187,  -0.03796337],
       ...,
       [-10.66358129,   2.3762372 ,   2.68652006, ...,   3.25198126,
         -0.06720095,  -0.65328242],
       [  2.68403602,  13.68948642, -10.43922295, ...,  -0.21991513,
         -0.06626778,   0.17626702],
       [  5.21647711,   1.66299201,  -2.0178315 , ...,  -0.09730234,
         -0.09406303,  -0.0205432 ]])

In [18]:
classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=classification_y #Ensure train set and test set have the same
    # ratio for the 2 categories
)

In [19]:
start_time = datetime.now()

svc_smoteenn = SVC(
    random_state=1
)

svc_smoteenn_grid_search = GridSearchCV(
    estimator=svc_smoteenn,
    param_grid={
        # 'C': [0.001, 0.01, 0.1, 1, 10, 100], #Regularization parameter
        'kernel': ['rbf'], #Algorithm kernel. Should add poly and sigmoid
        #'degree': [2,3,4], #Polynomial degree
        'gamma': [0.1, 0.5] #Coefficient for algorithms
    },
    cv=RepeatedStratifiedKFold(
        n_splits=3,
        n_repeats=2, #Each time the split will be different
        random_state=1
    ),
    scoring={
        'Recall': make_scorer(
            recall_score, #Need pos_label
            pos_label='BBB+', #Without this, pos_label is default to be 1
            # and will through an error since 1 isn't "BBB+" or "BBB-"
            average='binary'
        ),
        'Precision': make_scorer(
            precision_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'F1': make_scorer(
            f1_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'Accuracy': 'accuracy', #accuracy_score doesn't need pos_label
        'Balanced accuracy': 'balanced_accuracy',
        'AUROC': 'roc_auc'
    },
    refit='AUROC',

    n_jobs=1,
    verbose=10
)

svc_smoteenn_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

Fitting 6 folds for each of 2 candidates, totalling 12 fits
[CV 1/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.957) Accuracy: (test=0.827) Balanced accuracy: (test=0.825) F1: (test=0.853) Precision: (test=0.744) Recall: (test=1.000) total time=  13.8s
[CV 2/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.965) Accuracy: (test=0.830) Balanced accuracy: (test=0.829) F1: (test=0.856) Precision: (test=0.748) Recall: (test=1.000) total time=  13.5s
[CV 3/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 3/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.966) Accuracy: (test=0.838) Balanced accuracy: (test=0.837) F1: (test=0.862) Precision: (test=0.757) Recall: (test=1.000) total time=  12.3s
[CV 4/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 4/6; 1/2] END gamma=0

In [20]:
svc_smoteenn_results_df = pd.DataFrame(svc_smoteenn_grid_search.cv_results_)
#Make the GridSearch results into a df

svc_smoteenn_results_df.drop(
    list(svc_smoteenn_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting
svc_smoteenn_results_df = svc_smoteenn_results_df.sort_values(
    by='rank_test_AUROC')

svc_smoteenn_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_Recall,rank_test_Recall,mean_test_Precision,rank_test_Precision,mean_test_F1,rank_test_F1,mean_test_Accuracy,rank_test_Accuracy,mean_test_Balanced accuracy,rank_test_Balanced accuracy,mean_test_AUROC,rank_test_AUROC
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",1.0,1,0.746956,1,0.855117,1,0.829091,1,0.827652,1,0.962218,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",1.0,1,0.731739,2,0.845052,2,0.815035,2,0.813479,2,0.948825,2


In [21]:
svc_smoteenn_results_df.to_csv(
    r'model_grid_search\svm_classifier_smoteenn_results.csv',
    index=False
)

In [22]:
best_svc_smoteenn_classifier = svc_smoteenn_grid_search.best_estimator_
save_pickle(
    best_svc_smoteenn_classifier,
    r'model_pickles\best_svm_classifier_smoteenn.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svc_smoteenn_classifier\best_svc_smoteenn_classifier.pkl')

0