# Hyperparameter selection

**Goal**: Optimize settings/parameters within a chosen model class to maximize performance.

**Output**: A report with metrics to inform which model types are likely to perform better.

**Notes**:
- Even though logistic regression using all of the FPR (All) genes did sliiightly better, there are so many more features and I have so few cells that I'm worried about creating issues when N features >> N cells

**TODO**:


In [34]:

# Import needed libraries
import scanpy as sc
import numpy as np
import pandas as pd
import os

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from utils.config import *
from utils.analysis_variables import *
from utils.analysis_functions import *

import pickle

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer


In [35]:
# Scanpy setup
sc.settings.verbosity = 3 # corresponds to hints

# Notebook setup
np.random.seed(15)

import warnings
warnings.filterwarnings('ignore')

In [36]:
# Important paths
notebook_name = "05b_hyperparam_selection"

# path_outdir_base = "../../output/20240221_import"
path_results = os.path.join(path_outdir_base, notebook_name)
os.makedirs(path_results, exist_ok=True)

path_input_data = os.path.join(path_outdir_base, "03_create_test_train", "training_data.pkl")
path_input_adata = os.path.join(path_outdir_base, "03_create_test_train", "adata_labeled.h5ad")
path_input_features = os.path.join(path_outdir_base, "04_feature_selection", "dict_feature_data.pkl")

chosen_feature_set = "FPR (200)"

# Import data

In [37]:
with open(path_input_data, 'rb') as f:
    data_dict = pickle.load(f)

X_train = data_dict['X']
y_train = data_dict['Y']

In [38]:
adata_labeled = sc.read_h5ad(path_input_adata)

In [39]:

with open(path_input_features, 'rb') as file:
    dict_feature_data = pickle.load(file)

print(dict_feature_data.keys())
prelim_model_features = dict_feature_data[chosen_feature_set]


dict_keys(['Highly Variable Genes', 'K Best (100)', 'K Best (200)', 'K Best (Signif)', 'FPR (All)', 'FPR (200)'])


In [40]:
 # Subset training data to only include X_data corresponding to features we want to use
adata_model = adata_labeled[:, adata_labeled.var.index.isin(prelim_model_features)]

mask_features = adata_labeled.var.index.isin(prelim_model_features)
X_train_filtered = X_train[:, mask_features]

print(f"Shape of X_train: {X_train_filtered.shape}, Shape of Y_train: {y_train.shape}")

Shape of X_train: (7407, 266), Shape of Y_train: (7407,)


# Train models using a wider set of hyperparameters

In [41]:
# Define scoring metrics

scoring = {
    'accuracy': 'accuracy',  # Default scoring for classification
    'specificity': make_scorer(specificity),
    'false_positive_rate': make_scorer(false_positive_rate, greater_is_better=False),  # Minimize false positive rate
    'false_negative_rate': make_scorer(false_negative_rate, greater_is_better=False),  # Minimize false negative rate; something funny abt this processing makes this neg
    'precision': make_scorer(precision)
}

# Define pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', None), # classifier will be replaced during grid search
])

param_grid_svc = {
    'classifier': [SVC()],
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter; smaller is stronger
    'classifier__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'classifier__gamma': ['scale', 'auto']
}

param_grid_lr = {
    'classifier': [LogisticRegression()],
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter; smaller is stronger
    'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
    'classifier__solver': ['lbfgs', 'liblinear']
}

# Combine the param grids into a list
all_param_grids = [param_grid_svc, param_grid_lr]


# Do grid search CV on different parameter sets

In [42]:
def GridSearchCV_on_featureset(X_train, featureset_id, all_param_grids=all_param_grids, pipeline=pipeline, scoring=scoring, path_results=path_results):
    print(f"RandomizedSearchCV on {featureset_id}")
    
    # fit the model and assess
    best_params_list = []
    best_estimator_list = []
    cv_result_list = []
    cv_result_list_df = []
    cv_result_list_filtered = []

    for param_grid in all_param_grids:
        grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='accuracy') #GridSearchCV
        grid_search.fit(X_train, y_train)

        # Access the best parameters and best estimator for each classifier
        best_params_list.append( grid_search.best_params_ )
        best_estimator_list.append(grid_search.best_estimator_)

        cv_results = grid_search.cv_results_
        cv_result_list.append(cv_results)

        print(cv_results['mean_test_accuracy'])

        print(f"Best Parameters: {grid_search.best_params_ }")
        print(f"Best Estimator: {grid_search.best_estimator_}")


    for cv_scores in cv_result_list:
        cv_df = pd.DataFrame.from_dict(cv_scores)

        cols_params = [x for x in cv_df.columns if x.startswith("param")]
        cols_scores =  [x for x in cv_df.columns if x.startswith("mean_")]

        cv_result_list_df.append(cv_df)
        cv_result_list_filtered.append(cv_df[cols_params+cols_scores])

    merged_cv_df = pd.concat(cv_result_list_df, join='outer', axis=0)
    merged_cv_df_filtered = pd.concat(cv_result_list_filtered, join='outer', axis=0)

    merged_cv_df.to_csv(os.path.join(path_results, 'merged_cv_df_' + featureset_id+'.csv'))
    merged_cv_df_filtered.to_csv(os.path.join(path_results, 'merged_cv_df_filtered_' + featureset_id + '.csv'))

    return(merged_cv_df)


In [43]:
result = GridSearchCV_on_featureset(X_train_filtered, chosen_feature_set)

RandomizedSearchCV on FPR (200)
[0.96989278 0.98015347 0.98015347 0.94343137 0.98096364 0.98096364
 0.83353745 0.91224589 0.93816557 0.96827243]
Best Parameters: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__C': 10, 'classifier': SVC()}
Best Estimator: Pipeline(steps=[('scaler', StandardScaler()), ('classifier', SVC(C=10))])
[       nan        nan 0.97704883 0.97286401 0.97164898        nan
 0.97556334 0.95139723        nan 0.97691387]
Best Parameters: {'classifier__solver': 'liblinear', 'classifier__penalty': 'l2', 'classifier__C': 0.1, 'classifier': LogisticRegression()}
Best Estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', LogisticRegression(C=0.1, solver='liblinear'))])


# Based on results, create the best model

SVMs are less interpretable than logistic regression models, but importance of features can still be hinted at by looking at how far genes fall from decision boundary.

In [47]:
best_model = SVC(C=1, kernel = 'rbf', gamma = 'scale')
best_model.fit(X_train_filtered, y_train)

# Save files

In [44]:

# with open(os.path.join(path_results, 'training_data_selectfeatures.pkl'), 'wb') as f:
#     pickle.dump({'X': X_train_features, 'Y': y_train}, f)

In [48]:
import joblib
joblib.dump(best_model, os.path.join(path_results, 'best_model.pkl'))


['../../output/20240221_import/05b_hyperparam_selection/best_model.pkl']