# Model selection

---

1. Separate data as a holdout set for model validation after parameter tuning

2. Make data into numpy matrices

3. Set up pipelines for 10-fold CV of each model for each algorithm, where:
    - Data is scaled according to fold's train data
    - PCA is computed
    - Model is evaluated on fold

In [13]:
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [13]:
# Read the feature table; the first CSV column is used as the row index.
train_X = pd.read_csv("data/train_X.csv", index_col=0)

# Read the labels the same way, then convert them to a NumPy array.
train_Y = pd.read_csv("data/train_Y.csv", index_col=0)
train_Y = train_Y.to_numpy()

# Print the column / dtype / memory summary of the feature table.
train_X.info()

In [14]:
# Hold out 25% of the samples, stratified on the labels, for validating the
# tuned models; random_state pins the split so reruns give the same subsets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=1)

# n_splits=1 yields exactly one (train, validate) pair of positional indices,
# so take it directly instead of looping over the generator.
train_index, validate_index = next(sss.split(train_X, train_Y))

# Features become NumPy matrices. Labels are flattened to 1-D for BOTH
# subsets so estimators and metrics always receive the same label shape
# (previously only y_train was flattened).
x_train = train_X.iloc[train_index].to_numpy()
x_validate = train_X.iloc[validate_index].to_numpy()
y_train = np.ravel(train_Y[train_index])
y_validate = np.ravel(train_Y[validate_index])

# Drop the full tables and the split helpers; later cells work from the
# train/validate subsets.
del train_X, train_Y, sss, train_index, validate_index

---

# Set up model evaluation pipelines


In [None]:
from sklearn.metrics import roc_auc_score, make_scorer

# Scorer handed to the grid search below.
# NOTE(review): with default arguments make_scorer() scores the estimator's
# hard predict() labels, so this AUC is computed from class labels rather
# than from probabilities / decision values. Consider scoring="roc_auc" if a
# ranking-based AUC is intended — confirm before changing reported numbers.
auc_scorer = make_scorer(roc_auc_score)

# Standalone transformers. `pca` is fitted directly by the PCA-spectrum cell;
# the pipeline below builds its own StandardScaler / PCA instances.
scaler = StandardScaler()
pca = PCA()

# scale -> PCA -> logistic regression, refitted inside every CV fold so the
# scaling and projection only ever see that fold's training part.
logistic = LogisticRegression(max_iter=1000, tol=0.1)
estimators = [('scaler', StandardScaler()), ('pca', PCA()), ('logistic', logistic)]
pipe = Pipeline(steps=estimators)

# Pipeline parameters are addressed as '<step name>__<parameter>'; the step
# names must match those in `estimators`, and the parameter part must be an
# exact estimator parameter name (no trailing punctuation).
param_grid = {
    'pca__n_components': [10, 50, 100, 200, 500],
    'logistic__C': np.logspace(-4, 4, 20),
    'logistic__solver': ['liblinear'],
    'logistic__penalty': ['l1', 'l2'],
}

# NOTE(review): `cv` is left at GridSearchCV's default; the plan at the top
# of the notebook mentions 10-fold CV — pass cv=10 if that is what is wanted.
search = GridSearchCV(pipe,
                      param_grid,
                      scoring=auc_scorer,
                      n_jobs=-1,
                      verbose=2)
search.fit(x_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
print(search.cv_results_)

# Tabular copy of the per-candidate CV results for later inspection.
cv_results = pd.DataFrame(search.cv_results_)

In [None]:
# Inspect the parameter names LogisticRegression exposes; these are the names
# that follow the 'logistic__' prefix in the pipeline's parameter grid.
LogisticRegression().get_params().keys()


In [None]:
# Predict the held-out validation set with the grid search's best pipeline,
# timing the prediction step.
t0 = time()
y_pred = search.predict(x_validate)
print("done in %0.3fs" % (time() - t0))

# Both reports compare the predictions against the validation labels.
print(classification_report(y_validate, y_pred))
# `labels` is omitted so the matrix rows/columns follow the sorted labels
# that actually occur in y_validate / y_pred.
print(confusion_matrix(y_validate, y_pred))



In [None]:
# Plot the PCA spectrum (top) and the best CV score per searched
# n_components (bottom) on a shared x axis.
# NOTE(review): `pca` is fitted on the raw x_train here, whereas the searched
# pipeline standardises the data before its PCA step — confirm which
# spectrum is wanted.
pca.fit(x_train)

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(np.arange(1, pca.n_components_ + 1),
         pca.explained_variance_ratio_, '+', linewidth=2)
ax0.set_ylabel('PCA explained variance ratio')

# Mark the number of components picked by the grid search.
ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
ax0.legend(prop=dict(size=12))

# For each number of components, keep the best-scoring candidate.
results = pd.DataFrame(search.cv_results_)
components_col = 'param_pca__n_components'
# cv_results_ stores parameter values as objects; make them numeric so they
# sort and plot on the same numeric x axis as the spectrum above.
results[components_col] = results[components_col].astype(int)
# Sort by score and keep the first row per n_components. This avoids
# groupby(...).apply(...) on the grouping column, which recent pandas
# releases deprecate.
best_clfs = (results.sort_values('mean_test_score', ascending=False)
             .drop_duplicates(components_col)
             .sort_values(components_col))

best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
               legend=False, ax=ax1)
# The grid search above is scored with the ROC AUC scorer, not accuracy.
ax1.set_ylabel('ROC AUC (CV mean)')
ax1.set_xlabel('n_components')

# Show the whole searched range (a fixed upper limit of 70 would hide every
# candidate above it, including possibly the chosen one).
ax1.set_xlim(0, best_clfs[components_col].max() * 1.05)

# Figure-level title, set before tight_layout so it is not drawn over the
# lower subplot.
fig.suptitle('PCA of x_train')
fig.tight_layout()
fig.savefig('pca_fig', bbox_inches="tight")

In [None]:

## Use "grid search" to find good values for our SVC for this training set
# Candidate hyper-parameters: a linear kernel over C, and an RBF kernel over
# every combination of C and gamma.
svc_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'],
     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]

# perform the search
svc_search = GridSearchCV(
    SVC(),           # estimator object
    svc_parameters,  # parameters to search among
    n_jobs=-1,       # -1 means "use all available CPUs"
    verbose=1)       # give a message indicating setup
# Tune on the training split; the validation split is the holdout reserved
# for checking the tuned model afterwards.
svc_search.fit(x_train, y_train)

# NOTE(review): this search feeds SVC unscaled features, unlike the logistic
# pipeline, which standardises inside each fold. Consider wrapping SVC in a
# Pipeline with StandardScaler (parameter names then need an 'svc__'-style
# step prefix).


# Candidate hyper-parameters for a plain LogisticRegression: L1 and L2
# penalties over 20 log-spaced values of C, using the liblinear solver.
logreg_parameters = [
    {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-3, 2, 20),
        'solver': ['liblinear'],
    },
]

# perform the search
print(" . Performing grid search")
logreg_search = GridSearchCV(
    LogisticRegression(),
    logreg_parameters,
    n_jobs=-1,
    verbose=1,
)

# Tune on the training split; the validation split is the holdout reserved
# for checking the tuned model afterwards.
logreg_search.fit(x_train, y_train)


In [None]:
###  Scaling on X and PCA (if indicated)
# Fit a StandardScaler on the training features and reuse the fitted scaler
# on the other subsets, so they are scaled with the training statistics.
# NOTE(review): X_training_from_file, X_test and X_validation are not defined
# in any cell visible here (the split cell produces x_train / x_validate) —
# confirm where these come from before running this cell.
# NOTE(review): despite its name, X_training_projected holds the scaled
# training data; no PCA projection is applied in this cell.
# create scaler with training
X_train_scaler = StandardScaler() 
X_training_projected = X_train_scaler.fit_transform(X_training_from_file) 
# apply scaler to other data
X_test = X_train_scaler.transform(X_test)
X_validation = X_train_scaler.transform(X_validation)


In [None]:
def save_pca_explanation_figure(k_PCA_sufficient,
        variance_explained,
        cumulative_variance_explained,
        data_dirname):
    '''
    Save a bar chart of the cumulative explained variance of the leading
    principal components.

    The chart is limited to k_PCA_sufficient bars rather than the total
    number of components, and is written to
    "<data_dirname>/PCA-variance-explained.pdf".

    k_PCA_sufficient: number of leading components to draw (int).
    variance_explained: fraction of variance reported in the title; it is
        multiplied by 100 for display.
    cumulative_variance_explained: sequence of cumulative values; only the
        first k_PCA_sufficient entries are drawn, so the full per-component
        sequence may be passed.
    data_dirname: directory the PDF is written into.

    The "> 90%" in the title is fixed text, not computed from the arguments.
    Drawing happens on matplotlib's current figure.
    '''
    # Truncate to the first k values so a longer sequence does not cause a
    # shape mismatch in plt.bar; a sequence shorter than k still raises.
    heights = cumulative_variance_explained[:k_PCA_sufficient]
    plt.bar(range(k_PCA_sufficient), heights)
    plt.ylabel("Cumulative Explained Variance")
    plt.xlabel("Principal Components")
    plt.title("%.2f%% of variance (> 90%%) is explained by the first %d columns"
            % (variance_explained * 100.0, k_PCA_sufficient))
    fig_filename = "%s/PCA-variance-explained.pdf" % data_dirname
    plt.savefig(fig_filename, bbox_inches="tight")

def save_pca_scatterplot_figure(X, y, data_dirname):
    '''
    Save a scatter plot of the first two columns of X, coloured by y.

    The figure is written to "<data_dirname>/PCA-data-scatter.pdf".

    X: 2-D array indexed as X[:, 0] and X[:, 1]; only those two columns
        are plotted.
    y: one label per row of X. It is flattened before use, so a 1-D
        array, a column vector or a 1 x N matrix are all accepted.
    data_dirname: directory the PDF is written into.
    '''
    # NOTE(review): `sns` (seaborn) is used below but no seaborn import is
    # visible in this notebook — make sure it is imported before calling.
    plotdata = pd.DataFrame({"X_0": X[:, 0], "X_1": X[:, 1]})
    # Flatten the labels so a 2-D label array can be assigned as one column.
    plotdata['label'] = np.ravel(y)
    sns.relplot(x="X_0", y="X_1", hue="label", data=plotdata)
    plt.xlabel("X_0 Components")
    plt.ylabel("X_1")
    plt.title("Scatter by first two principal components")
    fig_filename = "%s/PCA-data-scatter.pdf" % data_dirname
    plt.savefig(fig_filename, bbox_inches="tight")

