In [1]:
# Plotting setup: matplotlib for figures, seaborn for styling.
import matplotlib
import matplotlib.pyplot as plt

# Render figures inline in the notebook output.
%matplotlib inline
plt.style.use('seaborn-v0_8') # seaborn-flavoured matplotlib style sheet as the base look

import seaborn as sns
# 'notebook' is the seaborn plotting context; whitegrid style with fonts scaled up 1.25x.
# NOTE: this runs after plt.style.use above, so overlapping settings come from seaborn.
sns.set_theme('notebook', style='whitegrid', font_scale=1.25)

# Auto-reload imported modules before each cell runs, so edits to local .py files
# are picked up without restarting the Jupyter kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [56]:
# Project-local experiment helpers and preprocessing callables.
from experimentloop2 import load_data, grid_search, PARAM_GRID, pipeline
from preprocessing import text_col, num_del, num_to_sp
# FunctionTransformer's public home is sklearn.preprocessing; sklearn.pipeline only
# exposes it as an incidental internal import, which is not a stable API.
from sklearn.preprocessing import FunctionTransformer

In [70]:
# Full sweep: text-extraction variant x vectorizer settings x regularization strength.
x_NC, y_N = load_data()

param_grid = dict(
    # Candidate transformers for the "extract_text" pipeline step.
    extract_text=[FunctionTransformer(fn) for fn in (text_col, num_to_sp)],
    # Tokens must be at least 1, 2, or 3 word characters long.
    featurize__token_pattern=[
        r"(?u)\b\w+\b",
        r"(?u)\b\w\w+\b",
        r"(?u)\b\w\w\w+\b",
        # 4+ character variant left disabled:
        # r"(?u)\b\w\w\w\w+\b",
    ],
    featurize__strip_accents=["unicode", None],
    featurize__lowercase=[True, False],
    featurize__min_df=[1, 2, 3],
    # 11 log-spaced values from 1e-2 to 1 (floats).
    featurize__max_df=np.logspace(-2, 0, 11),
    # 15 log-spaced values of C from 1e-3 to 1e3.
    classify__C=np.logspace(-3, 3, 15),
)

# Train scores are not requested for this large sweep.
search = grid_search(x_NC, y_N, param_grid, return_train_score=False)

In [68]:
# Report the winning configuration of the full sweep: score, vocabulary size, parameters.
fitted_vectorizer = search.best_estimator_.named_steps["featurize"]
for label, value in (
    ("best auroc: ", search.best_score_),
    ("vocab size: ", len(fitted_vectorizer.vocabulary_)),
    ("best params: ", search.best_params_),
):
    print(label, value)

best auroc:  0.8885381944444444
vocab size:  4526
best params:  {'classify__C': 7.196856730011514, 'extract_text': FunctionTransformer(func=<function text_col at 0x12eddfc70>), 'featurize__lowercase': True, 'featurize__max_df': 0.15848931924611143, 'featurize__min_df': 1, 'featurize__strip_accents': 'unicode', 'featurize__token_pattern': '(?u)\\b\\w+\\b'}


In [75]:
# Re-sweep only C around the best configuration found by `search`,
# this time keeping the training scores.
x_NC, y_N = load_data()

param_grid_l2 = {
    # Pin every hyperparameter to its best value (wrapped in a one-element list) ...
    **{name: [best_value] for name, best_value in search.best_params_.items()},
    # ... then replace the single best C with the full 15-point log-spaced range.
    'classify__C': np.logspace(-3, 3, 15),
}
search_l2 = grid_search(x_NC, y_N, param_grid_l2, return_train_score=True)

In [66]:
# L1-penalized sweep over C with text extraction and vectorizer settings held fixed.
# The fixed values match the best_params_ printed for `search`.
fixed_text_settings = {
    "extract_text": [FunctionTransformer(text_col)],
    "featurize__token_pattern": [r"(?u)\b\w+\b"],
    "featurize__strip_accents": ["unicode"],
    "featurize__lowercase": [True],
    "featurize__min_df": [1],
    "featurize__max_df": [0.15848931924611143],
}
l1_classifier_settings = {
    "classify__penalty": ['l1'],
    # Only C varies: 15 log-spaced values from 1e-3 to 1e3.
    "classify__C": np.logspace(-3, 3, 15),
    "classify__solver": ['liblinear'],
}
param_grid_l1 = {**fixed_text_settings, **l1_classifier_settings}

search_l1 = grid_search(x_NC, y_N, param_grid_l1, return_train_score=True)



In [67]:
# Report the winning configuration of the L1 sweep: score, vocabulary size, parameters.
fitted_vectorizer_l1 = search_l1.best_estimator_.named_steps["featurize"]
for label, value in (
    ("best auroc: ", search_l1.best_score_),
    ("vocab size: ", len(fitted_vectorizer_l1.vocabulary_)),
    ("best params: ", search_l1.best_params_),
):
    print(label, value)

best auroc:  0.8809739583333333
vocab size:  4526
best params:  {'classify__C': 2.6826957952797246, 'classify__penalty': 'l1', 'classify__solver': 'liblinear', 'extract_text': FunctionTransformer(func=<function text_col at 0x12eddfc70>), 'featurize__lowercase': True, 'featurize__max_df': 0.15848931924611143, 'featurize__min_df': 1, 'featurize__strip_accents': 'unicode', 'featurize__token_pattern': '(?u)\\b\\w+\\b'}


In [None]:
# Train vs. validation AUROC as a function of the regularization strength C.
#
# Lines are the mean across folds; dots are the individual folds.
#
# This reads `search_l2`, not `search`:
#   * `search` was run with return_train_score=False, so it presumably has no
#     train-score entries in cv_results_ (assumes grid_search forwards that flag).
#   * `search` swept many hyperparameters at once, so its score arrays have one
#     entry per combination and cannot be plotted against C alone.
# `search_l2` varies only classify__C and keeps train scores. Point `c_search`
# at `search_l1` instead to draw the same plot for the L1 sweep.
c_search = search_l2
cv = c_search.cv_results_

# Take the x-axis from the search itself so it always lines up with the scores,
# rather than from a separately maintained grid constant.
c_grid_log = np.log10([params['classify__C'] for params in cv['params']])

val_performance = cv['mean_test_roc_auc']
train_performance = cv['mean_train_roc_auc']

# Per-fold scores, transposed to (n_C_values, n_folds) so each column plots as one series.
# n_splits_ is the fold count recorded by the fitted search (replaces a hard-coded 5).
n_folds = c_search.n_splits_
val_all_folds_performance = np.transpose([cv[f'split{k}_test_roc_auc'] for k in range(n_folds)])
train_all_folds_performance = np.transpose([cv[f'split{k}_train_roc_auc'] for k in range(n_folds)])

fig, ax = plt.subplots()
ax.set_title('C-Grid Hyperparameter Search')
(val_mean_line,) = ax.plot(c_grid_log, val_performance, 'r-')
val_fold_lines = ax.plot(c_grid_log, val_all_folds_performance, 'r.')
(train_mean_line,) = ax.plot(c_grid_log, train_performance, 'b-')
train_fold_lines = ax.plot(c_grid_log, train_all_folds_performance, 'b.')

# Explicit legend: one representative handle per series type, so the per-fold
# series do not produce duplicate entries.
ax.legend(
    [val_mean_line, val_fold_lines[0], train_mean_line, train_fold_lines[0]],
    ['validation mean', 'validation fold', 'training mean', 'train fold'],
    bbox_to_anchor=(1.5, 0.5),
)

ax.set_ylabel('AUROC')
# Raw string: '\l' is not a valid Python escape sequence.
ax.set_xlabel(r'$\log_{10} C$');