# Henry Ginder

I am trying different hyper-parameters on the SVM model.

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
def plot_roc(models, model_names, X = None, y = None):
    """Plot ROC curves, with the AUC in the legend, for several fitted classifiers.

    All curves are drawn on figure 0, which is cleared first, together with
    the dashed red diagonal of a random classifier.

    Args:
        models: fitted classifiers; each must implement ``predict_proba``.
        model_names: legend labels, indexed in parallel with ``models``.
        X: features to score. Defaults to the notebook-level
            ``X_test_featurized``.
        y: true labels for ``X``; "yes" is treated as the positive class.
            Defaults to the notebook-level ``y_test``.
    """
    # Fall back to the notebook's test split so existing two-argument calls keep working.
    if X is None:
        X = X_test_featurized
    if y is None:
        y = y_test

    plt.figure(0, figsize = [8, 7]).clf()
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    for ii, model in enumerate(models):
        # Column 1 of predict_proba is used as the score for the positive class.
        y_prob = model.predict_proba(X)[:, 1]
        # Computed once per model (the original repeated this call pair twice).
        fpr, tpr, threshold = roc_curve(y, y_prob, pos_label = "yes")
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label = "{} AUC = {:0.2f}".format(model_names[ii], roc_auc))

    plt.legend(loc = 'lower right')

Let's load and pre-process the data in the same way we did in the lecture

In [3]:
# Load the bank-marketing data, featurize it (one-hot + z-score) and fit a
# logistic-regression baseline whose predictions are scored further down.
bank = pd.read_csv("./data/bank-full.csv", sep = ";")

# separate columns by type (the target "y" is excluded from the categorical features)
num_cols = bank.select_dtypes(['integer', 'float']).columns
cat_cols = bank.select_dtypes(['object']).drop(columns = "y").columns

# split data to training and testing
X_train, X_test, y_train, y_test = train_test_split(bank.drop(columns = "y"), bank["y"], 
                                                    test_size = 0.10, random_state = 42)
# reset index so the frames built from transformed arrays below line up row-for-row
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

# one hot encode categorical columns (encoder is fitted on the training split only)
# The dense-output keyword was renamed from `sparse` to `sparse_output`; try the
# current name first and fall back so the cell runs on older scikit-learn too.
try:
    onehoter = OneHotEncoder(sparse_output = False)
except TypeError:
    onehoter = OneHotEncoder(sparse = False)
onehoter.fit(X_train[cat_cols])
onehot_cols = onehoter.get_feature_names_out(cat_cols)
X_train_onehot = pd.DataFrame(onehoter.transform(X_train[cat_cols]), columns = onehot_cols)
X_test_onehot = pd.DataFrame(onehoter.transform(X_test[cat_cols]), columns = onehot_cols)

# normalize numeric columns (scaler is fitted on the training split only)
znormalizer = StandardScaler()
znormalizer.fit(X_train[num_cols])
X_train_norm = pd.DataFrame(znormalizer.transform(X_train[num_cols]), columns = num_cols)
X_test_norm = pd.DataFrame(znormalizer.transform(X_test[num_cols]), columns = num_cols)

# concatenate one hot encoded categorical columns with normalized numeric columns
# (new frames are built instead of aliasing and mutating the one-hot frames)
X_train_featurized = pd.concat([X_train_onehot, X_train_norm], axis = 1)
X_test_featurized = pd.concat([X_test_onehot, X_test_norm], axis = 1)

del X_train_norm, X_test_norm, X_train_onehot, X_test_onehot

# logistic-regression baseline to compare the tuned SVM against
logit = LogisticRegression(max_iter = 5000, solver = 'lbfgs')
logit.fit(X_train_featurized, y_train)

y_hat_train = logit.predict(X_train_featurized)
y_hat_test = logit.predict(X_test_featurized)

print("Featurized training data has {} rows and {} columns.".format(*X_train_featurized.shape))
print("Featurized test data has {} rows and {} columns.".format(*X_test_featurized.shape))

X_train_featurized.head()

Featurized training data has 40689 rows and 51 columns.
Featurized test data has 4522 rows and 51 columns.


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_other,poutcome_success,poutcome_unknown,age,balance,day,duration,campaign,pdays,previous
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-1.124112,-0.443322,-0.099012,0.231962,0.076064,-0.411045,-0.249556
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.510135,-0.3146,-0.459566,-0.581586,-0.24489,-0.411045,-0.249556
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.227894,-0.211233,-1.300857,-0.126155,-0.565844,-0.411045,-0.249556
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.039734,0.230193,-0.699935,-0.130048,-0.565844,-0.411045,-0.249556
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.653711,0.134627,-1.421042,0.391557,-0.565844,1.216026,0.61555


There are three main ways to search the **hyper-parameter space**:

- **Grid search:** tries every combination of hyper-parameters
- **Random search:** tries a random subset of all combinations of hyper-parameters
- **Bayesian optimization:** tries a subset of all combinations of hyper-parameters (like random search) but does so in a more intelligent way, based on trading off the need to **explore** (trying a part of the hyper-parameter space thus far unexplored) and the need to **exploit** (focusing on a part of the hyper-parameter space that thus far seems promising)

In [7]:
# Grid-search space for SVC: 2 kernels x 2 C values x 3 gamma settings = 12 candidates.
# 'degree' (and the 'poly' kernel) are intentionally left out to shorten the run.
hps = dict(
    kernel = ['linear', 'rbf'],
    C = [1, 15],
    gamma = ['scale', 'auto', 0.1],
)

I removed the `degree` entry from the hyper-parameter dictionary (as well as the `'poly'` value under the `'kernel'` key) because after countless runs I never saw `'poly'` chosen, and the documentation for `SVC()` says the `degree` parameter only affects `'poly'`-kernel models. I did this to make the notebook run slightly faster.

In [8]:
# Rough kernel-cache budget per fit, in MB:
# ~3.2e9 bytes of memory / (5 folds * 12 grid candidates) / 1e6 bytes per MB.
(3.2e9) / (5 * 12) / 1e6

53.333333333333336

Approximate the kernel cache size available to each model fit in the grid search. Jupyter notebooks have about 3.2 gigabytes of memory. Divide that by the total number of models we are going to fit: 5 folds times 12 hyper-parameter combinations. Then divide by a million to get the value in megabytes, which is the unit `cache_size` expects according to the documentation.

In [10]:
# Grid search over `hps` with 5-fold cross-validation; refit=True retrains the
# best candidate on the full training set so `gscv` can predict directly.
# NOTE(review): the saved output of the fit cell reports cv=2 / 24 fits, so it was
# produced by an earlier version of this cell -- re-run to refresh it.
gscv = GridSearchCV(
    estimator = SVC(
        class_weight = 'balanced',
        cache_size = 51,
        max_iter = -1,
        probability = False,
    ),
    param_grid = hps,
    cv = 5,
    refit = True,
    verbose = 2,
)

Initialize the GridSearchCV() object. `max_iter=-1` places no limit on the number of solver iterations.

Train model with gridsearch

In [11]:
# Run the grid search. Warnings are silenced only while fitting, so later cells
# still surface their own warnings (a bare filterwarnings('ignore') would mute
# everything for the rest of the kernel session).
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    gscv.fit(X_train_featurized, y_train)

# Display the fitted search object, as the bare `gscv.fit(...)` expression did.
gscv

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] END ....................C=1, gamma=scale, kernel=linear; total time=  32.7s
[CV] END ....................C=1, gamma=scale, kernel=linear; total time=  36.2s
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time=  34.3s
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time=  33.0s
[CV] END .....................C=1, gamma=auto, kernel=linear; total time=  32.1s
[CV] END .....................C=1, gamma=auto, kernel=linear; total time=  32.6s
[CV] END ........................C=1, gamma=auto, kernel=rbf; total time=  31.7s
[CV] END ........................C=1, gamma=auto, kernel=rbf; total time=  31.5s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=  31.9s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=  32.2s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=  32.7s
[CV] END .........................C=1, gamma=0.1

GridSearchCV(cv=2, estimator=SVC(cache_size=51, class_weight='balanced'),
             param_grid={'C': [1, 15], 'gamma': ['scale', 'auto', 0.1],
                         'kernel': ['linear', 'rbf']},
             verbose=2)

In [12]:
# Best cross-validated candidate, read from the attributes GridSearchCV already
# maintains instead of searching cv_results_ by hand (the manual max()/index()
# lookup raises ValueError if any mean score is NaN).
max_index = gscv.best_index_   # position of the winning candidate in cv_results_
max_score = gscv.best_score_   # its mean cross-validated score
gscv.best_params_

{'C': 15, 'gamma': 0.1, 'kernel': 'rbf'}

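Get the precision and recall of the logistic-regression baseline (`y_hat_train` and `y_hat_test` come from the `logit` model fitted above), to compare against the best estimator from the grid search.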

In [13]:
# Precision and recall (in percent, "yes" = positive class) of the predictions
# stored in y_hat_train / y_hat_test, for the training and test splits.
precision_train, precision_test = [
    precision_score(actual, predicted, pos_label = 'yes') * 100
    for actual, predicted in ((y_train, y_hat_train), (y_test, y_hat_test))
]

recall_train, recall_test = [
    recall_score(actual, predicted, pos_label = 'yes') * 100
    for actual, predicted in ((y_train, y_hat_train), (y_test, y_hat_test))
]

print("Precision = {:.0f}% and recall = {:.0f}% on the training data.".format(precision_train, recall_train))
print("Precision = {:.0f}% and recall = {:.0f}% on the validation data.".format(precision_test, recall_test))

Precision = 65% and recall = 35% on the training data.
Precision = 63% and recall = 34% on the validation data.


In [14]:
# Training-set accuracy of the y_hat_train predictions.
accuracy_score(y_true = y_train, y_pred = y_hat_train)

0.9027009756936765

In [15]:
# Test-set accuracy of the y_hat_test predictions.
accuracy_score(y_true = y_test, y_pred = y_hat_test)

0.8949579831932774

In [16]:
# Show the estimator that the grid search refit with the best hyper-parameters.
gscv.best_estimator_

SVC(C=15, cache_size=51, class_weight='balanced', gamma=0.1)

In [17]:
# Predict both splits with the refit grid-search estimator, then report
# precision and recall (in percent, "yes" = positive class).
gscv_train_predict, gscv_test_predict = [
    gscv.predict(features) for features in (X_train_featurized, X_test_featurized)
]

precision_train, precision_test = [
    precision_score(actual, predicted, pos_label = 'yes') * 100
    for actual, predicted in ((y_train, gscv_train_predict), (y_test, gscv_test_predict))
]

recall_train, recall_test = [
    recall_score(actual, predicted, pos_label = 'yes') * 100
    for actual, predicted in ((y_train, gscv_train_predict), (y_test, gscv_test_predict))
]

print("Precision = {:.0f}% and recall = {:.0f}% on the training data.".format(precision_train, recall_train))
print("Precision = {:.0f}% and recall = {:.0f}% on the validation data.".format(precision_test, recall_test))

Precision = 66% and recall = 98% on the training data.
Precision = 49% and recall = 71% on the validation data.


In [18]:
# Training-set accuracy of the grid-search estimator's predictions.
accuracy_score(y_true = y_train, y_pred = gscv_train_predict)

0.9388532527218658

In [19]:
# Test-set accuracy of the grid-search estimator's predictions.
accuracy_score(y_true = y_test, y_pred = gscv_test_predict)

0.8721804511278195

Very cool!!

Henry Ginder