# Support Vector Machine

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from time import time

## Data

In [2]:
# Read the preprocessed training set (semicolon-delimited CSV).
client_attrition = pd.read_csv('../data/preprocessed/client_attrition_train.csv', sep=";")

# "account_status" is the target; every remaining column is used as a feature.
y = client_attrition["account_status"]
X = client_attrition.drop(columns="account_status")
print(X.shape)

(10127, 37)


## Grid search
Using repeated stratified k-fold cross-validation

In [3]:
# Grid search results to df
def gs_to_df(grid_search):
    """Collect the per-candidate scores of a fitted grid search into a DataFrame.

    Parameters
    ----------
    grid_search : object
        A fitted search exposing ``cv_results_`` with the keys ``params``,
        ``mean_train_score``, ``std_train_score``, ``mean_test_score`` and
        ``std_test_score``. If it also has a ``param_grid`` attribute (a dict
        or a list of dicts), that attribute fixes the order of the parameter
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cv_results_['params']``, in the same order.
        The parameter columns come first, followed by ``mean_train_score``,
        ``std_train_score``, ``mean_test_score`` and ``std_test_score``.
        A parameter that a candidate does not set is left as NaN.

    Raises
    ------
    KeyError
        If one of the score keys is missing from ``cv_results_``.
    """
    cv_results = grid_search.cv_results_
    score_columns = ["mean_train_score", "std_train_score",
                     "mean_test_score", "std_test_score"]

    # Read the parameter names from the search object itself rather than from
    # a notebook-level `param_grid` variable: that global may not exist yet, or
    # may have been reassigned since this particular search was fitted.
    grid = getattr(grid_search, "param_grid", None)
    grids = [grid] if isinstance(grid, dict) else list(grid or [])

    # dict.fromkeys de-duplicates while keeping first-seen order; the names
    # found in the candidates themselves are appended as a fallback.
    param_columns = list(dict.fromkeys(
        [name for single_grid in grids for name in single_grid]
        + [name for params in cv_results["params"] for name in params]
    ))

    # Build every row first and create the frame once, instead of enlarging
    # the frame one row at a time.
    records = [
        {**params, **{column: cv_results[column][i] for column in score_columns}}
        for i, params in enumerate(cv_results["params"])
    ]
    return pd.DataFrame(records, columns=param_columns + score_columns)
    

In [4]:
start_time = time()

# Polynomial-kernel candidates: three values of C crossed with degrees 1 to 3.
param_grid = {
    'C': [1, 10, 100],
    'kernel': ['poly'],
    'degree': [1, 2, 3],
    'gamma': ['scale'],
}

# 10 stratified folds, repeated twice, with a fixed seed for the splits.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=12345)

# Exhaustive search scored by balanced accuracy; training scores are kept too.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="balanced_accuracy",
    n_jobs=12,
    cv=rskf,
    verbose=3,
    return_train_score=True,
)
grid_search.fit(X, y)

print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 9 candidates, totalling 180 fits
--- 328.91745352745056 seconds ---


In [5]:
# Tabulate the search results, rank them by mean test score (best first),
# write the ranking to disk and display it.
df = gs_to_df(grid_search=grid_search)
df_ranked = df.sort_values(by="mean_test_score", ascending=False)
df_ranked.to_csv('./results/svm_poly_metrics_2_10_cv.csv', sep=';', index=False)
df_ranked

Unnamed: 0,C,kernel,degree,gamma,mean_train_score,std_train_score,mean_test_score,std_test_score
5,10,poly,3,scale,0.906738,0.0028,0.764714,0.017509
8,100,poly,3,scale,0.974525,0.00139,0.759641,0.016807
6,100,poly,1,scale,0.734832,0.002834,0.729473,0.015638
3,10,poly,1,scale,0.734362,0.002717,0.729467,0.015643
0,1,poly,1,scale,0.73014,0.003233,0.726484,0.016615
2,1,poly,3,scale,0.777379,0.00269,0.702769,0.018227
7,100,poly,2,scale,0.719577,0.003934,0.686864,0.016635
4,10,poly,2,scale,0.69247,0.002999,0.662515,0.016436
1,1,poly,2,scale,0.6278,0.003394,0.608796,0.015811


Move on to higher polynomial degrees and a finer grid for C.

In [6]:
start_time = time()

# Polynomial-kernel candidates: C in {10, 50, 100} crossed with degrees 3 to 5.
param_grid = {
    'C': [10, 50, 100],
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'gamma': ['scale'],
}

# 10 stratified folds, repeated twice, with a fixed seed for the splits.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=12345)

# Exhaustive search scored by balanced accuracy; training scores are kept too.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="balanced_accuracy",
    n_jobs=12,
    cv=rskf,
    verbose=3,
    return_train_score=True,
)
grid_search.fit(X, y)

print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 9 candidates, totalling 180 fits
--- 113.11646580696106 seconds ---


In [7]:
# Tabulate the search results, rank them by mean test score (best first),
# write the ranking to disk and display it.
df = gs_to_df(grid_search=grid_search)
df_ranked = df.sort_values(by="mean_test_score", ascending=False)
df_ranked.to_csv('./results/svm_poly1_metrics_2_10_cv.csv', sep=';', index=False)
df_ranked

Unnamed: 0,C,kernel,degree,gamma,mean_train_score,std_train_score,mean_test_score,std_test_score
0,10,poly,3,scale,0.906738,0.0028,0.764714,0.017509
3,50,poly,3,scale,0.960315,0.002131,0.763585,0.016251
6,100,poly,3,scale,0.974525,0.00139,0.759641,0.016807
4,50,poly,4,scale,0.985188,0.001217,0.752185,0.016255
7,100,poly,4,scale,0.993312,0.000531,0.749607,0.01576
8,100,poly,5,scale,0.99156,0.000506,0.74754,0.01417
5,50,poly,5,scale,0.982026,0.000968,0.743944,0.012027
1,10,poly,4,scale,0.933455,0.001951,0.741555,0.012233
2,10,poly,5,scale,0.925271,0.002016,0.725866,0.015045


Let's use the RBF kernel.

In [8]:
start_time = time()

# RBF-kernel candidates: four values of C crossed with three gamma settings.
param_grid = {
    'C': [1, 10, 50, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto', 1],
}

# 10 stratified folds, repeated twice, with a fixed seed for the splits.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=12345)

# Exhaustive search scored by balanced accuracy; training scores are kept too.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="balanced_accuracy",
    n_jobs=12,
    cv=rskf,
    verbose=3,
    return_train_score=True,
)
grid_search.fit(X, y)

print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 12 candidates, totalling 240 fits
--- 913.0137739181519 seconds ---


In [9]:
# Tabulate the search results, rank them by mean test score (best first),
# write the ranking to disk and display it.
df = gs_to_df(grid_search=grid_search)
df_ranked = df.sort_values(by="mean_test_score", ascending=False)
df_ranked.to_csv('./results/svm_rbf_metrics_2_10_cv.csv', sep=';', index=False)
df_ranked

Unnamed: 0,C,kernel,gamma,mean_train_score,std_train_score,mean_test_score,std_test_score
4,10,rbf,auto,0.935821,0.002764,0.794753,0.014388
6,50,rbf,scale,0.982646,0.001009,0.794644,0.015184
7,50,rbf,auto,0.982701,0.001038,0.794644,0.016084
3,10,rbf,scale,0.935791,0.002736,0.794599,0.014382
10,100,rbf,auto,0.991285,0.000702,0.788441,0.017737
9,100,rbf,scale,0.991322,0.000704,0.788345,0.01792
0,1,rbf,scale,0.797864,0.003626,0.748049,0.01523
1,1,rbf,auto,0.797836,0.003557,0.748049,0.015266
5,10,rbf,1,1.0,0.0,0.505197,0.002961
8,50,rbf,1,1.0,0.0,0.505197,0.002961


More granular search for C

In [10]:
start_time = time()

# RBF-kernel candidates: C in {5, 12, 25, 40} with gamma 'scale' or 'auto'.
param_grid = {
    'C': [5, 12, 25, 40],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
}

# 10 stratified folds, repeated twice, with a fixed seed for the splits.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=12345)

# Exhaustive search scored by balanced accuracy; training scores are kept too.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="balanced_accuracy",
    n_jobs=12,
    cv=rskf,
    verbose=3,
    return_train_score=True,
)
grid_search.fit(X, y)

print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 8 candidates, totalling 160 fits
--- 192.1791980266571 seconds ---


In [11]:
# Tabulate the search results, rank them by mean test score (best first),
# write the ranking to disk and display it.
df = gs_to_df(grid_search=grid_search)
df_ranked = df.sort_values(by="mean_test_score", ascending=False)
df_ranked.to_csv('./results/svm_rbf1_metrics_2_10_cv.csv', sep=';', index=False)
df_ranked

Unnamed: 0,C,kernel,gamma,mean_train_score,std_train_score,mean_test_score,std_test_score
2,12,rbf,scale,0.942803,0.002335,0.795206,0.015096
3,12,rbf,auto,0.942684,0.002347,0.795205,0.015333
4,25,rbf,scale,0.966428,0.001699,0.794934,0.015023
5,25,rbf,auto,0.966442,0.001723,0.794934,0.015023
7,40,rbf,auto,0.978762,0.001318,0.794612,0.014991
6,40,rbf,scale,0.978714,0.001279,0.794334,0.014982
1,5,rbf,auto,0.901672,0.002927,0.788765,0.015845
0,5,rbf,scale,0.901642,0.002934,0.788612,0.015849


The score improves as C grows from 5 to about 12 and then levels off, so refine the search for C between 12 and 20.

In [12]:
start_time = time()

# RBF-kernel candidates: C in {12, 15, 20} with gamma 'scale' or 'auto'.
param_grid = {
    'C': [12, 15, 20],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
}

# 10 stratified folds, repeated twice, with a fixed seed for the splits.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=12345)

# Exhaustive search scored by balanced accuracy; training scores are kept too.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="balanced_accuracy",
    n_jobs=12,
    cv=rskf,
    verbose=3,
    return_train_score=True,
)
grid_search.fit(X, y)

print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 6 candidates, totalling 120 fits
--- 230.3798098564148 seconds ---


In [13]:
# Tabulate the search results, rank them by mean test score (best first),
# write the ranking to disk and display it.
df = gs_to_df(grid_search=grid_search)
df_ranked = df.sort_values(by="mean_test_score", ascending=False)
df_ranked.to_csv('./results/svm_rbf2_metrics_2_10_cv.csv', sep=';', index=False)
df_ranked

Unnamed: 0,C,kernel,gamma,mean_train_score,std_train_score,mean_test_score,std_test_score
5,20,rbf,auto,0.96022,0.001817,0.795377,0.01513
4,20,rbf,scale,0.960206,0.001849,0.795252,0.01532
0,12,rbf,scale,0.942803,0.002335,0.795206,0.015096
1,12,rbf,auto,0.942684,0.002347,0.795205,0.015333
2,15,rbf,scale,0.95099,0.001874,0.794746,0.013431
3,15,rbf,auto,0.951004,0.001873,0.794467,0.013833


Best model: C=20, kernel='rbf', gamma='auto' (mean test balanced accuracy 0.795377).