In [1]:
# Import required packages 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [2]:
# Import database
df = pd.read_csv('salary_data_cleaned.csv')


In [3]:
# Columns selection
df_model = df[['avg_salary','Rating','Size','Type of ownership','Industry','Sector','Revenue','num_comp','hour','employer_provided',
             'job_state','same_state','age','python','spark','aws','excel','job_simp','seniority','desc_len']]

In [4]:
# Get a dummy of each column and catenate it
df_dummies = pd.get_dummies(df_model,drop_first=True)
df_dummies.shape


(742, 170)

In [5]:
# Select the target and the rest of features
features= df_dummies.drop('avg_salary', axis =1)
target = df_dummies.avg_salary.values

In [6]:
# Import split tool 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [7]:
# Import Support Vector Regressor model
from sklearn.svm import SVR

# Fit model with 'RBF' kernel
regressor = SVR(kernel = 'rbf')

# Train model 
regressor.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Make prediction on train and test sets
train_predict = regressor.predict(X_train)
test_predict = regressor.predict(X_test)

In [9]:
# Import mean absolute error metric
from sklearn.metrics import mean_absolute_error

train_mae = mean_absolute_error(y_train,train_predict)
print(train_mae)
test_mae = mean_absolute_error(y_test, test_predict)
print(test_mae)

28.154765319710776
30.821202352893653


### Support Vector Optimization through GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','sigmoid','linear']}

grid = GridSearchCV(SVR(), param_grid, refit = True, verbose = 3, scoring='neg_mean_absolute_error')
 
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 3 folds for each of 75 candidates, totalling 225 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ........ C=0.1, gamma=1, kernel=rbf, score=-28.695, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ........ C=0.1, gamma=1, kernel=rbf, score=-29.894, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ........ C=0.1, gamma=1, kernel=rbf, score=-30.883, total=   0.1s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] .... C=0.1, gamma=1, kernel=sigmoid, score=-28.741, total=   0.1s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] .... C=0.1, gamma=1, kernel=sigmoid, score=-29.937, total=   0.1s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] .... C=0.1, gamma=1, kernel=sigmoid, score=-30.932, total=   0.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ..... C=0.1, gamma=1, kernel=linear, score=-24.989, total=   8.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ..... C=0.1, gamma=1, kernel=linear, score=-25.837, total=   8.7s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ..... C=0.1, gamma=1, kernel=linear, score=-26.615, total=   9.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ..... C=1, gamma=0.1, kernel=linear, score=-21.590, total= 6.1min
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ..... C=1, gamma=0.1, kernel=linear, score=-21.754, total= 4.7min
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ..... C=1, gamma=0.1, kernel=linear, score=-22.418, total= 5.3min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ....... C=1, gamma=0.01, kernel=rbf, score=-28.258, total=   0.1s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ....... C=1, gamma=0.01, kernel=rbf, score=-29.492, total=   0.1s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ....... C=1, gamma=0.01, kernel=rbf, score=-30.446, total=   0.1s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] ... C=1, gamma=0.01, kernel=sigmoid, score=-28.741, total=   0.1s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .

[CV] . C=10, gamma=0.001, kernel=sigmoid, score=-28.741, total=   0.1s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] . C=10, gamma=0.001, kernel=sigmoid, score=-29.937, total=   0.1s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] . C=10, gamma=0.001, kernel=sigmoid, score=-30.932, total=   0.1s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .. C=10, gamma=0.001, kernel=linear, score=-75.630, total=17.2min
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .. C=10, gamma=0.001, kernel=linear, score=-66.563, total= 5.8min
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .. C=10, gamma=0.001, kernel=linear, score=-75.361, total= 5.6min
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] .... C=10, gamma=0.0001, kernel=rbf, score=-27.180, total=   0.1s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] .

[CV]  C=100, gamma=0.0001, kernel=linear, score=-635.821, total= 8.2min
[CV] C=100, gamma=0.0001, kernel=linear ..............................
[CV]  C=100, gamma=0.0001, kernel=linear, score=-652.489, total= 6.2min
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ....... C=1000, gamma=1, kernel=rbf, score=-15.436, total=   0.1s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ....... C=1000, gamma=1, kernel=rbf, score=-17.745, total=   0.1s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ....... C=1000, gamma=1, kernel=rbf, score=-15.481, total=   0.1s
[CV] C=1000, gamma=1, kernel=sigmoid .................................
[CV] ... C=1000, gamma=1, kernel=sigmoid, score=-28.741, total=   0.1s
[CV] C=1000, gamma=1, kernel=sigmoid .................................
[CV] ... C=1000, gamma=1, kernel=sigmoid, score=-29.937, total=   0.1s
[CV] C=1000, gamma=1, kernel=sigmoid .................................
[CV]

[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed: 603.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'sigmoid', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=3)

In [11]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [12]:
# Make prediction on test set and get MAE
test_grid_predictions = grid.predict(X_test)
test_grid_mae = mean_absolute_error(y_test, test_grid_predictions)
print(test_grid_mae)

10.62269029659026
