In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
from sklearn import metrics

In [2]:
with open('/Users/jamoth/DSR/DataScienceJobs/data/SQL_access.pkl','rb') as file:
    PASSWORD = pickle.load(file)
engine = create_engine('postgresql://postgres:'+PASSWORD+'@dsj-1.c9mo6xd9bf9d.us-west-2.rds.amazonaws.com:5432/')
df = pd.read_sql("select * from all_data where language like 'en'", engine)

with open('/Users/jamoth/DSR/DataScienceJobs/Pickles/BOG_transform.pkl', 'rb') as file:
    BOG_transform = pickle.load(file)
    
with open('/Users/jamoth/DSR/DataScienceJobs/Pickles/TFIDF_transform.pkl', 'rb') as file:
    TFIDF_transform = pickle.load(file)

In [3]:
df1 = df.dropna(subset = ['salary_average_euros','region','country','train_test_label','company'], axis=0)

df1 = df1.loc[df1.salary_type == 'yearly']

df1 = df1.drop(columns=['ref_code','url','location','posted_date','extraction_date','index','id','language','jobtype',
                        'salary','salary_low','salary_high','salary_low_euros','salary_high_euros','salary_average',
                        'currency','salary_type'], axis=1)

In [4]:
df1 = df1.reset_index(drop=True)
x_train = df1.loc[df1['train_test_label']=='train']
x_test = df1.loc[df1['train_test_label']=='test']

y_train = x_train['salary_average_euros']
y_test = x_test['salary_average_euros']

train_index = x_train.index
test_index = x_test.index

In [5]:
train_enc = x_train[['job_title','company','country','region']]
enc = preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')
enc.fit(train_enc)
OHE_train = enc.transform(train_enc).toarray()

In [6]:
tfidf_train = TFIDF_transform[train_index,:].toarray()
BOG_train = BOG_transform[train_index,:].toarray()
OHE_tridf_train = np.hstack((OHE_train, tfidf_train))
OHE_BOG_train = np.hstack((OHE_train, BOG_train))

In [7]:
X_train, X_val, Y_train, Y_val = train_test_split(OHE_tridf_train, y_train, test_size=0.33, random_state=42)
#X_train, X_val, Y_train, Y_val = train_test_split(OHE_BOG_train, y_train, test_size=0.33, random_state=42)

In [9]:
params = {
    # Learning Task Parameters
    'objective': 'reg:squarederror',
    'eval_metric':'rmse', # Evaluation metrics for validation data
    # Parameters for Tree Booster
    'learning_rate': 0.05, # Learning Rate: step size shrinkage used to prevent overfitting.
    # Paramters for XGB ScikitLearn API
    'n_jobs': 4, # Number of parallel threads used to run xgboost
    'n_estimators': 1000, # number of trees you want to build
    'verbosity': 1, # degree of verbosity: 0 (silent) - 3 (debug)
    'max_depth': 4,
    'reg_lambda': 1,
    'colsample_bytree': 0.5,
    'subsample': 0.9
}
fit_params = {
    'eval_metric':'rmse',
    'early_stopping_rounds': 3,
    'eval_set': [(X_val, Y_val)],
}
xgb_reg = xgb.XGBRegressor(**params)


In [10]:
xgb_reg.fit(X_train, Y_train, **fit_params)

  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:69699.7
Will train until validation_0-rmse hasn't improved in 3 rounds.
[1]	validation_0-rmse:66742
[2]	validation_0-rmse:63942.5
[3]	validation_0-rmse:61399.1
[4]	validation_0-rmse:58894.9
[5]	validation_0-rmse:56579
[6]	validation_0-rmse:54638.7
[7]	validation_0-rmse:52647.5
[8]	validation_0-rmse:50667.2
[9]	validation_0-rmse:48815
[10]	validation_0-rmse:47116.1
[11]	validation_0-rmse:45619.3
[12]	validation_0-rmse:44300.3
[13]	validation_0-rmse:42939.9
[14]	validation_0-rmse:41608
[15]	validation_0-rmse:40429.3
[16]	validation_0-rmse:39475.8
[17]	validation_0-rmse:38407.7
[18]	validation_0-rmse:37418.1
[19]	validation_0-rmse:36479.8
[20]	validation_0-rmse:35752.6
[21]	validation_0-rmse:35107.4
[22]	validation_0-rmse:34291.1
[23]	validation_0-rmse:33569.4
[24]	validation_0-rmse:32943.7
[25]	validation_0-rmse:32396.1
[26]	validation_0-rmse:31857.2
[27]	validation_0-rmse:31322
[28]	validation_0-rmse:30863.7
[29]	validation_0-rmse:30386.2
[30]	validation_0-rmse:299

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eval_metric='rmse',
             gamma=0, importance_type='gain', learning_rate=0.05,
             max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
             n_estimators=1000, n_jobs=4, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.9, verbosity=1)

In [12]:
df_DM = xgb.DMatrix(data=X_train, label=Y_train)

In [13]:
def xgb_evaluate(max_depth, reg_lambda, colsample_bytree, subsample):
    params1 = {
        'colsample_bytree': colsample_bytree,
        'max_depth': int(round(max_depth)),  # Maximum depth of a tree: high value
                                             # -> prone to overfitting
        'reg_lambda': reg_lambda,  # L2 regularization term on weights
        'subsample': subsample
    }
    cv_result = xgb.cv(dtrain=df_DM,
                       params=params1,
                       early_stopping_rounds=10,
                       num_boost_round=100,
                       metrics='rmse')
    return -cv_result['test-rmse-mean'].iloc[-1]

In [14]:
optimizer = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 5),
                                                'reg_lambda': (0, 5),
                                                'colsample_bytree': (0.3, 0.8),
                                                'subsample': (0.8, 1)})

In [15]:
optimizer.maximize(init_points=2, n_iter=10)

|   iter    |  target   | colsam... | max_depth | reg_la... | subsample |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.768e+0[0m | [0m 0.5901  [0m | [0m 3.165   [0m | [0m 1.531   [0m | [0m 0.8605  [0m |
| [0m 2       [0m | [0m-2.816e+0[0m | [0m 0.3222  [0m | [0m 4.607   [0m | [0m 0.6052  [0m | [0m 0.9958  [0m |
| [95m 3       [0m | [95m-2.764e+0[0m | [95m 0.5897  [0m | [95m 3.871   [0m | [95m 5.0     [0m | [95m 0.9594  [0m |
| [0m 4       [0m | [0m-2.807e+0[0m | [0m 0.6512  [0m | [0m 4.825   [0m | [0m 2.943   [0m | [0m 0.9636  [0m |
| [0m 5       [0m | [0m-2.785e+0[0m | [0m 0.5711  [0m | [0m 4.08    [0m | [0m 2.285   [0m | [0m 0.8999  [0m |
| [0m 6       [0m | [0m-2.824e+0[0m | [0m 0.3359  [0m | [0m 4.653   [0m | [0m 1.457   [0m | [0m 0.9634  [0m |
| [0m 7       [0m | [0m-2.785e+0[0m | [0m 0.5228  [0m | [0m 4.288   [0m | [0m 3.313   [0m | [0m 0.9074 

In [16]:
params_1 = optimizer.max['params']
params_1['max_depth'] = int(round(params_1['max_depth']))
params.update(params_1)

In [17]:
xgb_reg = xgb.XGBRegressor(**params)
xgb_reg.fit(X_train, Y_train, **fit_params)

  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:69803
Will train until validation_0-rmse hasn't improved in 3 rounds.
[1]	validation_0-rmse:67007
[2]	validation_0-rmse:64244.1
[3]	validation_0-rmse:61724.6
[4]	validation_0-rmse:59193.6
[5]	validation_0-rmse:56866
[6]	validation_0-rmse:54970.4
[7]	validation_0-rmse:53000.9
[8]	validation_0-rmse:51131.6
[9]	validation_0-rmse:49260.7
[10]	validation_0-rmse:47554.4
[11]	validation_0-rmse:46053.6
[12]	validation_0-rmse:44724.7
[13]	validation_0-rmse:43339
[14]	validation_0-rmse:41988.2
[15]	validation_0-rmse:40821.2
[16]	validation_0-rmse:39869.1
[17]	validation_0-rmse:38861.9
[18]	validation_0-rmse:37842.7
[19]	validation_0-rmse:36892.3
[20]	validation_0-rmse:36139.8
[21]	validation_0-rmse:35467.8
[22]	validation_0-rmse:34679.6
[23]	validation_0-rmse:33979.2
[24]	validation_0-rmse:33325
[25]	validation_0-rmse:32777.1
[26]	validation_0-rmse:32310.8
[27]	validation_0-rmse:31731.5
[28]	validation_0-rmse:31273.6
[29]	validation_0-rmse:30775.5
[30]	validation_0-rmse:303

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4307381843437827,
             eval_metric='rmse', gamma=0, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, n_estimators=1000, n_jobs=4,
             nthread=None, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=2.9229946338712667, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.846885846460324, verbosity=1)

In [20]:
pickle.dump(xgb_reg, open("/Users/jamoth/DSR/DataScienceJobs/Pickles/xgb_model_tfidf.pkl", "wb"))

In [23]:
test_enc = x_test[['job_title','company','country','region']]
OHE_test = enc.transform(test_enc).toarray()

tfidf_test = TFIDF_transform[test_index,:].toarray()
OHE_tridf_test = np.hstack((OHE_test, tfidf_test))

y_pred = xgb_reg.predict(OHE_tridf_test)

In [25]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2 Score:', np.sqrt(metrics.r2_score(y_test, y_pred)))

Mean Absolute Error: 18586.36870535714
Mean Squared Error: 683127548.5643677
Root Mean Squared Error: 26136.708831916225
R2 Score: 0.7534522588897914
