# Salary Predictions Based on Job Descriptions

# Part 1 - DEFINE

### ---- 1 Define the problem ----

Be able to predict the salaries of various job postings

In [22]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_validate
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
from sklearn.preprocessing import KBinsDiscretizer


# import sklearn as sk
#etc

#your info here
__author__ = "Jared Teerlink"
__email__ = "jteerlink@gmail.com"

## Part 2 - DISCOVER

### ---- 2 Load the data ----

In [2]:
#load the data into a Pandas dataframe
train_features = pd.read_csv('../Data/train_features.csv')
train_salaries = pd.read_csv('../Data/train_salaries.csv')
# test_features = pd.read_csv('../Data/test_features.csv')


### ---- 3 Clean the data ----

In [3]:
#look for duplicate data, invalid data (e.g. salaries <=0), or corrupt data and remove it



train_combined = train_features.merge(train_salaries, on = 'jobId', how = 'left')
train_combined = train_combined[train_combined.salary > 0]
train_combined.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163


In [4]:
train_combined.drop(labels = ['jobId','companyId','major'],inplace=True,axis=1)
                      
# train_combined = train_combined.drop(,inplace=True)




In [5]:
#Dummy Variables
# major = pd.get_dummies(train_combined['major'], prefix_sep='_', drop_first=True)

industry = pd.get_dummies(train_combined['industry'], prefix_sep='_', drop_first=True)

train_combined = pd.concat([train_combined, industry], axis=1)


# major = pd.get_dummies(train_combined['major'], prefix_sep='_', drop_first=True)

# train_combined = pd.concat([train_combined, major], axis=1)

In [6]:


label_enc = LabelEncoder()

# major_coded = label_enc.fit_transform(train_combined['major'])
# 

# industry_coded = label_enc.fit_transform(train_combined['industry'])
# company_coded = label_enc.fit_transform(train_combined['companyId'])


# train_combined['major_enc'] = major_coded

# train_combined = pd.concat([train_combined, major_coded], axis=1)
# train_combined['industry_enc'] = industry_coded

# train_combined['company_enc'] = company_coded

In [7]:
# Ordinal Encoding - degree and jobtype

degree = [['NONE', 'HIGH_SCHOOL', 'BACHELORS', 'MASTERS', 'DOCTORAL']]
degree_array = np.asarray(train_combined.degree).reshape(-1,1)



enc = OrdinalEncoder(categories=degree)
degree_ord = enc.fit_transform(degree_array)

train_combined['degree_ord'] = degree_ord
# ordered by average salary
titles = [['JANITOR', 'JUNIOR', 'SENIOR', 'MANAGER', 'VICE_PRESIDENT', 'CFO', 'CTO', 'CEO']]
titles_array = np.asarray(train_combined.jobType).reshape(-1,1)

enc = OrdinalEncoder(categories=titles)
titles_ord = enc.fit_transform(titles_array)

train_combined['jobType_ord'] = titles_ord

In [8]:


# scl = StandardScaler()

# # train_combined['yearsExperience'] = scl.fit_transform(np.asarray(train_combined['yearsExperience']).reshape(-1,1))
# train_combined['milesFromMetropolis'] = scl.fit_transform(np.asarray(train_combined['milesFromMetropolis']).reshape(-1,1))

In [10]:
poly = PolynomialFeatures(degree=2)
train_poly = poly.fit_transform(train_final)

In [9]:
#prepare final data frames

train_num = train_combined.select_dtypes(exclude = 'object')
train_final = train_num.drop(['salary'],axis=1)

train_salary = train_combined['salary']

### ---- 5 Establish a baseline ----

In [11]:


train_num_features, test_num_features, train_num_label, test_num_label = train_test_split(train_poly,train_salary, test_size = .2)

In [12]:
#select a reasonable metric (MSE in this case)
#create an extremely simple model and measure its efficacy
#e.g. use "average salary" for each industry as your model and then measure MSE
#during 5-fold cross-validation



lm = LinearRegression(n_jobs=-1)

model = lm.fit(train_num_features,train_num_label)
predict = model.predict(test_num_features)


In [13]:

mean_squared_error(test_num_label,predict)

386.3125052620002

In [None]:
410.3944048673012

In [82]:
train_num_features.describe()

Unnamed: 0,yearsExperience,milesFromMetropolis,industry_enc
count,749996.0,749996.0,749996.0
mean,11.994971,49.534714,3.001663
std,7.21277,28.878453,2.00071
min,0.0,0.0,0.0
25%,6.0,25.0,1.0
50%,12.0,50.0,3.0
75%,18.0,75.0,5.0
max,24.0,99.0,6.0


### ---- 6 Hypothesize solution ----

In [19]:
# Ridge Regression or ElasticNet to help alleviate difficulty in predicting less frequent jobtypes
# RandomForestRegressor

Brainstorm 3 models that you think may improve results over the baseline model based on your EDA and explain why they're reasonable solutions here.

Also write down any new features that you think you should try adding to the model based on your EDA, e.g. interaction variables, summary statistics for each group, etc

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validing models (steps 7-9) until you've reached your efficacy goal

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 7 Engineer features  ----

In [20]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

### ---- 8 Create models ----

In [25]:
#sklearn.linear_model.ElasticNetCV



enr = RidgeCV(cv=5)

model = enr.fit(train_num_features,train_num_label)

predict = model.predict(test_num_features)

mean_squared_error(test_num_label,predict)




386.312506303153

In [None]:
489.4892428466449

In [17]:

rf = RandomForestRegressor(n_estimators=500,n_jobs = -1)
model = rf.fit(train_num_features,train_num_label)
predict = model.predict(test_num_features)
mean_squared_error(test_num_label,predict)

498.50002597691974

In [None]:
498.50002597691974

In [13]:
rf.params

NameError: name 'rf' is not defined

In [17]:
# sklearn.ensemble.RandomForestRegressor


# from sklearn.model_selection import GridSearchCV


# Number of trees in random forest
n_estimators = [100,250,500]
# Number of features to consider at every split
max_features = ['sqrt','log2',None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(30, 400, num = 6)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [10,15,20,25,30]
# Minimum number of samples required at each leaf node
min_samples_leaf = [50,75,100,200,300,400]
warm_start = [False]

# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'warm_start': warm_start
               }
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_grid = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, cv = 3, verbose=3, n_jobs = -1,scoring='neg_mean_squared_error',n_iter=10)
# Fit the random search model
rf_grid.fit(train_num_features,train_num_label)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 33.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [100, 250, 500], 'max_features': ['sqrt', 'log2', None], 'max_depth': [30, 104, 178, 252, 326, 400, None], 'min_samples_split': [10, 15, 20, 25, 30], 'min_samples_leaf': [50, 75, 100, 200, 300, 400], 'warm_start': [False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=3)

In [18]:
rf_grid.best_params_


{'warm_start': False,
 'n_estimators': 250,
 'min_samples_split': 30,
 'min_samples_leaf': 50,
 'max_features': 'log2',
 'max_depth': 30}

In [None]:
# with dummy var

{'warm_start': False,
 'n_estimators': 500,
 'min_samples_split': 20,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 30}

In [None]:
#with label encoding only

{'warm_start': False,
 'n_estimators': 750,
 'min_samples_split': 20,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 50}

In [16]:


dtrain = xgb.DMatrix(train_num_features,label = train_num_label,nthread=-1)
# param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
# num_round = 10
bst = xgb.XGBRFRegressor(n_estimators=500,objective='reg:squarederror',n_jobs=-1)
model = bst.fit(train_num_features,train_num_label)
predict = model.predict(test_num_features)

mean_squared_error(test_num_label,predict)


  if getattr(data, 'base', None) is not None and \


717.034990487379

In [31]:
{'warm_start': False,
 'n_estimators': 250,
 'min_samples_split': 30,
 'min_samples_leaf': 50,
 'max_features': 'log2',
 'max_depth': 30}

### ---- 9 Test models ----

In [23]:
#do 5-fold cross validation on models and measure MSE


rf = RandomForestRegressor(warm_start = False, n_estimators=250, min_samples_split= 30, min_samples_leaf= 50, max_features= None, max_depth=30)

cv = cross_validate(rf,train_num_features,train_num_label, scoring = 'neg_mean_squared_error',cv = 5,n_jobs = 3)

In [24]:
cv



{'fit_time': array([544.82299995, 542.27099991, 542.41199994, 445.81500006,
        445.58399987]),
 'score_time': array([8.05800009, 8.15799999, 8.21199989, 7.31799984, 7.31800032]),
 'test_score': array([-385.93969026, -384.86865774, -387.55273776, -386.2387099 ,
        -384.72555811]),
 'train_score': array([-364.49850353, -364.59984675, -363.76759415, -364.43073224,
        -364.49207162])}

### Light GBM Model

In [22]:
lgb_train_X = lgb.Dataset(train_num_features)
lgb_train_y = lgb.Dataset(train_num_label)

In [15]:
lgbm = lgb.LGBMRegressor()

In [16]:
lgbm.get_params


<bound method LGBMModel.get_params of LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)>

In [17]:

num_round = 300

#  'min_split_gain' : [x for x in np.arange(.1,.25,.01)],
# #               'min_child_weight'  : [x for x in np.arange(.1,.25,.01)],
#               'min_child_samples' : [x for x in range(10,90,5)]
# #               'colsample_bytree'  : [x for x in np.arange(.1,.25,.01)]

In [18]:
param_grid = {'n_estimators': [x for x in range(300,600,50)],
          'num_leaves':[x for x in range(5,50,10)],
#           'max_depth':[x for x in range(20,80,10)],
          'learning_rate':[x for x in np.arange(.2,.35,.03)],
          'max_bin': [x for x in range(100,400,10)],
          'reg_alpha': [.1, .2, .3, .4, .5],
          'reg_lambda': [.1, .2, .3, .4, .5, .6, .7],
          'boosting_type': ['dart'],
          'subsample_for_bin':[x for x in range(200000,350000,10000)],
               'min_split_gain' : [x for x in np.arange(.01,.5,.05)],
              'min_child_weight'  : [x for x in np.arange(.05,.5,.05)],
              'min_child_samples' : [x for x in range(5,30,5)]
#               'colsample_bytree'  : [x for x in np.arange(1.0,2.5,.5)]
             }

In [19]:

lgb_random = RandomizedSearchCV(estimator = lgbm, 
                               param_distributions = param_grid, 
                               n_iter = 5, 
                               cv = 3, 
                               verbose=3, 
                               n_jobs = 3,
                               scoring = 'neg_mean_squared_error')
# Fit the random search model
lgb_random.fit(train_num_features,train_num_label)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed: 31.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
          fit_params=None, iid='warn', n_iter=5, n_jobs=3,
          param_distributions={'n_estimators': [300, 350, 400, 450, 500, 550], 'num_leaves': [5, 15, 25, 35, 45], 'learning_rate': [0.2, 0.23, 0.26, 0.29000000000000004, 0.32], 'max_bin': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 3...00000002, 0.2, 0.25, 0.3, 0.35000000000000003, 0.4, 0.45], 'min_child_samples': [5, 10, 15, 20, 25]},
          pre_dispatch='2*n_j

In [20]:

lgb_best = lgb_random.best_estimator_
# lgb_best.predict(X_test)
ypred = lgb_best.predict(test_num_features)


In [21]:
print(lgb_random.best_params_)
print(mean_squared_error(test_num_label,ypred))

{'subsample_for_bin': 290000, 'reg_lambda': 0.2, 'reg_alpha': 0.3, 'num_leaves': 15, 'n_estimators': 400, 'min_split_gain': 0.21000000000000002, 'min_child_weight': 0.4, 'min_child_samples': 15, 'max_bin': 360, 'learning_rate': 0.23, 'boosting_type': 'dart'}
380.23056931807747


In [None]:
{'subsample_for_bin': 280000, 'reg_lambda': 0.5, 'reg_alpha': 0.3, 'num_leaves': 25, 'n_estimators': 500, 'min_split_gain': 0.060000000000000005, 'min_child_weight': 0.05, 'min_child_samples': 25, 'max_bin': 210, 'learning_rate': 0.27, 'boosting_type': 'dart'}
353.8570277502128

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "prodcuction" model

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [16]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data