# Salary Predictions Based on Job Descriptions

## Part 1 - DEFINE

### ---- 1 Define the problem ----

The goal is to predict the salary of a job posting from the features listed for that posting.

In [1]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#your info here
__author__ = "Jared Teerlink"
__email__ = "jteerlink@gmail.com"

## Part 2 - DISCOVER

### ---- 2 Load the data ----

In [2]:
# Read the three source CSV files (training features, training salaries and
# test features) from ../Data into pandas DataFrames.
train_features, train_salaries, test_features = map(
    pd.read_csv,
    (
        '../Data/train_features.csv',
        '../Data/train_salaries.csv',
        '../Data/test_features.csv',
    ),
)


### ---- 3 Clean the data ----

In [None]:
# Attach each posting's salary to its feature row (matched on jobId), then keep
# only rows with a strictly positive salary. Rows that found no salary in the
# left join compare False against 0 and are dropped as well.
train_combined = pd.merge(train_features, train_salaries, how='left', on='jobId')
train_combined = train_combined.loc[train_combined['salary'] > 0]


In [4]:
# Count the missing values in each column of the merged training frame.
train_combined.isnull().sum()

jobId                  0
companyId              0
jobType                0
degree                 0
major                  0
industry               0
yearsExperience        0
milesFromMetropolis    0
salary                 0
dtype: int64

In [None]:
# Remove the two identifier columns in place.
train_combined.drop(columns=['jobId', 'companyId'], inplace=True)
                      


### ---- 4 Explore the data (EDA) ----

In [None]:
# Build an automated profiling report for the merged training frame.
import pandas_profiling
pandas_profiling.ProfileReport(train_combined)

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_combined.info()

In [None]:
# Scatter plot of salary (y) against distance from the metropolis (x).
plt.scatter(train_combined['milesFromMetropolis'], train_combined['salary'])

In [None]:
# Mean salary for every (jobType, degree) combination.
train_combined.groupby(by=['jobType', 'degree']).salary.mean()



In [None]:
# Preview the frame with any rows containing missing values removed.
# The result is not assigned, so train_combined itself is left unchanged.
train_combined.dropna()

In [None]:
# Show the distinct encoded degree and job-type values present in train_num.
# NOTE(review): train_num and the *_ord columns are only created further down
# in this file -- those cells have to be run before this one.
print(set(train_num['degree_ord']), set(train_num['jobType_ord']))

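# Classes assigned by reclassjob (letter label -> returned code):
# degree 2-4 and type 4-7 == A -> 5
# degree 0-1 and type 4-7 == B -> 4
# degree 2-4 and type 2-3 == C -> 3
# degree 0-1 and type 2-3 == D -> 2
# degree 2-4 and type 0-1 == E -> 1
# degree 0-1 and type 0-1 == F -> 0 (also returned for any other combination)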

def reclassjob(df):
    """Collapse the encoded degree and job type of one row into a tier 0-5.

    ``df`` is a single row (anything exposing ``degree_ord`` and
    ``jobType_ord``). The degree is split into a high band (2-4) and a low
    band (0-1); the job type into 4-7, 2-3 and 0-1. The tiers are:

        5: high degree, job type 4-7      4: low degree, job type 4-7
        3: high degree, job type 2-3      2: low degree, job type 2-3
        1: high degree, job type 0-1      0: everything else

    NOTE(review): with the ordinal encodings defined elsewhere in this file,
    degree 2-4 would be BACHELORS/MASTERS/DOCTORAL and job type 4-7 would be
    VICE_PRESIDENT/CFO/CTO/CEO -- confirm those encoders feed this frame.
    """
    degree_code = df.degree_ord
    if 2 <= degree_code <= 4:
        high_degree = True
    elif 0 <= degree_code <= 1:
        high_degree = False
    else:
        # Degree outside both bands: no tier rule can match.
        return 0

    job_code = df.jobType_ord
    if 4 <= job_code <= 7:
        return 5 if high_degree else 4
    if 2 <= job_code <= 3:
        return 3 if high_degree else 2
    if high_degree and 0 <= job_code <= 1:
        return 1
    return 0
        

In [None]:
# Tag every row with the tier computed by reclassjob; axis=1 hands the
# function one row at a time.
train_num['customclass'] = train_num.apply(reclassjob,axis=1)

In [None]:
# Summary statistics and per-column missing-value counts for the raw training
# features.
# NOTE(review): if this runs as a single notebook cell, only the last
# expression is echoed -- the describe() result would not be shown.

train_features.describe()

train_features.isnull().sum()


In [None]:
# Summary statistics for the merged training frame.
train_combined.describe()

In [None]:
# Number of postings per degree level, drawn as a bar chart.
x = train_combined.groupby('degree')['degree'].count()

plt.bar(x=x.index, height=x)

In [None]:
# Column groups used by the exploration cells.
numerical_features = [
    'yearsExperience',
    'milesFromMetropolis',
]
cat_features = [
    'jobType',
    'degree',
    'industry',
]

# One less than the number of categorical features, kept as a string.
subsize = str(len(cat_features) - 1)

In [None]:
def cat_bar(x, y):
    """Show a box plot of how many rows fall into each category of ``x``.

    x: a pandas Series; it is grouped by its own values and the per-group
       counts are what gets plotted.
    y: accepted for call compatibility but not used.
    """
    per_category = x.groupby(x)
    category_counts = per_category.count()
    plt.boxplot(category_counts)
    plt.xticks(rotation=90)
    plt.show()


# Draw the category-count plot for every categorical feature.
# The second argument is not used by cat_bar.
for i in cat_features:
    cat_bar(train_combined[i],cat_features)


In [None]:
# Per-job-type averages of the numeric columns, restricted to postings that
# pay more than 100. numeric_only=True is required because the frame still
# holds text columns (degree, major, industry), which cannot be averaged.
train_combined[train_combined.salary > 100].groupby('jobType').mean(numeric_only=True)

In [None]:
# Correlation matrix of the numeric/boolean columns only. Text columns are
# filtered out first because DataFrame.corr cannot correlate strings.
plt.matshow(train_combined.select_dtypes(include=['number', 'bool']).corr())
plt.show()

In [None]:
# Heat map of the correlations between the numeric/boolean columns. Text
# columns are filtered out first because DataFrame.corr cannot correlate strings.
sns.heatmap(train_combined.select_dtypes(include=['number', 'bool']).corr())

In [None]:
train_combined.dtypes

# Drop the encoded "major" column when it exists. The assignment that would
# create major_enc is commented out elsewhere in this file, so
# errors='ignore' keeps this cell from raising KeyError when it is absent.
train_combined = train_combined.drop(columns=['major_enc'], errors='ignore')

In [None]:
# Assemble the modelling inputs:
#   train_num    - every non-object (non-text) column
#   train_final  - train_num without the target
#   train_salary - the target column
train_num = train_combined.select_dtypes(exclude='object')
train_final = train_num.drop(columns=['salary'])
train_salary = train_combined.salary

### ---- 5 Establish a baseline ----

In [None]:
# Baseline "model": the mean salary of each job type, as a one-column frame
# indexed by jobType.
avg_salary = train_combined.groupby('jobType')[['salary']].mean()

avg_salary

In [25]:
# Attach each job type's mean salary to every training row, matching the
# jobType column against the index of avg_salary. Both frames carry a
# `salary` column; the baseline metric below reads them back as salary_x
# (from train_combined) and salary_y (the job-type mean).
train_with_avg = train_combined.merge(avg_salary,how = 'left',left_on = 'jobType', right_on=avg_salary.index)


### --Baseline Metric--

In [26]:
from sklearn.metrics import mean_squared_error

# MSE of the baseline: actual salary versus the mean salary of the job type.
mean_squared_error(train_with_avg['salary_x'], train_with_avg['salary_y'])

963.9252996562975

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validating models (steps 7-9) until you've reached your efficacy goal.

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 7 Engineer features  ----

In [None]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

In [None]:
# One-hot encode `industry` (dropping the first level) and append the
# resulting indicator columns to the training frame.
industry = pd.get_dummies(train_combined['industry'], drop_first=True)
train_combined = pd.concat(objs=[train_combined, industry], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode `industry` and store the codes in a new `industry_enc` column.
label_enc = LabelEncoder()
train_combined['industry_enc'] = industry_coded = label_enc.fit_transform(
    train_combined['industry']
)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal encoding of `degree`: the category list fixes which code each
# level receives (NONE -> 0 ... DOCTORAL -> 4).
degree = [['NONE', 'HIGH_SCHOOL', 'BACHELORS', 'MASTERS', 'DOCTORAL']]
degree_array = np.asarray(train_combined['degree'])[:, np.newaxis]
enc = OrdinalEncoder(categories=degree)
train_combined['degree_ord'] = degree_ord = enc.fit_transform(degree_array)

# Ordinal encoding of `jobType`, levels ordered by average salary
# (JANITOR -> 0 ... CEO -> 7).
titles = [['JANITOR', 'JUNIOR', 'SENIOR', 'MANAGER', 'VICE_PRESIDENT', 'CFO', 'CTO', 'CEO']]
titles_array = np.asarray(train_combined['jobType'])[:, np.newaxis]
enc = OrdinalEncoder(categories=titles)
train_combined['jobType_ord'] = titles_ord = enc.fit_transform(titles_array)

### ---- 8 Create models ----

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation (default split size).
# The fixed random_state makes the split -- and therefore every MSE computed
# from it -- reproducible from run to run.
train_num_features, test_num_features, train_num_label, test_num_label = train_test_split(
    train_final, train_salary, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# First real model: ordinary least squares fitted on the training split and
# scored by MSE on the held-out split.
lm = LinearRegression(n_jobs=-1)
model = lm.fit(train_num_features, train_num_label)
predict = model.predict(test_num_features)

mean_squared_error(test_num_label, predict)

In [None]:
from sklearn.linear_model import ElasticNetCV

# Elastic net with its regularisation strength picked by 5-fold cross
# validation, then scored by MSE on the held-out split.
enr = ElasticNetCV(cv=5, max_iter=10000, n_jobs=-1)
model = enr.fit(train_num_features, train_num_label)
predict = model.predict(test_num_features)

mean_squared_error(test_num_label, predict)




In [None]:
489.4892428466449

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 500 trees on all cores, scored by MSE on the held-out split.
rf = RandomForestRegressor(n_jobs=-1, n_estimators=500)
model = rf.fit(train_num_features, train_num_label)
predict = model.predict(test_num_features)

mean_squared_error(test_num_label, predict)

In [None]:
498.50002597691974

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Show the estimator's current hyper-parameters. scikit-learn exposes them
# through get_params(); estimators have no `params` attribute.
rf.get_params()

In [None]:
# Randomized hyper-parameter search for sklearn.ensemble.RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Number of trees in random forest
n_estimators = [250, 500, 750]
# Number of features to consider at every split. 'auto' is deliberately left
# out: for a regressor it meant "all features" (the same as None) and current
# scikit-learn releases reject it.
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in tree (None = no limit)
max_depth = [int(x) for x in np.linspace(10, 50, num=3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 6, 8]
warm_start = [True, False]

# Candidate values the random search samples from
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'warm_start': warm_start,
}
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters: 50 sampled combinations, each scored by
# negative MSE with 3-fold cross validation, using 3 parallel workers.
rf_grid = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    cv=3,
    verbose=3,
    n_jobs=3,
    scoring='neg_mean_squared_error',
    n_iter=50,
)
# Fit the random search model
rf_grid.fit(train_num_features, train_num_label)


In [None]:
# Parameter combination that scored best in the search.
rf_grid.best_params_


In [None]:
{'warm_start': False,
 'n_estimators': 500,
 'min_samples_split': 20,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 30}

In [None]:
import xgboost as xgb

# DMatrix built from the training split.
# NOTE(review): nothing in this cell consumes dtrain -- the estimator below is
# fitted directly on the frames.
dtrain = xgb.DMatrix(train_num_features, label=train_num_label, nthread=-1)

# XGBoost random-forest regressor (500 trees, squared-error objective, all
# cores), scored by MSE on the held-out split.
bst = xgb.XGBRFRegressor(objective='reg:squarederror', n_estimators=500, n_jobs=-1)
model = bst.fit(train_num_features, train_num_label)
predict = model.predict(test_num_features)

mean_squared_error(test_num_label, predict)


## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [None]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data