<h1>Salary Predictions Based on Job Descriptions</h1>

'''This notebook pulls in salary data, builds and tests several predictive models,
   and then makes salary predictions on test data using the best model.'''

__author__ = "Jennifer Ma"
__email__ = "jenniferfmma@gmail.com"

<h3>Import packages</h3>

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

In [11]:
# Locations of the CSV inputs (paths are relative)
training_df_file = 'data/training_df.csv'
target_df_file = 'data/train_salaries.csv'
test_features_file = 'data/test_features.csv'

# Column groups: categorical features, numeric features, and the target
cat_cols = [
    'companyId',
    'jobType',
    'degree',
    'major',
    'industry',
]
num_cols = [
    'yearsExperience',
    'milesFromMetropolis',
]
target_col = 'salary'

def load_file(filename):
    '''Read a CSV into a pandas DataFrame.

    filename: path (or any object accepted by pd.read_csv) of the CSV to read.
    Returns the parsed DataFrame, using pd.read_csv defaults.
    '''
    frame = pd.read_csv(filename)
    return frame

# Read the three CSV inputs into DataFrames: training rows, training salaries, test rows
training_df = load_file(filename = training_df_file)
target_df = load_file(filename = target_df_file)
test_df = load_file(filename = test_features_file)

In [12]:
# Keep only rows with a strictly positive salary; the non-positive
# salaries were spotted during EDA and are dropped here.
clean_target_df = target_df.loc[target_df['salary'] > 0]

# Preview the first rows of the filtered target frame
clean_target_df.head()

Unnamed: 0,jobId,salary
0,JOB1362684407687,130
1,JOB1362684407688,101
2,JOB1362684407689,137
3,JOB1362684407690,142
4,JOB1362684407691,163


## Part 3 - DEVELOP MODEL

Iterate over feature engineering, model creation and tuning, and model validation (steps 7-9) until the efficacy goal of a mean squared error (MSE) below 360 is reached.

### 7 : Engineer features

In [None]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

In [5]:
# Before modeling, the categorical variables need to be encoded for both the training and test data

def one_hot_encoding_df(df, cat_vars = None, num_vars = None):
    '''Perform one-hot encoding on categorical variables and combine with numerical variables

    df: source DataFrame; both the categorical and the numeric columns are read from it.
    cat_vars: column label(s) to one-hot encode with pd.get_dummies (None = no categorical part).
    num_vars: column label(s) to coerce with pd.to_numeric (None = no numeric part).

    Returns a new DataFrame with the dummy columns first and the numeric columns last.
    If both cat_vars and num_vars are None, an empty frame sharing df's index is returned.

    Note: every frame is encoded on its own, so two frames only end up with the same
    dummy columns when they contain the same set of category values.
    '''
    frames = []
    if cat_vars is not None:
        # Encode the frame that was passed in, not a module-level frame, so that
        # the test data gets its own dummies instead of a copy of the training ones.
        frames.append(pd.get_dummies(df[cat_vars]))
    if num_vars is not None:
        frames.append(df[num_vars].apply(pd.to_numeric))
    if not frames:
        return pd.DataFrame(index = df.index)
    return pd.concat(frames, axis = 1)

# Build the model-ready feature frames for the training and the test data
# using the shared categorical / numeric column lists.
training_feat_df = one_hot_encoding_df(training_df, cat_cols, num_cols)
test_feat_df = one_hot_encoding_df(test_df, cat_cols, num_cols)

Unnamed: 0,companyId_COMP0,companyId_COMP1,companyId_COMP10,companyId_COMP11,companyId_COMP12,companyId_COMP13,companyId_COMP14,companyId_COMP15,companyId_COMP16,companyId_COMP17,...,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB,yearsExperience,milesFromMetropolis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,10,83
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,3,73
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,10,38
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,8,17
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,8,16


In [8]:
# Summarize the encoded training features (index range, column count, dtypes, non-null counts)
training_feat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999995 entries, 0 to 999994
Data columns (total 94 columns):
companyId_COMP0           999995 non-null uint8
companyId_COMP1           999995 non-null uint8
companyId_COMP10          999995 non-null uint8
companyId_COMP11          999995 non-null uint8
companyId_COMP12          999995 non-null uint8
companyId_COMP13          999995 non-null uint8
companyId_COMP14          999995 non-null uint8
companyId_COMP15          999995 non-null uint8
companyId_COMP16          999995 non-null uint8
companyId_COMP17          999995 non-null uint8
companyId_COMP18          999995 non-null uint8
companyId_COMP19          999995 non-null uint8
companyId_COMP2           999995 non-null uint8
companyId_COMP20          999995 non-null uint8
companyId_COMP21          999995 non-null uint8
companyId_COMP22          999995 non-null uint8
companyId_COMP23          999995 non-null uint8
companyId_COMP24          999995 non-null uint8
companyId_COMP25         

### 8 : Create and Test Models

In [None]:
# Create and tune the models brainstormed in part 2.
# Everything starts out empty: one list and three dictionaries.
models = list()
mean_mse = dict()
cv_std = dict()
res = dict()


In [None]:
# Create Linear Regression Model
linReg = LinearRegression()

# Regress on the salary column only: clean_target_df also carries the string
# jobId column, which is an identifier and cannot be used as a numeric target.
target_salaries = clean_target_df[target_col]

linReg_model = linReg.fit(training_feat_df, target_salaries)

# Disabled alternative: the same model behind a StandardScaler
#pipeline = make_pipeline(StandardScaler(), LinearRegression())
#pipeline.fit(training_feat_df, target_salaries)

# do 3-fold cross validation on models and measure MSE
# (the scorer returns negated MSE values, hence the sign flip below)
neg_mse = cross_val_score(linReg, training_feat_df, target_salaries, scoring = 'neg_mean_squared_error', cv = 3)

# Store the scores under the model's name instead of rebinding mean_mse / cv_std,
# so the per-model result dictionaries stay usable for the other models.
mean_mse['LinearRegression'] = -1.0*np.mean(neg_mse)
cv_std['LinearRegression'] = np.std(neg_mse)

In [None]:
# Create Random Forest Regression Model
# random_state is fixed so that the bootstrap samples (and therefore the
# reported scores) are the same on every Restart & Run All.
rf = RandomForestRegressor(random_state = 42)

In [None]:
# Create Gradient Boosted Regression Model
# The estimator imported from sklearn.ensemble is GradientBoostingRegressor;
# random_state is fixed so repeated runs give the same model.
gbr = GradientBoostingRegressor(random_state = 42)

In [None]:
# Use cross val score to evaluate MSE of each model and print a summary for each one

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "production" model

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [None]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders