In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import pickle
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

# Load data 
Either from database or import from CSV (if available)

In [2]:
with open('/Users/jamoth/DSR/DataScienceJobs/data/SQL_access.pkl','rb') as file:
    PASSWORD = pickle.load(file)
engine = create_engine('postgresql://postgres:'+PASSWORD+'@dsj-1.c9mo6xd9bf9d.us-west-2.rds.amazonaws.com:5432/')
df = pd.read_sql("select * from all_data where language like 'en'", engine)

#df = pd.read_csv('/Users/jamoth/DSR/DataScienceJobs/data/DB_2019_11_20.csv', index_col=0)
#df = df.reset_index(drop=True)

Import the Bag-of-Word and TF-IDF models:

In [3]:
with open('/Users/jamoth/DSR/DataScienceJobs/Pickles/BOG_transform.pkl', 'rb') as file:
    BOG_transform = pickle.load(file)
    
with open('/Users/jamoth/DSR/DataScienceJobs/Pickles/TFIDF_transform.pkl', 'rb') as file:
    TFIDF_transform = pickle.load(file)

# Basic data cleaning
1. Drop NaNs: Average salary in euros will be our predicted value, so it is dropped if there is not value available. If no region, country or company is available, it will also be dropped. 
2. We only consider yearly salaries, other salaries will be dropped.
3. The remaining columns in the dataframe will be dropped as well.

In [4]:
df1 = df.dropna(subset = ['salary_average_euros','region','country','train_test_label','company'], axis=0)

df1 = df1.loc[df1.salary_type == 'yearly']

df1 = df1.drop(columns=['ref_code','url','location','posted_date','extraction_date','index','id','language','jobtype',
                        'salary','salary_low','salary_high','salary_low_euros','salary_high_euros','salary_average',
                        'currency','salary_type'], axis=1)

The data is already split in train and test set (train_test_label indicates for the corresponding rows. The indices for the train and test data in the dataframe is extracted as the BOG and TFIDF models need to be split based on those indices.

In [5]:
df1 = df1.reset_index(drop=True)
x_train = df1.loc[df1['train_test_label']=='train']
x_test = df1.loc[df1['train_test_label']=='test']

y_train = x_train['salary_average_euros']
y_test = x_test['salary_average_euros']

train_index = x_train.index
test_index = x_test.index

# One Hot Encoding
1. Select the columns, which should be one-hot-encoded: company, country, region, job_title.
2. Perform the one-hot-encoding for the training data.

In [6]:
train_enc = x_train[['job_title','company','country','region']]
enc = preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')
enc.fit(train_enc)
OHE_train = enc.transform(train_enc).toarray()

Extract the rows corresponding to the training data from the matrix of the BOG/TFIDF encoding of the job descriptions. Combine the BOG/TFIDF matrix with the one-hot-encoding matrix of the training data.

In [7]:
tfidf_train = TFIDF_transform[train_index,:].toarray()
BOG_train = BOG_transform[train_index,:].toarray()
OHE_tridf_train = np.hstack((OHE_train, tfidf_train))
OHE_BOG_train = np.hstack((OHE_train, BOG_train))

# Fit Linear Regression model

In [8]:
regr_BOG = linear_model.LinearRegression(normalize=True) #normalize=True
regr_BOG.fit(OHE_BOG_train, y_train) #training the algorithm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [9]:
regr_TFIDF = linear_model.LinearRegression() #normalize=True
regr_TFIDF.fit(OHE_tridf_train, y_train) #training the algorithm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Evaluate model on test data
1. Transform test data: map the test data to the one-hot-encoding matrix. Extract the rows corresponding to the test data from the matrix of the TFIDF encoding of the job descriptions. Combine the TFIDF matrix with the one-hot-encoding matrix of the test data.
2. Predict salary for test data
3. Compare results

In [10]:
test_enc = x_test[['job_title','company','country','region']]
OHE_test = enc.transform(test_enc).toarray()

BOG_test = BOG_transform[test_index,:].toarray()
tfidf_test = TFIDF_transform[test_index,:].toarray()
OHE_BOG_test = np.hstack((OHE_test, BOG_test))
OHE_TFIDF_test = np.hstack((OHE_test, tfidf_test))

In [11]:
y_pred_BOG = regr_BOG.predict(OHE_BOG_test)
y_pred_TFIDF = regr_TFIDF.predict(OHE_TFIDF_test)

In [12]:
evaluate_df_BOG = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_BOG.flatten()})
evaluate_df_TFIDF = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_TFIDF.flatten()})
evaluate_df_BOG.head()

Unnamed: 0,Actual,Predicted
2,62700.0,55113.780513
4,54150.0,55046.00587
10,79800.0,83065.737842
15,175000.0,120120.875223
17,25080.0,62646.79147


In [13]:
evaluate_df_TFIDF.head()

Unnamed: 0,Actual,Predicted
2,62700.0,52569.951027
4,54150.0,47709.029949
10,79800.0,90502.377295
15,175000.0,144367.209229
17,25080.0,34957.401674


In [13]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_BOG))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_BOG))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_BOG)))
print('R2 Score:', np.sqrt(metrics.r2_score(y_test,y_pred_BOG)))

Mean Absolute Error: 22269.403219120846
Mean Squared Error: 1063229454.7509742
Root Mean Squared Error: 32607.199431275512
R2 Score: 0.5719676828559899


In [14]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_TFIDF))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_TFIDF))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_TFIDF)))
print('R2 Score:', np.sqrt(metrics.r2_score(y_test, y_pred_TFIDF)))

Mean Absolute Error: 18287.33536950092
Mean Squared Error: 678731040.7128122
Root Mean Squared Error: 26052.46707536183
R2 Score: 0.7552963575080731
