In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
import model_encodings_jm as e

# Import Data from Database
1. Only take the data with English job descriptions (BOG and TF-IDF require all job descriptions to be in the same language)
2. Drop rows that contain NaNs in the important columns (the columns that we will use as features, plus the salary target and the train/test label)
3. Only consider yearly salaries (monthly or weekly salaries would distort the results)

In [2]:
# Read the database password from a local pickle file.
# NOTE(review): this is a hard-coded absolute path on one developer's machine,
# and pickle.load can execute arbitrary code, so only use it on a trusted file.
with open('/Users/jamoth/DSR/DataScienceJobs/data/SQL_access.pkl','rb') as credentials_file:
    PASSWORD = pickle.load(credentials_file)

# The password is concatenated straight into the connection URL.
# NOTE(review): a password containing URL-reserved characters would need
# escaping before being placed here -- confirm this is not the case.
connection_url = 'postgresql://postgres:'+PASSWORD+'@dsj-1.c9mo6xd9bf9d.us-west-2.rds.amazonaws.com:5432/'
engine = create_engine(connection_url)

# Pull only the rows whose language column matches 'en'.
df = pd.read_sql("select * from all_data where language like'en'", engine)

# Rows must have a value in every one of these columns to be kept.
required_columns = ['salary_average_euros','region','country','train_test_label','company','description']

# Drop incomplete rows, then keep only yearly salaries.
df1 = (
    df
    .dropna(subset=required_columns, axis=0)
    .loc[lambda frame: frame.salary_type == 'yearly']
)

# Splitting the data into Train, Validation and Test sets
1. Reset the index of the dataframe (as we dropped some rows in our previous steps)
2. Split the data based on the assignment in the 'train_test_label' column from the database.
3. Split the Train Set into Train and Validation set
4. Track the index for each data set.

In [4]:
# Renumber rows 0..n-1 so the indices tracked below are contiguous.
df1 = df1.reset_index(drop=True)

# The database already labels every row as 'train' or 'test'.
is_train_row = df1['train_test_label'] == 'train'
is_test_row = df1['train_test_label'] == 'test'

df_train = df1.loc[is_train_row]
x_test = df1.loc[is_test_row]

# The target is the average salary in euros.
df_train_y = df_train['salary_average_euros']
y_test = x_test['salary_average_euros']

# Hold out 20% of the labelled train rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    df_train,
    df_train_y,
    test_size=0.2,
    random_state=42,
)

# Remember which rows of df1 ended up in which split.
train_index, val_index, test_index = x_train.index, x_val.index, x_test.index

# One Hot Encoding
1. Define which columns should be One-Hot-Encoded: Company, Country and Region
2. Fit the One-Hot-Encoding model only on the Train set
3. Transform the Train, Validation and Test data using the One-Hot-Encoding model

Result: Numpy arrays

In [5]:
# Categorical columns that are one-hot encoded.
columns_to_ohe_encode = ['company','country','region']
train_enc = x_train[columns_to_ohe_encode]
val_enc = x_val[columns_to_ohe_encode]
test_enc = x_test[columns_to_ohe_encode]

# Fit the encoder on the train split only, so no information from the
# validation/test splits leaks into the encoding. With handle_unknown='ignore',
# categories not seen during fit are encoded as all-zero columns instead of
# raising an error at transform time.
enc = preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')
enc.fit(train_enc)

# Names of the generated one-hot columns, in output column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(enc, 'get_feature_names_out'):
    col_headings = enc.get_feature_names_out(columns_to_ohe_encode)
else:
    col_headings = enc.get_feature_names(columns_to_ohe_encode)

# Encode every split with the train-fitted encoder and densify the result.
OHE_train = enc.transform(train_enc).toarray()
OHE_val = enc.transform(val_enc).toarray()
OHE_test = enc.transform(test_enc).toarray()

# The OHE_* results are plain NumPy arrays without column labels;
# col_headings[i] is the name of column i in each of them.

# Bag of Words (BOG) Encoding
1. Fit the BOG model with the Train set
2. Transform Train, Validation and Test set using the BOG model

Result: Numpy arrays

In [6]:
# Fit the Bag-of-Words model on the train split only.
# NOTE(review): encode_BOG comes from the project module model_encodings_jm;
# the returned object is only assumed to offer .transform() as used below.
BOG_model = e.encode_BOG(x_train, min_df=3)

# Encode the job descriptions of each split (train, validation, test, in that
# order) with the train-fitted model and convert the result to dense arrays.
BOG_train, BOG_val, BOG_test = (
    BOG_model.transform(split['description']).toarray()
    for split in (x_train, x_val, x_test)
)

# The BOG_* results carry no column labels.

Selected only English job descriptions...

Performed some basic text cleaning...

Trained Bag-Of-Words model...



# Term Frequency – Inverse Document Frequency (TF-IDF) Encoding
1. Fit the TF-IDF model with the Train set
2. Transform Train, Validation and Test set using the TF-IDF model

Result: Numpy arrays

In [7]:
# Fit the TF-IDF model on the train split only.
# NOTE(review): encode_TFIDF comes from the project module model_encodings_jm;
# the returned object is only assumed to offer .transform() as used below.
TFIDF_model = e.encode_TFIDF(x_train, min_df=3)

# Same encoding step for every split: transform the descriptions, then densify.
tfidf_encode = lambda split: TFIDF_model.transform(split['description']).toarray()

TFIDF_train = tfidf_encode(x_train)
TFIDF_val = tfidf_encode(x_val)
TFIDF_test = tfidf_encode(x_test)

# The TFIDF_* results carry no column labels.

Selected only English job descriptions...

Performed some basic text cleaning...

Trained TF-IDF model...



# Encoding of Technical Terms from Job description
1. Load dictionary with technical terms from Pickle file
2. Select the categories, which should be included
3. Create a list with all technical terms
4. Extract a list of technical terms that occur in the job description (for Train, Validation and Test set)
5. Fit a MultiLabelBinarizer model using the Train set
6. Transform Train, Validation and Test set using the MultiLabelBinarizer model

Result: Numpy arrays

In [9]:
# Load the dictionary of technical terms (category name -> collection of terms).
tech_dict = pd.read_pickle('Pickles/broad_tech_dictionary.pickle')
categories_to_include = ['front_end-technologies', 'databases', 'software-infrastructure-devops','data-science','software_architecture', 'web_design','tools','cyber_security','cloud_computing','back_end-technologies', 'mobile']

# Flatten the selected categories into one list of terms. Positional indexing
# is kept from the original because the container type stored in the pickle
# is not visible here.
tech_list = [
    tech_dict[category][position]
    for category in categories_to_include
    for position in range(len(tech_dict[category]))
]

# Lower-case and de-duplicate the terms. The result is sorted because the
# iteration order of a set of strings changes between interpreter sessions
# (hash randomisation), and this list fixes the column order of the TECH_*
# matrices below; sorting makes that order reproducible across runs.
important_terms = sorted({term.lower() for term in tech_list})

# Extract the terms found in each job description.
# NOTE(review): e.tech_process is a project helper; presumably it returns the
# subset of important_terms present in the text -- confirm.
tech_terms_train = x_train['description'].apply(e.tech_process,args=(important_terms,))
tech_terms_val = x_val['description'].apply(e.tech_process,args=(important_terms,))
tech_terms_test = x_test['description'].apply(e.tech_process,args=(important_terms,))

# One binary column per term, in the order given by important_terms.
mlb = MultiLabelBinarizer(classes = important_terms)
mlb.fit(tech_terms_train)
TECH_train = mlb.transform(tech_terms_train)
TECH_val = mlb.transform(tech_terms_val)
TECH_test = mlb.transform(tech_terms_test)

Check the shape of each encoding matrix:
1. OHE matrix -> encoding Company, Country, Region
2. TFIDF matrix -> Encoding job descriptions
3. TECH matrix -> Encoding technical terms from the job description

In [10]:
# (rows, columns) of each train encoding, one per line: OHE, TF-IDF, TECH.
print(OHE_train.shape, TFIDF_train.shape, TECH_train.shape, sep='\n')

(4104, 1553)
(4104, 7568)
(4104, 1433)


If the encoding matrices have the same number of rows, you can stack them horizontally to create the feature matrix:

In [11]:
# Build each feature matrix by joining the OHE, TF-IDF and TECH encodings
# column-wise (axis=1); the row count is unchanged and columns are appended
# in that order.
X_Train = np.concatenate([OHE_train, TFIDF_train, TECH_train], axis=1)
X_Val = np.concatenate([OHE_val, TFIDF_val, TECH_val], axis=1)
X_Test = np.concatenate([OHE_test, TFIDF_test, TECH_test], axis=1)

In [12]:
# Shapes of the train feature matrix and its target vector, one per line.
print(X_Train.shape, y_train.shape, sep='\n')

(4104, 10554)
(4104,)
