In [10]:
import pandas as pd

# get embedding vectors - each row is a job posting, each column is a feature
embeddings = pd.read_csv('text_embeddings.csv', index_col=0)

all_postings = pd.read_csv('processed_description.csv', index_col=0)

text = all_postings['processed_description']
salary_bins = all_postings['salary_bin']
salary_ranges = all_postings['salary_range']





## Split train, validation, and test set: 


In [11]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(embeddings, salary_bins, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc

model = LogisticRegression(max_iter=1000, C = 1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)



print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('AUROC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))





Accuracy: 0.5838951310861423
F1: 0.5801127152649272
AUROC: 0.8205238544057819


## Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=20)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.6209737827715356
F1: 0.6108050157548137
ROC AUC: 0.8449678741548263


## XGBoost

In [26]:
import xgboost as xgb

xgb = xgb.XGBClassifier(objective='multi:softmax', num_class=5, max_depth=20, n_estimators=100)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.6262172284644195
F1: 0.6227912112059231
ROC AUC: 0.8498956393466891


## Feed Forward Network

In [28]:
#  feedforward
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,50) , max_iter=1000, learning_rate='adaptive', )
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.601123595505618
F1: 0.6018189006020477
ROC AUC: 0.8058940526022109


## Dense Neural Network

In [142]:
import torch
import torch.nn as nn
import torch.optim as optim

class NN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers=200):
        super(NN, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        self.layers.append(nn.ReLU())
        for i in range(hidden_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append((nn.ReLU()))
        self.layers.append(nn.Linear(hidden_dim, output_dim))
  
        

    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    


In [143]:
# create train and test loaders
from torch.utils.data import TensorDataset, DataLoader

X_train_tensor = torch.tensor(X_train.values).float()
y_train_tensor = torch.tensor(y_train.values).float()
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

X_test_tensor = torch.tensor(X_test.values).float()
y_test_tensor = torch.tensor(y_test.values).float()
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)




In [144]:
device = 'mps'
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve
from torch.nn import functional as F
def train(epoch, model, optimizer, verbose=False):
    model.train()
    epoch_loss = 0
    losses = []
    for batch_idx, (data, target) in enumerate(train_loader):
        # send data to device, where the "device" is either a GPU if it exists or a CPU
        data, target = data.to(device), target.to(device)
        
       
        optimizer.zero_grad()
        # forward pass through the model
        output = model(data)
        # forward pass through the cross-entropy loss function
        loss = F.cross_entropy(output, target)
        # backward pass through the cross-entropy loss function and the model
        loss.backward()
        
        optimizer.step()
        if batch_idx % 50 == 0:
            losses.append(loss.detach())
            if verbose :
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
    return losses

def test(model, verbose=False):
    model.eval()
    accuracy_list = []
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            # send data to device, where the "device" is either a GPU if it exists or a CPU
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item() # sum up batch loss                                                               
            pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability      
                                                                       
            correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()

        test_loss /= len(test_loader.dataset)
        accuracy = 100. * correct / len(test_loader.dataset)
        accuracy_list.append(accuracy) 
        if verbose :
            print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                test_loss, correct, len(test_loader.dataset),
                accuracy))
    return test_loss
        
    
  



In [220]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

hidden_dim = 100

hidden_layers = 25
input_dim = embeddings.shape[1]
output_dim = 4
model = NN(input_dim, hidden_dim, output_dim, hidden_layers)

optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)    

for epoch in range(0, 10):
    train(epoch, model, optimizer, verbose=True)
    test(model, verbose=True)









Test set: Average loss: 1.3317, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3297, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3305, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3352, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3301, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3300, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3294, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3299, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3294, Accuracy: 1019/2670 (38%)


Test set: Average loss: 1.3327, Accuracy: 1019/2670 (38%)



### LSTM Model (RNN Variant)
Long Short-Term Memory layer in a neural netowrk model perform well in tasks involve learning long-term dependencies in sequential data, addressing limitation of a traditional RNN.

In [178]:
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out
    


In [221]:
embedding_dim = embeddings.shape[1]
hidden_dim = 100
output_dim = 4
num_layers = 2

model = LSTMModel(embedding_dim, hidden_dim, num_layers, output_dim)

optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)

for epoch in range(0, 10):
    train(epoch, model, optimizer, verbose=True)
    test(model, verbose=True)
    


Test set: Average loss: 1.0279, Accuracy: 1393/2670 (52%)


Test set: Average loss: 0.9725, Accuracy: 1492/2670 (56%)


Test set: Average loss: 0.9520, Accuracy: 1513/2670 (57%)


Test set: Average loss: 0.9401, Accuracy: 1529/2670 (57%)


Test set: Average loss: 0.9339, Accuracy: 1570/2670 (59%)


Test set: Average loss: 0.9229, Accuracy: 1576/2670 (59%)


Test set: Average loss: 0.9197, Accuracy: 1572/2670 (59%)


Test set: Average loss: 0.9110, Accuracy: 1594/2670 (60%)


Test set: Average loss: 0.8943, Accuracy: 1635/2670 (61%)


Test set: Average loss: 0.9221, Accuracy: 1599/2670 (60%)

