In [222]:
import pandas as pd

# get embedding vectors - each row is a job posting, each column is a feature
embeddings = pd.read_csv('text_embeddings.csv', index_col=0)

all_postings = pd.read_csv('processed_description.csv', index_col=0)

text = all_postings['processed_description']
salary_bins = all_postings['salary_bin']
salary_ranges = all_postings['salary_range']





## Split train, validation, and test set: 


In [223]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(embeddings, salary_bins, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

## Logistic Regression

In [224]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve, auc

model = LogisticRegression(max_iter=1000, C = 1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)



print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('AUROC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))





Accuracy: 0.5913857677902622
F1: 0.5878727474522593
AUROC: 0.8310688877372037


## Random Forest Classifier

In [225]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=20)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.6232209737827715
F1: 0.6154045287037953
ROC AUC: 0.8575517089215852


## XGBoost

In [230]:
import xgboost as xgb

xgb = xgb.XGBClassifier(objective='multi:softmax', num_class=5, max_depth=20, n_estimators=100)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.6333333333333333
F1: 0.6307232785335142
ROC AUC: 0.8663795074185793


## Feed Forward Network

In [None]:
#  feedforward
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,50) , max_iter=1000, learning_rate='adaptive')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='weighted'))
print('ROC AUC:', roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))



Accuracy: 0.601123595505618
F1: 0.6018189006020477
ROC AUC: 0.8058940526022109


## Dense Neural Network

In [142]:
import torch
import torch.nn as nn
import torch.optim as optim

class NN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers=200):
        super(NN, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        self.layers.append(nn.ReLU())
        for i in range(hidden_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append((nn.ReLU()))
        self.layers.append(nn.Linear(hidden_dim, output_dim))
  
        

    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    


In [None]:
# create train and test loaders
from torch.utils.data import TensorDataset, DataLoader

X_train_tensor = torch.tensor(X_train.values).float()
y_train_tensor = torch.tensor(y_train.values).float()
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

X_test_tensor = torch.tensor(X_test.values).float()
y_test_tensor = torch.tensor(y_test.values).float()
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)




In [None]:
device = 'mps'
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve
from torch.nn import functional as F
def train(epoch, model, optimizer, verbose=False):
    model.train()
    epoch_loss = 0
    losses = []
    for batch_idx, (data, target) in enumerate(train_loader):
        # send data to device, where the "device" is either a GPU if it exists or a CPU
        data, target = data.to(device), target.to(device)
        
       
        optimizer.zero_grad()
        # forward pass through the model
        output = model(data)
        # forward pass through the cross-entropy loss function
        loss = F.cross_entropy(output, target)
        # backward pass through the cross-entropy loss function and the model
        loss.backward()
        
        optimizer.step()
        if batch_idx % 50 == 0:
            losses.append(loss.detach())
            if verbose :
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
    return losses

def test(model, verbose=False):
    model.eval()
    accuracy_list = []
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            # send data to device, where the "device" is either a GPU if it exists or a CPU
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item() # sum up batch loss                                                               
            pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability      
                                                                       
            correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()

        test_loss /= len(test_loader.dataset)
        accuracy = 100. * correct / len(test_loader.dataset)
        accuracy_list.append(accuracy) 
        if verbose :
            print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                test_loss, correct, len(test_loader.dataset),
                accuracy))
    return test_loss
        
    
  



In [238]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

hidden_dim = 50

hidden_layers = 4
input_dim = embeddings.shape[1]
output_dim = 4
model = NN(input_dim, hidden_dim, output_dim, hidden_layers)

optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)    

for epoch in range(0, 10):
    train(epoch, model, optimizer, verbose=True)
    test(model, verbose=True)








Test set: Average loss: 1.0354, Accuracy: 1362/2670 (51%)


Test set: Average loss: 0.9741, Accuracy: 1506/2670 (56%)


Test set: Average loss: 0.9675, Accuracy: 1503/2670 (56%)


Test set: Average loss: 0.9607, Accuracy: 1531/2670 (57%)


Test set: Average loss: 0.9395, Accuracy: 1558/2670 (58%)


Test set: Average loss: 0.9517, Accuracy: 1539/2670 (58%)


Test set: Average loss: 0.9354, Accuracy: 1574/2670 (59%)


Test set: Average loss: 0.9384, Accuracy: 1565/2670 (59%)


Test set: Average loss: 0.9595, Accuracy: 1528/2670 (57%)


Test set: Average loss: 0.9441, Accuracy: 1544/2670 (58%)



In [None]:
y_pred = model(X_test_tensor.to(device)).argmax(dim=1).cpu().numpy()
f1_score = f1_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')



print('Dense Neural Network F1-Score: ', f1_score)
print('Dense Neural Network ROC AUC: ', auc)

### LSTM Model (RNN Variant)
Long Short-Term Memory layer in a neural netowrk model perform well in tasks involve learning long-term dependencies in sequential data, addressing limitation of a traditional RNN.

In [228]:
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out
    


In [232]:
embedding_dim = embeddings.shape[1]
hidden_dim = 100
output_dim = 4
num_layers = 2

model = LSTMModel(embedding_dim, hidden_dim, num_layers, output_dim)

optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)

for epoch in range(0, 10):
    train(epoch, model, optimizer, verbose=True)
    test(model, verbose=True)
    





Test set: Average loss: 0.9955, Accuracy: 1440/2670 (54%)


Test set: Average loss: 0.9818, Accuracy: 1462/2670 (55%)


Test set: Average loss: 0.9644, Accuracy: 1496/2670 (56%)


Test set: Average loss: 0.9340, Accuracy: 1565/2670 (59%)


Test set: Average loss: 0.9288, Accuracy: 1561/2670 (58%)


Test set: Average loss: 0.9223, Accuracy: 1587/2670 (59%)


Test set: Average loss: 0.9108, Accuracy: 1600/2670 (60%)


Test set: Average loss: 0.9419, Accuracy: 1533/2670 (57%)


Test set: Average loss: 0.9143, Accuracy: 1608/2670 (60%)


Test set: Average loss: 0.9060, Accuracy: 1608/2670 (60%)



TypeError: 'numpy.float64' object is not callable

In [235]:
y_pred = model(X_test_tensor.to(device)).argmax(dim=1).cpu()
y_pred_proba = model(X_test_tensor.to(device)).argmax(dim=1).cpu().detach()
f1_score = f1_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print('LSTM F1-Score: ', f1_score)
print('LSTM ROC AUC: ', auc)

TypeError: 'numpy.float64' object is not callable