In [4]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Pick the best available accelerator instead of hard-coding Apple's MPS backend,
# so the notebook also runs on CUDA or CPU-only machines.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  torch.utils._pytree._register_pytree_node(


In [22]:
# Read the pre-processed job postings from disk.
df = pd.read_csv('processed_description.csv')

# Keep only the text column, the human-readable salary range, and salary_bin,
# the integer class id that the dataset below uses as the target.
data = df.loc[:, ['processed_description', 'salary_range', 'salary_bin']]

print(data.head())






                               processed_description salary_range  salary_bin
0  hear care provid overview hearinglif nation he...     50k-100k           1
1  cook descriptiontitl look great develop profes...        0-50k           0
2  princip cloud secur architect remot summari ih...        150k+           3
3  dishwash descriptiontitl 2,000 sign-on bonu gu...        0-50k           0
4  insight analyst auto industri escal award-win ...     50k-100k           1


In [23]:
# Configuration: hyper-parameters and the shared tokenizer.
# Token length passed to CustomDataset as max_len.
MAX_LEN = 200
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 8
# Batch size of the test/validation DataLoader.
VALID_BATCH_SIZE = 4
# Number of passes of the training loop.
EPOCHS = 1
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [24]:
class CustomDataset(Dataset):
    """Dataset that tokenizes job descriptions and pairs them with salary-bin labels.

    Args:
        dataframe: frame with a ``processed_description`` text column and an
            integer ``salary_bin`` class column.
        tokenizer: object exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_len: fixed length every encoded example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.processed_description
        self.targets = self.data.salary_bin
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded example at positional ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``) and ``targets`` (scalar long class index).
        """
        # Samplers yield positions 0..len-1, so index positionally; label-based
        # lookup would fail on a frame whose index was not reset.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace into a single space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; request padding and truncation
            # explicitly so every example has exactly `max_len` tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Class indices must be integer (long) for cross-entropy loss.
            'targets': torch.tensor(int(self.targets.iloc[index]), dtype=torch.long)
        }

In [25]:
# Split the frame 80/20 into train and test, then wrap each part in a CustomDataset.
train_size = 0.8

# Draw the training rows with a fixed seed; the rows that were not drawn form the test set.
train_rows = data.sample(frac=train_size, random_state=200)
test_dataset = data.drop(index=train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)

FULL Dataset: (13350, 3)
TRAIN Dataset: (10680, 3)
TEST Dataset: (2670, 3)


In [26]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation metrics do not depend on sample order, so keep the test loader
# unshuffled: predictions then line up with test_dataset rows and are reproducible.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Neural Network with BERTClass

In [10]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: number of output logits (default 4, one per salary bin).
        dropout: dropout probability applied before the linear head.
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_classes=4, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder config rather than hard-coding 768,
        # so other BERT checkpoints work without editing this class.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (unnormalised) class logits for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; the second element
        # (the pooled output, per the Hugging Face BertModel docs) feeds the head.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l3(self.l2(pooled))

model = BERTClass()


In [11]:
def loss_fn(outputs, targets):
    """Mean cross-entropy between class logits and targets.

    Args:
        outputs: logits with the class dimension last, e.g. ``(batch, num_classes)``.
        targets: either class indices with one fewer dimension than ``outputs``
            (any numeric dtype), or class probabilities shaped like ``outputs``.

    Returns:
        Scalar loss tensor.
    """
    # Class-index targets must be integer typed; float labels such as 1.0 would
    # otherwise be treated as probabilities and rejected on shape.
    if targets.dim() == outputs.dim() - 1 and targets.is_floating_point():
        targets = targets.long()
    return torch.nn.functional.cross_entropy(outputs, targets)

# Adam over every model parameter (encoder and head), using the configured learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Fine-tuning the model

In [27]:
def train(epoch, log_every=1):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Args:
        epoch: epoch number, used only in the progress message.
        log_every: print the loss every this many steps (default 1 = every
            step); pass 0 or None to silence logging.
    """
    model.train()
    # Move each batch to wherever the model lives so inputs and weights always
    # share a device, whether or not the model was moved to an accelerator.
    model_device = next(model.parameters()).device
    for step, data in enumerate(training_loader, 0):
        ids = data['ids'].to(model_device, dtype=torch.long)
        mask = data['mask'].to(model_device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(model_device, dtype=torch.long)
        # Cross-entropy expects integer class indices.
        targets = data['targets'].to(model_device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if log_every and step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [13]:
# Fine-tune for the configured number of epochs; train() prints the batch loss as it goes.
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  1.5398072004318237
Epoch: 0, Loss:  1.4916586875915527
Epoch: 0, Loss:  1.584442138671875
Epoch: 0, Loss:  1.2940983772277832
Epoch: 0, Loss:  1.47401762008667
Epoch: 0, Loss:  1.3429944515228271
Epoch: 0, Loss:  1.4043467044830322
Epoch: 0, Loss:  1.3552888631820679
Epoch: 0, Loss:  1.450002908706665
Epoch: 0, Loss:  1.476799726486206
Epoch: 0, Loss:  1.3114187717437744
Epoch: 0, Loss:  1.4213018417358398
Epoch: 0, Loss:  1.412161946296692
Epoch: 0, Loss:  1.3066505193710327
Epoch: 0, Loss:  1.3909187316894531
Epoch: 0, Loss:  1.4036763906478882
Epoch: 0, Loss:  1.3858307600021362
Epoch: 0, Loss:  1.498443603515625
Epoch: 0, Loss:  1.4933803081512451
Epoch: 0, Loss:  1.35633385181427
Epoch: 0, Loss:  1.395204782485962
Epoch: 0, Loss:  1.268232822418213
Epoch: 0, Loss:  1.231755256652832
Epoch: 0, Loss:  1.2773098945617676
Epoch: 0, Loss:  1.1674654483795166
Epoch: 0, Loss:  1.1804039478302002
Epoch: 0, Loss:  1.5212695598602295
Epoch: 0, Loss:  1.4806256294250488
Epoc

## Validate the model

In [18]:
def validation(epoch):
    """Run ``model`` over ``testing_loader`` without gradient tracking.

    Args:
        epoch: accepted for symmetry with ``train``; not used.

    Returns:
        ``(fin_outputs, fin_targets)``: a list of per-class probability rows
        (one list per example) and the matching list of integer class labels.
    """
    model.eval()
    # Follow the model's device rather than a hard-coded one, so inputs and
    # weights always match.
    model_device = next(model.parameters()).device
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(model_device, dtype=torch.long)
            mask = data['mask'].to(model_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(model_device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            # Labels are only collected, never fed to the model, so they need
            # no device transfer; keep them as integer class ids.
            fin_targets.extend(data['targets'].long().tolist())
            # Classes are mutually exclusive, so normalise with softmax (the
            # argmax is the same as with sigmoid, but rows now sum to 1).
            fin_outputs.extend(torch.softmax(outputs, dim=1).cpu().tolist())

    return fin_outputs, fin_targets


In [30]:

# Score the model on the held-out set and report accuracy plus micro/macro F1.
for epoch in range(EPOCHS):
    class_scores, targets = validation(epoch)
    # The predicted class is the column with the highest score in each row.
    outputs = np.asarray(class_scores).argmax(axis=1)

    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




Accuracy Score = 0.6220973782771536
F1 Score (Micro) = 0.6220973782771536
F1 Score (Macro) = 0.6171781314585684
