## 1. Set Up

In [2]:
import numpy as np
import pandas as pd
import random
import torch
from sklearn.model_selection import train_test_split

# Base folder for this project's files: the input CSV is read from here and the
# fine-tuned models/tokenizers are written beneath it in later cells.
# The trailing slash matters because paths are built by plain string concatenation.
# NOTE(review): this looks like a Google Drive folder mounted in Colab, but no
# mount step is visible in this notebook — confirm Drive is mounted before running.
data_path = '/content/drive/MyDrive/ds_job_project/'

In [4]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 2.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 10.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |███████

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
import torch

In [3]:
# Load data_science_jobs.csv from the project folder into a DataFrame.
# NOTE(review): the truncated output under this cell looks like the tail of a
# pandas warning (possibly a mixed-dtype DtypeWarning) -- confirm, and consider
# passing explicit dtypes if so.
df = pd.read_csv(f'{data_path}data_science_jobs.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Utility: seed every random number generator this notebook relies on
def set_seed(seed_val = 42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed_val: Seed handed unchanged to `random.seed`, `np.random.seed`,
            `torch.manual_seed` and `torch.cuda.manual_seed`. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    # Call order matches the listing above.
    for seed_fn in seeders:
        seed_fn(seed_val)

In [9]:
class DataJobDataset(torch.utils.data.Dataset):
    """Torch dataset pairing per-example encodings with a target value.

    Args:
        encodings: Mapping (anything with `.items()`) from field name to a
            sequence indexable by example position.
        labels: Sequence of targets, one per example; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, with the target under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Written last, so it replaces any 'labels' field coming from the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [10]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# compute metrics function for regression (the models here use a single-output head)
def compute_metrics(pred):
    """Compute regression metrics from a `Trainer` evaluation result.

    Args:
        pred: Object exposing `label_ids` (true targets) and `predictions`
            (model outputs), as handed to `compute_metrics` by `Trainer`.

    Returns:
        dict with the keys "mse", "r2", "mae" and "rmse".
    """
    labels = pred.label_ids
    preds = pred.predictions
    mse = mean_squared_error(labels, preds)
    # Take the root of the MSE directly instead of passing `squared=False`:
    # that keyword was deprecated and later removed in newer scikit-learn
    # releases, and for a single-output target the two are the same value.
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, preds)
    mae = mean_absolute_error(labels, preds)
    return {"mse": mse, "r2": r2, "mae": mae, "rmse": rmse}

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
train, test = train_test_split(df, test_size=0.2, random_state=420)

## 2. Modelling: ML Index

In [None]:
# Seed the RNGs before building the model: the classification head is newly
# initialized (see the warning printed below), so seeding only after this point
# would leave its starting weights different on every run. The NLP and CV
# sections already seed before creating their models.
set_seed()

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1 gives a single-output head; this section trains it on ML_INDEX.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [None]:
# Job-description text for each split; reused by the later NLP and CV sections.
train_texts = train['JOB_DESC'].tolist()
test_texts = test['JOB_DESC'].tolist()

# Targets for this section: the ML index column.
train_target = train['ML_INDEX'].tolist()
test_target = test['ML_INDEX'].tolist()

# Tokenize each split in one call, with truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Wrap encodings and targets so Trainer can index them as torch datasets.
train_dataset = DataJobDataset(train_encodings, train_target)
test_dataset = DataJobDataset(test_encodings, test_target)

In [None]:
set_seed()

# Fine-tuning configuration for the ML-index model.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='results',
    logging_dir='logs',
    logging_steps=10,
    # run evaluation (and compute_metrics) at the end of every epoch
    evaluation_strategy="epoch",
    # schedule
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimizer
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

# The held-out split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Mse,R2,Mae,Rmse
1,37.3823,201.231812,201.231827,0.703173,7.04375,14.18562
2,19.1173,167.728165,167.728149,0.752593,6.394526,12.950991
3,61.8727,157.45343,157.4534,0.767749,6.126414,12.548043
4,23.9119,155.858322,155.858307,0.770101,5.771966,12.484323


TrainOutput(global_step=2196, training_loss=64.06293156455341, metrics={'train_runtime': 901.0168, 'train_samples_per_second': 2.437, 'total_flos': 645023231311872.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 0, 'train_mem_gpu_alloc_delta': 536553472, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6562893824})

In [None]:
# Persist the fine-tuned ML-index model and its tokenizer under the project folder.
tokenizer.save_pretrained(data_path + 'ml_distilbert/tokenizer')
trainer.save_model(data_path + 'ml_distilbert')

# Store the base DistilBERT config alongside the tokenizer files.
# The data_path prefix is required here as well: a bare relative
# 'ml_distilbert/tokenizer' would land in the notebook's working directory
# instead of the project folder used by the two saves above.
from transformers import AutoConfig
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config.save_pretrained(data_path + 'ml_distilbert/tokenizer')

## 3. Natural Language Processing Index

In [None]:
# Seed first so the freshly initialized head below starts from the same weights each run.
set_seed()

# Start again from the pretrained checkpoint with a single-output head.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Targets for this section: the NLP index column.
train_target = train['NLP_INDEX'].tolist()
test_target = test['NLP_INDEX'].tolist()

# train_texts / test_texts come from the ML-index section's data-preparation cell.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = DataJobDataset(train_encodings, train_target)
test_dataset = DataJobDataset(test_encodings, test_target)

# Fine-tuning configuration for the NLP-index model.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='results',
    logging_dir='logs',
    logging_steps=10,
    # run evaluation (and compute_metrics) at the end of every epoch
    evaluation_strategy="epoch",
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # optimizer
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Mse,R2,Mae,Rmse
1,13.4363,25.202383,25.202385,0.821076,2.425747,5.020198
2,9.8481,25.506554,25.506554,0.818916,2.634434,5.050401
3,11.8551,23.231352,23.231352,0.835069,2.243505,4.819891


TrainOutput(global_step=825, training_loss=18.114678446451823, metrics={'train_runtime': 538.0893, 'train_samples_per_second': 1.533, 'total_flos': 2132525785153536.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 24576, 'train_mem_gpu_alloc_delta': 543631360, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 13364958720})

In [None]:
# Persist the fine-tuned NLP-index model, then its tokenizer, under the project folder.
trainer.save_model(f'{data_path}nlp_distilbert')
tokenizer.save_pretrained(f'{data_path}nlp_distilbert/tokenizer')

# Store the base DistilBERT config alongside the tokenizer files.
from transformers import AutoConfig
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config.save_pretrained(f'{data_path}nlp_distilbert/tokenizer')

## 4. Computer Vision Index

In [None]:
# Seed first so the freshly initialized head below starts from the same weights each run.
set_seed()

# Start again from the pretrained checkpoint with a single-output head.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Targets for this section: the CV index column.
train_target = train['CV_INDEX'].tolist()
test_target = test['CV_INDEX'].tolist()

# train_texts / test_texts come from the ML-index section's data-preparation cell.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = DataJobDataset(train_encodings, train_target)
test_dataset = DataJobDataset(test_encodings, test_target)

# Fine-tuning configuration for the CV-index model.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='results',
    logging_dir='logs',
    logging_steps=10,
    # run evaluation (and compute_metrics) at the end of every epoch
    evaluation_strategy="epoch",
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # optimizer
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Mse,R2,Mae,Rmse
1,5.6641,36.902161,36.902153,0.782354,1.749505,6.074714
2,4.0716,36.918991,36.918999,0.782255,1.769234,6.076101
3,142.7548,35.345486,35.345474,0.791536,1.542229,5.945206


TrainOutput(global_step=825, training_loss=16.46228397253788, metrics={'train_runtime': 527.5146, 'train_samples_per_second': 1.564, 'total_flos': 2132525785153536.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 0, 'train_mem_gpu_alloc_delta': 543893504, 'train_mem_cpu_peaked_delta': 8192, 'train_mem_gpu_peaked_delta': 13223400960})

In [None]:
# Persist the fine-tuned CV-index model and its tokenizer under the project folder.
trainer.save_model(data_path + 'cv_distilbert')
tokenizer.save_pretrained(data_path + 'cv_distilbert/tokenizer')

# Load the base DistilBERT config in this cell instead of relying on a `config`
# variable left in the kernel by an earlier cell, so the cell also works when
# run on its own; then store it alongside the tokenizer files.
from transformers import AutoConfig
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config.save_pretrained(data_path + 'cv_distilbert/tokenizer')