In [1]:
import pandas as pd
from transformers import AutoTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tqdm import tqdm
import keras
from sklearn.model_selection import train_test_split




In [2]:
print("Available devices:")
for device in tf.config.list_physical_devices():
    print(device)

Available devices:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


In [3]:
def remove_empty_words(tokens):
    tokens = list(filter(None, str(tokens)))
    return tokens

In [4]:
def mask_inputs_for_bert(texts, max_len):
    input_ids = []
    attention_masks = []
    for text in tqdm(texts):
        #encoded_dict = tokenizer.encode_plus(text, add_special_tokens = True, max_length = max_len, pad_to_max_length = True, return_attention_mask = True, is_split_into_words=True)
        encoded_dict = tokenizer.encode_plus(text, add_special_tokens = True, max_length = max_len, pad_to_max_length = True, return_attention_mask = True, is_split_into_words=False)
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    return input_ids, attention_masks

In [5]:
def preprocess(list_of_text_data):
    for text in tqdm(list_of_text_data):
        text = remove_empty_words(text)
    return list_of_text_data

In [6]:
model_name = 'agne/jobBERT-de'

In [7]:
bert_model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2, from_pt=True)
model_save_path = f'bert_model_{model_name[:5]}.weights.h5'
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path, save_weights_only=True, monitor='val_loss', mode='min', save_best_only=True), tf.keras.callbacks.TensorBoard()]




  return self.fget.__get__(instance, owner)()
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [8]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [9]:
df = pd.read_csv('C:/Users/venglaro/Desktop/job_ads_identification/job_ads_identification_df.csv')
df = df.dropna(subset=['text'])

In [10]:
df_positive = df[df['job_ad'] == 1]
df_negative = df[df['job_ad'] == 0]

df_positive_sampled = df_positive.sample(n=2500, random_state=42)
df_negative_sampled = df_negative.sample(n=2500, random_state=42)

df_sampled = pd.concat([df_positive_sampled, df_negative_sampled])
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
X = df_sampled['text']  # Features (independent variables)
y = df_sampled['job_ad']  # Labels (dependent variable)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train = preprocess(X_train)
X_test = preprocess(X_test)

100%|██████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 220341.15it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 255500.97it/s]


In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)



In [14]:
train_input = X_train.values
train_label = y_train.values

test_input = X_test.values
test_label = y_test.values

In [15]:
max_len = 512
#for text in train_input:
    #max_len = max(max_len, len(text))

In [16]:
train_inp, train_mask = mask_inputs_for_bert(train_input, max_len)
test_inp, test_mask = mask_inputs_for_bert(test_input, max_len)

  0%|                                                                                         | 0/4000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 4528.82it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4872.40it/s]


In [17]:
train_label = tf.convert_to_tensor(train_label)
test_label = tf.convert_to_tensor(test_label)

In [18]:
history = bert_model.fit([train_inp, train_mask], train_label, batch_size=16, epochs=5, validation_data=([test_inp, test_mask], test_label), callbacks=callbacks)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
from sklearn.metrics import classification_report

test_input = X_test.values
test_label = y_test.values
test_inp, test_mask = mask_inputs_for_bert(test_input, max_len)
test_label = tf.convert_to_tensor(test_label)

pred_raw = bert_model.predict([test_inp, test_mask])
pred = pred_raw[0].argmax(axis = 1)

print(classification_report(test_label, pred))


100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 10011.49it/s]


              precision    recall  f1-score   support

           0       0.83      0.86      0.85       524
           1       0.84      0.81      0.82       476

    accuracy                           0.84      1000
   macro avg       0.84      0.83      0.84      1000
weighted avg       0.84      0.84      0.84      1000

