In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.2 MB/s[0m eta [36m0:00:0

In [None]:
import os
import pickle
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, BertTokenizerFast
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Root folder holding the dataset and every artifact this notebook writes.
# Defaults to the original Colab/Drive location; set NLP_PROJECT_DIR to run elsewhere.
directory = os.environ.get('NLP_PROJECT_DIR', '/content/drive/MyDrive/NLP/mini project')

# Loading the Dataset

In [None]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only open files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The three dataset splits, stored as separate pickles under <directory>/dataset.
train = _read_pickle(directory + '/dataset/train.pickle')
val = _read_pickle(directory + '/dataset/validation.pickle')
test = _read_pickle(directory + '/dataset/test.pickle')

In [None]:
def find_union(arr):
    """Return the set of all distinct items found across the iterables in `arr`."""
    return {item for group in arr for item in group}

In [None]:
# Label inventories are collected from the training split only.
# NOTE(review): a tag that occurs only in the validation split would be missing
# from the id maps built from these sets — confirm all splits share one tag set.
ner_tags = find_union(train['ner_tags'])
pos_tags = find_union(train['pos_tags'])

In [None]:
# Number of distinct classes per task; used as the width of each output layer.
num_ner_labels = len(ner_tags)
num_pos_labels = len(pos_tags)

In [None]:
# Sort each tag set once so the tag <-> id assignment is deterministic.
sorted_ner_tags = sorted(ner_tags)
ner_to_ids = {tag: idx for idx, tag in enumerate(sorted_ner_tags)}
ids_to_ner = dict(enumerate(sorted_ner_tags))

sorted_pos_tags = sorted(pos_tags)
pos_to_ids = {tag: idx for idx, tag in enumerate(sorted_pos_tags)}
ids_to_pos = dict(enumerate(sorted_pos_tags))

# Tokenization

In [None]:
# Load the cased BERT tokenizer (the model itself is loaded in a later cell).
# The "fast" variant is used because the alignment code below calls word_ids()
# on its output.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Fixed sequence length: every sentence is padded or truncated to this many
# tokenizer positions, and the model inputs are declared with the same length.
max_length = 128

In [None]:
def align_label(input, tokenized_input, labels, labels_to_ids):
    """Build one label id per tokenizer position for a single sentence.

    A position receives -100 when its word id is None, when it continues the
    same word as the previous position, or when it cannot be matched to a
    source token. Otherwise it receives the id of the matched token's label.

    Args:
        input: source tokens of the sentence (the list that was joined into
            the text given to the tokenizer).
        tokenized_input: tokenizer output for that sentence; must provide
            word_ids() and an 'input_ids' entry.
        labels: one label per source token, parallel to `input`.
        labels_to_ids: mapping from label to integer id.

    Returns:
        A list of ints with one entry per position in word_ids().
    """
    word_ids = tokenized_input.word_ids()
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][0])

    label_ids = []
    previous_word = -1
    # Number of tokenizer "words" seen so far that did not start a new source
    # token; subtracting it maps a tokenizer word id back to a source index.
    shift = 0

    for position, word_id in enumerate(word_ids):
        if word_id is None or word_id == previous_word:
            label_ids.append(-100)
            continue

        source_index = word_id - shift
        if source_index < len(input) and input[source_index].startswith(tokens[position]):
            label_ids.append(labels_to_ids[labels[source_index]])
        else:
            # This tokenizer word does not begin the expected source token,
            # so treat it as unlabelled and widen the offset.
            shift += 1
            label_ids.append(-100)
        previous_word = word_id

    return label_ids

In [None]:
# Tokenize the text and adapt labels
def tokenize(dataset):
    """Encode every sentence of `dataset` and align its NER/POS labels.

    Args:
        dataset: mapping with parallel 'tokens', 'ner_tags' and 'pos_tags'
            entries, one item per sentence.

    Returns:
        A 4-tuple of lists, one element per sentence: input id tensors,
        attention mask tensors, aligned NER label ids, aligned POS label ids.
    """
    input_ids, attention_masks = [], []
    ner_ids, pos_ids = [], []

    rows = zip(dataset['tokens'], dataset['ner_tags'], dataset['pos_tags'])
    for words, sentence_ner, sentence_pos in rows:
        text = ' '.join(words)
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='tf',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

        # Both tasks share the same encoding; only the label source differs.
        ner_ids.append(align_label(words, encoded, sentence_ner, ner_to_ids))
        pos_ids.append(align_label(words, encoded, sentence_pos, pos_to_ids))

    return input_ids, attention_masks, ner_ids, pos_ids

In [None]:
x_train, att_mask_train, ner_train, pos_train = tokenize(train)

# Cache the encoded training split on disk, one pickle per array.
train_artifacts = (
    (directory + "/x_train.pickle", x_train),
    (directory + "/att_mask_train.pickle", att_mask_train),
    (directory + "/ner_train.pickle", ner_train),
    (directory + "/pos_train.pickle", pos_train),
)
for artifact_path, artifact in train_artifacts:
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)

In [None]:
x_val, att_mask_val, ner_val, pos_val = tokenize(val)

# Cache the encoded validation split on disk, one pickle per array.
val_artifacts = (
    (directory + "/x_val.pickle", x_val),
    (directory + "/att_mask_val.pickle", att_mask_val),
    (directory + "/ner_val.pickle", ner_val),
    (directory + "/pos_val.pickle", pos_val),
)
for artifact_path, artifact in val_artifacts:
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)

In [None]:
# Reload the cached encodings written by the two cells above, in the same
# order as the names they are unpacked into below.
cached_paths = (
    directory + "/x_train.pickle",
    directory + "/att_mask_train.pickle",
    directory + "/ner_train.pickle",
    directory + "/pos_train.pickle",
    directory + "/x_val.pickle",
    directory + "/att_mask_val.pickle",
    directory + "/ner_val.pickle",
    directory + "/pos_val.pickle",
)

loaded = []
for cache_path in cached_paths:
    with open(cache_path, "rb") as file:
        loaded.append(pickle.load(file))

(x_train, att_mask_train, ner_train, pos_train,
 x_val, att_mask_val, ner_val, pos_val) = loaded
del loaded  # do not keep a second reference to the raw lists alive

# BERT

In [None]:
# Turn the per-sentence lists into batch tensors: the encodings (one tensor per
# sentence) are concatenated along axis 0, the label lists are converted directly.
# NOTE: each name is rebound from a list to a tensor, so this cell is not
# idempotent — re-run the cache-loading cell before running it a second time.
x_train = tf.concat(x_train, axis=0)
att_mask_train = tf.concat(att_mask_train, axis=0)
ner_train = tf.convert_to_tensor(ner_train)
pos_train = tf.convert_to_tensor(pos_train)

x_val = tf.concat(x_val, axis=0)
att_mask_val = tf.concat(att_mask_val, axis=0)
ner_val = tf.convert_to_tensor(ner_val)
pos_val = tf.convert_to_tensor(pos_val)

In [None]:
# Restrict TensorFlow to the first GPU when one is present. On a CPU-only
# runtime the list is empty and indexing it would raise IndexError, so the
# call is skipped and TensorFlow keeps its default device configuration.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

## Building the Model

In [None]:
# Shared encoder: pretrained cased BERT, fine-tuned jointly by both tasks.
bert_model = TFBertModel.from_pretrained('bert-base-cased')

# Define the classification layers for NER and POS
# NOTE: both heads apply a softmax, so despite the 'ner_logits'/'pos_logits'
# names they emit class probabilities. The names are also the keys used for
# the per-output losses and metrics at compile time.
ner_classifier = tf.keras.layers.Dense(num_ner_labels, activation='softmax', name='ner_logits',
                                       kernel_initializer=tf.keras.initializers.GlorotUniform())
pos_classifier = tf.keras.layers.Dense(num_pos_labels, activation='softmax', name='pos_logits',
                                       kernel_initializer=tf.keras.initializers.GlorotUniform())

# Create the multitasking model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32)
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32)

# Element [0] of the encoder output is fed to both heads, giving one
# prediction per sequence position for each task.
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
ner_logits = ner_classifier(bert_output)
pos_logits = pos_classifier(bert_output)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[ner_logits, pos_logits])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:

# Custom accuracy metric
def accuracy(y_true, y_pred):
    """Mean sparse categorical accuracy over positions whose label is not -100.

    Args:
        y_true: integer labels, with -100 marking positions to ignore.
        y_pred: per-class scores with one extra trailing axis over the classes.

    Returns:
        A scalar tensor: the fraction of kept positions predicted correctly.
    """
    # -100 marks positions that carry no gold label; exclude them entirely.
    keep = tf.not_equal(y_true, -100)
    kept_true = tf.boolean_mask(y_true, keep)
    kept_pred = tf.boolean_mask(y_pred, keep)

    per_position_hits = tf.keras.metrics.sparse_categorical_accuracy(kept_true, kept_pred)
    return tf.reduce_mean(per_position_hits)

In [None]:
# Set learning rate and compile the model
learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# One loss and one metric per output head, keyed by the head's layer name.
# ignore_class=-100 makes the loss skip the positions marked as unlabelled.
output_names = ('ner_logits', 'pos_logits')

losses = {
    name: tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-100)
    for name in output_names
}

metrics = {name: accuracy for name in output_names}

model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

In [None]:
# Create a callback to save the best model checkpoints
# Only weights are written, and only at the end of an epoch whose val_loss is a
# new minimum, so the checkpoint holds the best epoch's weights rather than the
# last epoch's.
checkpoint_path = directory + '/model_checkpoints/'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True,
    save_weights_only=True, mode='min', save_freq='epoch'
)

In [None]:
# Reduce TensorFlow log noise before training.
# NOTE(review): tensorflow is already imported at the top of the notebook; the
# TF_CPP_MIN_LOG_LEVEL variable may only be honoured if set before that import —
# confirm, or move this line above the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0: INFO, 1: WARNING, 2: ERROR, 3: FATAL
tf.get_logger().setLevel('ERROR')  # or tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
# Train the model with callbacks
# Inputs and targets are passed as lists in the same order as the model's
# inputs ([ids, mask]) and outputs ([NER, POS]). The checkpoint callback writes
# weights whenever the validation loss improves.
model.fit(
    [x_train, att_mask_train],
    [ner_train, pos_train],
    validation_data=([x_val, att_mask_val], [ner_val, pos_val]),
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint_callback]
)

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.21581, saving model to /content/drive/MyDrive/NLP/mini project/model_checkpoints/
Epoch 2/10
Epoch 2: val_loss improved from 0.21581 to 0.19845, saving model to /content/drive/MyDrive/NLP/mini project/model_checkpoints/
Epoch 3/10
Epoch 3: val_loss did not improve from 0.19845
Epoch 4/10
Epoch 4: val_loss did not improve from 0.19845
Epoch 5/10
Epoch 5: val_loss did not improve from 0.19845
Epoch 6/10
Epoch 6: val_loss did not improve from 0.19845
Epoch 7/10
Epoch 7: val_loss did not improve from 0.19845
Epoch 8/10
Epoch 8: val_loss did not improve from 0.19845
Epoch 9/10
Epoch 9: val_loss did not improve from 0.19845
Epoch 10/10
Epoch 10: val_loss did not improve from 0.19845


<keras.callbacks.History at 0x7d1e09fa8670>

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [None]:
# ModelCheckpoint.model is only a reference to the model being trained, so after
# fit() it carries the LAST epoch's weights. Restore the lowest-val_loss weights
# that the callback wrote to checkpoint_path before treating it as the best model.
best_model = checkpoint_callback.model
best_model.load_weights(checkpoint_path);

In [None]:
# Export the whole model (not just weights) so it can be reloaded without
# rebuilding the graph.
# NOTE: this writes into the same directory the checkpoint callback uses.
tf.keras.models.save_model(best_model, checkpoint_path)



## Evaluation

In [None]:
# Reload the exported model for inference only. compile=False skips restoring
# the training configuration, which refers to the notebook-defined `accuracy`
# metric that is not supplied through custom_objects. predict() does not need
# a compiled model.
model = tf.keras.models.load_model(
    checkpoint_path,
    custom_objects={'TFBertModel': TFBertModel},
    compile=False,
)

In [None]:
# Reset the global state kept by Keras.
# NOTE(review): this runs after the model was reloaded in the previous cell; it
# is more commonly called before building or loading a model — confirm the order.
tf.keras.backend.clear_session()

In [None]:
# Make predictions using the trained model
# Both heads end in a softmax, so despite their names these arrays hold class
# probabilities; argmax over the last axis gives the predicted class id for
# every position. (The names also rebind the symbolic tensors from model building.)
ner_logits, pos_logits = model.predict([x_val, att_mask_val], batch_size=64)
ner_predictions = np.argmax(ner_logits, axis=-1)
pos_predictions = np.argmax(pos_logits, axis=-1)



In [None]:
# Flatten true labels and predictions for NER and POS tasks
# Everything is collapsed to 1-D so gold labels and predictions can be masked
# and compared position by position.
flatten_ner_true_labels = tf.reshape(ner_val, [-1])
flatten_ner_predictions = tf.reshape(ner_predictions, [-1])

flatten_pos_true_labels = tf.reshape(pos_val, [-1])
flatten_pos_predictions = tf.reshape(pos_predictions, [-1])

In [None]:
# Positions labelled -100 carry no gold label and must not be scored.
# The mask is derived from the NER labels and reused for POS: align_label()
# decides where to put -100 from the tokens alone, so both tasks agree.
mask = tf.not_equal(flatten_ner_true_labels, -100)

flatten_ner_true_labels = tf.boolean_mask(flatten_ner_true_labels, mask)
flatten_ner_predictions = tf.boolean_mask(flatten_ner_predictions, mask)

flatten_pos_true_labels = tf.boolean_mask(flatten_pos_true_labels, mask)
flatten_pos_predictions = tf.boolean_mask(flatten_pos_predictions, mask)

In [None]:
def _micro_scores(y_true, y_pred):
    """Return (accuracy, recall, precision, f1); the last three are micro-averaged."""
    return (
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred, average='micro'),
        precision_score(y_true, y_pred, average='micro'),
        f1_score(y_true, y_pred, average='micro'),
    )


# NOTE: with exactly one label per position, micro-averaged precision, recall
# and F1 all coincide with accuracy, which is why the four numbers match.

# Calculate metrics for NER
ner_accuracy, ner_recall, ner_precision, ner_f1 = _micro_scores(
    flatten_ner_true_labels, flatten_ner_predictions
)

# Calculate metrics for POS
pos_accuracy, pos_recall, pos_precision, pos_f1 = _micro_scores(
    flatten_pos_true_labels, flatten_pos_predictions
)

# Print the results
for caption, value in (
    ("NER Accuracy:", ner_accuracy),
    ("NER Recall:", ner_recall),
    ("NER Precision:", ner_precision),
    ("NER F1-score:", ner_f1),
):
    print(caption, value)
print()
for caption, value in (
    ("POS Accuracy:", pos_accuracy),
    ("POS Recall:", pos_recall),
    ("POS Precision:", pos_precision),
    ("POS F1-score:", pos_f1),
):
    print(caption, value)


NER Accuracy: 0.9887069216985457
NER Recall: 0.9887069216985457
NER Precision: 0.9887069216985457
NER F1-score: 0.9887069216985457

POS Accuracy: 0.9617035462766179
POS Recall: 0.9617035462766179
POS Precision: 0.9617035462766179
POS F1-score: 0.9617035462766179


## Prediction

In [None]:
def tokenize_test(dataset):
    """Encode unlabelled sentences and mark which positions should get a label.

    Uses the same position-to-token matching as align_label(), but records a
    boolean per position instead of a label id: True where the position is the
    first piece matched to a source token, False everywhere else.

    Args:
        dataset: mapping with a 'tokens' entry holding one token list per sentence.

    Returns:
        A 3-tuple of lists, one element per sentence: input id tensors,
        attention mask tensors, and lists of max_length booleans.
    """
    input_ids, attention_masks, labels = [], [], []

    for sentence in dataset['tokens']:
        encoded = tokenizer(
            ' '.join(sentence),
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='tf',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

        word_ids = encoded.word_ids()
        tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

        keep_flags = []
        previous_word = -1
        # Offset between tokenizer word ids and source token indices.
        shift = 0

        for position in range(max_length):
            word_id = word_ids[position]
            if word_id is None or word_id == previous_word:
                keep_flags.append(False)
                continue

            source_index = word_id - shift
            matched = (
                source_index < len(sentence)
                and sentence[source_index].startswith(tokens[position])
            )
            if not matched:
                shift += 1
            keep_flags.append(matched)
            previous_word = word_id

        labels.append(keep_flags)

    return input_ids, attention_masks, labels

In [None]:
# Encode the test split; prediction_mask flags the positions that will receive
# a predicted label.
x_test, att_mask_test, prediction_mask= tokenize_test(test)

In [None]:
# Turn the per-sentence lists into batch tensors, as was done for train/val.
# NOTE: the names are rebound in place, so re-run the previous cell before
# executing this one a second time.
x_test = tf.concat(x_test, axis=0)
att_mask_test = tf.concat(att_mask_test, axis=0)
prediction_mask = tf.convert_to_tensor(prediction_mask)

In [None]:
# Make predictions using the trained model
# NOTE: this overwrites the validation-set outputs held in the same variables.
ner_logits, pos_logits = model.predict([x_test, att_mask_test], batch_size=64)
ner_predictions = np.argmax(ner_logits, axis=-1)
pos_predictions = np.argmax(pos_logits, axis=-1)



In [None]:
def label_id_to_label(predictions, id_to_label, mask=None):
    """Convert predicted label ids to labels, keeping only the masked positions.

    Args:
        predictions: per-sentence rows of predicted ids that support boolean
            indexing (e.g. a 2-D NumPy array).
        id_to_label: mapping from integer id to label.
        mask: per-sentence boolean masks selecting the positions to keep.
            Defaults to the notebook-level `prediction_mask`, looked up when
            the function is called rather than when it is defined.

    Returns:
        A NumPy array with one entry per sentence. When every sentence keeps
        the same number of positions this is a regular 2-D array; otherwise it
        is a 1-D object array whose elements are lists of labels.
    """
    if mask is None:
        # Resolve at call time so a re-computed prediction_mask is picked up.
        mask = prediction_mask

    total_labels = [
        [id_to_label[x] for x in predictions[i][mask[i]]]
        for i in range(len(predictions))
    ]

    if len({len(labels) for labels in total_labels}) <= 1:
        return np.array(total_labels)

    # Sentences keep different numbers of positions. np.array() rejects such
    # ragged input on NumPy >= 1.24, so fill an object array explicitly.
    ragged = np.empty(len(total_labels), dtype=object)
    for i, labels in enumerate(total_labels):
        ragged[i] = labels
    return ragged

In [None]:
# Map the predicted ids of both tasks back to their tag values, one sequence
# per test sentence.
ner_test = label_id_to_label(ner_predictions, ids_to_ner)
pos_test = label_id_to_label(pos_predictions, ids_to_pos)

In [None]:
# Bundle the predictions under the same keys the dataset uses for its labels.
result = {
    'ner_tags' : ner_test,
    'pos_tags' : pos_test
}

In [None]:
# Persist the test-set predictions next to the other project artifacts.
with open(directory + "/test_pred.pickle", "wb") as file:
    pickle.dump(result, file)