In [1]:
def column_to_list(df, column_name):
    """
    Convert a column in a DataFrame to a list of lists.

    Parameters:
    - df: DataFrame
        The DataFrame containing the column to be converted.
    - column_name: str
        The name of the column to be converted to a list.

    Returns:
    - lists: list
        A list of lists where each inner list corresponds to a row in the specified column.
    """
    column_values = df[column_name].tolist()
    lists = [list(arr) for arr in column_values]
    return lists


In [3]:
import pandas as pd

In [46]:
id2label = {0: '0',
            1: 'B-PER', 
            2: 'I-PER',
            3: 'B-ORG',
            4: 'I-ORG',
            5: 'B-LOC',
            6: 'I-LOC'
           }

In [47]:
#Training data
trainin_data = pd.read_parquet('train-00000-of-00001.parquet')
training_labels_num = column_to_list(trainin_data, 'ner_tags')
training_labels = [[id2label[label_id] for label_id in sequence] for sequence in training_labels_num]
training_sent =  column_to_list(trainin_data, 'tokens')

#flatten to one list to be able to use myutils
train_flat_labels = sum(training_labels, [])
train_flat_sent = sum(training_sent, [])

In [48]:
#test data
test_data = pd.read_parquet('test-00000-of-00001.parquet')
test_labels_num = column_to_list(test_data, 'ner_tags')
test_labels = [[id2label[label_id] for label_id in sequence] for sequence in test_labels_num]

test_sent =  column_to_list(test_data, 'tokens')
test_index = [[i for i, _ in enumerate(sublist)] for sublist in test_labels]

#flatten to one list to be able to use myutils
test_flat_labels = sum(test_labels, [])
test_flat_sent = sum(test_sent, [])
test_flat_index = sum(test_index, [])

In [49]:
#validation data
validation_data = pd.read_parquet('validation-00000-of-00001.parquet')
dev_labels_num = column_to_list(validation_data, 'ner_tags')
dev_labels = [[id2label[label_id] for label_id in sequence] for sequence in dev_labels_num]

dev_sent =  column_to_list(validation_data, 'tokens')

#flatten to one list to be able to use myutils
dev_flat_labels = sum(dev_labels, [])
dev_flat_sent = sum(dev_sent, [])

In [6]:
#!pip install transformers

In [52]:
#!pip install ipywidgets

In [14]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
inputs = []
for sentence in training_sent:
    inputt = tokenizer(sentence, is_split_into_words=True)
    inputs.append(inputt)

In [15]:
test_labels[0]


['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 'B-LOC',
 '0',
 '0',
 '0',
 '0',
 'B-LOC',
 '0',
 '0',
 '0']

In [150]:
inputs[0].word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

In [59]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [17]:
label2id = {label: id for id, label in id2label.items()}


In [53]:
train_flat_labels_num = [label2id[label_id] for label_id in train_flat_labels]


[['B-ORG', 'I-ORG', '0', 'B-ORG', 'I-ORG', 'I-ORG', '0', '0', '0', '0', '0'],
 ['0', '0', '0', 'B-PER', 'I-PER', '0', '0'],
 ['B-PER', 'I-PER', 'I-PER', '0', '0', '0', '0'],
 ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC'],
 ['0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  'B-PER',
  'I-PER',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  'B-PER',
  'I-PER',
  '0',
  'B-PER',
  'I-PER',
  '0',
  '0',
  '0',
  '0'],
 ['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG'],
 ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', '0'],
 ['B-PER', 'I-PER', '0', '0', '0'],
 ['0',
  '0',
  '0',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  '0',
  '0'],
 ['B-PER', 'I-PER', 'B-PER', 'I-PER', 'B-PER', 'I-PER', 'B-PER', 'I-PER'],
 ['0', '0', 'B-ORG', 'I-ORG', '0', 'B-ORG'],
 ['0', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG'],
 ['0', 'B-LOC', '0', '0', '0', '0']

In [61]:
labels = training_labels
word_ids = []
for i in inputs:
    word_ids.append(i.word_ids())
word_ids_flat = sum(word_ids, [])
aligned_training = align_labels_with_tokens(train_flat_labels_num, word_ids_flat)


In [62]:
aligned_training

[-100,
 3,
 4,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 3,
 4,
 0,
 0,
 3,
 4,
 4,
 4,
 4,
 4,
 0,
 -100,
 -100,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 4,
 0,
 -100,
 -100,
 3,
 4,
 0,
 3,
 4,
 -100,
 -100,
 3,
 4,
 0,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 3,
 4,
 4,
 0,
 0,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 3,
 4,
 4,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 4,
 4,
 4,
 -100,
 -100,
 3,
 4,
 4,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 4,
 -100,
 -100,
 3,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 3,
 4,
 4,
 4,
 0,
 3,
 4,
 4,
 -100,
 -100,
 3,
 4,
 4,
 4,
 4,
 0,
 3,
 4,
 -100,
 -100,
 3,
 4,
 4,
 4,
 4,
 4,
 0,
 0,
 3,
 4,
 4,
 -100,
 -100,
 3,
 4,
 4,
 0,
 3,
 4,

In [63]:
def list_to_sentences(lst):
    sentences = []
    current_sentence = []
    
    for item in lst:
        if item == -100:
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        else:
            current_sentence.append(item)
    
    if current_sentence:
        sentences.append(current_sentence)
    
    return sentences


sentences = list_to_sentences(aligned_training)
print(sentences)


[[3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0], [3, 4, 0, 0, 3, 4, 4, 4, 4, 4, 0], [3, 4, 4, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0], [3, 4, 0, 3, 4], [3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 4, 4, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0], [3, 4, 4, 4, 4, 4, 0, 3, 4, 4, 4, 4, 4], [3, 4, 4, 4, 4, 4, 0, 3, 4, 4, 4], [3, 4, 4, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0], [3, 4, 4, 4, 4, 4, 4, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0], [3, 4, 4, 4, 0, 3, 4, 4], [3, 4, 4, 4, 4, 0, 3, 4], [3, 4, 4, 4, 4, 4, 0, 0, 3, 4, 4], [3, 4, 4, 0, 3, 4, 4, 4, 4], [3, 4, 0, 3, 4, 4, 4, 4, 4], [3, 4, 0, 3, 4, 4, 0, 0, 0, 0], [3, 4, 0, 3, 4, 4, 0, 0], [3, 4, 4, 4, 4, 0, 0, 0, 0], [3, 4, 0, 3], [3, 4, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 4, 0], [3, 4, 4, 4, 4, 4, 4, 0, 0, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0], [3, 4, 4, 0, 3, 4, 4, 4], [3, 4, 0, 3, 4, 4, 0, 0, 0, 0], [3, 4, 4, 4, 0, 3, 4, 4, 4, 4, 0]

In [64]:
to_zip_train = [[-100] + sublist + [-100] for sublist in sentences]

In [65]:
# Assuming inputs and to_zip_train are defined as provided

# Zip inputs with to_zip_train and add 'label' key to each item in inputs
for input_item, label_item in zip(inputs, to_zip_train):
    input_item['labels'] = label_item

# Print the updated inputs list
print(inputs[0])


{'input_ids': [101, 155, 119, 145, 119, 16029, 113, 1457, 119, 4898, 1595, 114, 113, 5306, 1604, 13277, 114, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]}


In [66]:
inputs_dev = []
for sentence in dev_sent:
    inputt = tokenizer(sentence, is_split_into_words=True)
    inputs_dev.append(inputt)

In [68]:
dev_flat_labels_num = [label2id[label_id] for label_id in dev_flat_labels]


In [69]:
dev_flat_labels_num

[3,
 4,
 4,
 0,
 5,
 6,
 6,
 6,
 6,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 5,
 6,
 6,
 6,
 0,
 0,
 0,
 3,
 4,
 0,
 3,
 4,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 3,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 0,
 3,
 4,
 0,
 0,
 5,
 0,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 4,
 3,
 4,
 4,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 5,
 6,
 6,
 6,
 3,
 4,
 4,
 4,
 3,
 4,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 5,
 6,
 6,
 0,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 5,
 0,
 5,
 6,
 6,
 1,
 2,
 2,
 0,
 5,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 5,
 6,
 6,
 6,
 0,
 3,
 4,
 0,
 5,
 6,
 6,
 0,
 3,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 5,
 6,
 6,
 6,
 6,
 3,


In [70]:
word_ids = []
for i in inputs_dev:
    word_ids.append(i.word_ids())
word_ids_flat = sum(word_ids, [])
aligned_dev = align_labels_with_tokens(dev_flat_labels_num, word_ids_flat)

In [71]:
to_zip_dev = [[-100] + sublist + [-100] for sublist in list_to_sentences(aligned_dev)]

In [73]:
for input_item, label_item in zip(inputs_dev, to_zip_dev):
    input_item['labels'] = label_item

# Print the updated inputs list
print(inputs_dev[0])

{'input_ids': [101, 17680, 6230, 6492, 113, 17680, 6230, 117, 1375, 7241, 114, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 4, 4, 0, 5, 6, 6, 6, 6, 0, -100]}


In [74]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [75]:
#!pip install evaluate

In [76]:
#!pip install seqeval 

In [77]:
import evaluate

metric = evaluate.load("seqeval")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [120]:
id2label = {v: k for k, v in label2id.items()}

In [78]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
model.config.num_labels

7

In [81]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [82]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert labels to a list of lists if it's a set
    if isinstance(labels, set):
        labels = [labels]

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


In [83]:
#!pip install accelerate -U

In [86]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [87]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=inputs,
    eval_dataset=inputs_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


RuntimeError: The size of tensor a (1051) must match the size of tensor b (512) at non-singleton dimension 1

In [34]:
#import pickle

# Get the trained model
#trained_model = trainer.model

# Save the trained model as a pickle file
#with open("trained_model.pkl", "wb") as f:
    #pickle.dump(trained_model, f)


In [35]:
inputs_test = []
for sentence in test_sent:
    inputt = tokenizer(sentence, is_split_into_words=True)
    inputs_test.append(inputt)

In [36]:
test_labels_num = [label2id.get(label, label2id[UNK]) for label in test_flat_labels]

In [37]:
word_ids = []
for i in inputs_test:
    word_ids.append(i.word_ids())
word_ids_flat = sum(word_ids, [])
aligned_test = align_labels_with_tokens(test_labels_num, word_ids_flat)

In [38]:
to_zip_test = [[-100] + sublist + [-100] for sublist in list_to_sentences(aligned_test)]

In [39]:
for input_item, label_item in zip(inputs_test, to_zip_test):
    input_item['labels'] = label_item

In [40]:
import pickle

with open("trained_model.pkl", "rb") as f:
    trained_model = pickle.load(f)

In [41]:
if isinstance(trained_model, AutoModelForTokenClassification):
    print("Trained model loaded successfully!")
else:
    print("Error: Failed to load the trained model.")

Error: Failed to load the trained model.


In [42]:
import pickle
from transformers import AutoModelForTokenClassification

try:
    # Open the pickle file for reading
    with open("trained_model.pkl", "rb") as f:
        # Deserialize the trained model object
        loaded_model = pickle.load(f)

    # Check if the loaded object is an instance of AutoModelForTokenClassification
    if isinstance(loaded_model, AutoModelForTokenClassification):
        print("Trained model loaded successfully!")
    else:
        print("Error: Loaded object is not an instance of AutoModelForTokenClassification.")
except Exception as e:
    print("Error occurred while loading the trained model:", e)


Error: Loaded object is not an instance of AutoModelForTokenClassification.


In [43]:
inputs_dev[:10]

[{'input_ids': [101, 1187, 1169, 146, 1243, 182, 1766, 23694, 1116, 1107, 27629, 8223, 1161, 5952, 117, 146, 1209, 1176, 1103, 170, 14527, 17054, 2076, 117, 1133, 146, 1209, 1106, 2222, 1330, 1116, 4268, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, -100]},
 {'input_ids': [101, 146, 8703, 1155, 1166, 1103, 7210, 117, 1133, 146, 1180, 1136, 1525, 1141, 1282, 1107, 9720, 2410, 1115, 16695, 182, 1766, 23694, 1116, 117, 1145, 1227, 1112, 1892, 23609, 15977, 117, 1602, 23609, 15977, 1105, 1892, 21718, 27130, 1116, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'atten

In [44]:
inputs_test

[{'input_ids': [101, 1327, 1110, 1142, 20290, 7317, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 2, 2, 1, -100]},
 {'input_ids': [101, 1135, 1110, 170, 1282, 1107, 4904, 25338, 1233, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 2, 1, 1, 1, 2, -100]},
 {'input_ids': [101, 1184, 1110, 170, 1363, 15367, 1111, 1126, 138, 14527, 17054, 4382, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, -100]},
 {'input_ids': [101, 107, 1130, 4904, 117, 14413, 1110, 27415, 117, 9581, 117, 1105, 5185, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, -100]},
 {

NameError: name 'raw_datasets' is not defined

In [59]:
trainer.model.to('cpu')

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [52]:
import torch

# List of sentences
sentences = test_flat_sent

# Tokenize the sentences
tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")

# Get the input tensors
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]

# Set the model to evaluation mode
trainer.model.eval()

# Define batch size
batch_size = 16  # You can adjust this as needed


In [58]:
print(input_ids.device.type)
print(attention_mask.device.type)

cpu
cpu


In [None]:
import torch

# List of sentences
sentences = test_flat_sent

# Tokenize the sentences
tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")

# Get the input tensors
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]

# Set the model to evaluation mode
trainer.model.eval()

# Define batch size
batch_size = 16  # You can adjust this as needed

# Batch processing
with torch.no_grad():
    predicted_labels = []
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]

        # Forward pass
        outputs = trainer.model(batch_input_ids, attention_mask=batch_attention_mask)

        # Get the predicted labels (class indices)
        batch_predicted_labels = torch.argmax(outputs.logits, dim=-1)

        predicted_labels.extend(batch_predicted_labels.tolist())

# Process the predictions as needed
# For example, convert logits to labels, post-process the output, etc.

# Print or use the predictions
print(predicted_labels)


In [None]:
trainer.model