# NOTE TO FUTURE USERS

Please only run the first one (aka, the one that links to legal_bert_model) since the rest have even worse performances somehows

In [None]:
#!unzip jsonl_files.zip

In [13]:
# model attempt 1
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

folder_path = os.path.join("content/jsonl_files")
#tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# |      >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
## |      >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
# |      >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
# |      >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
# |      >>> input_prompt = "It might be possible to"
# |      >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
# |      >>> # instantiate logits processors
# |      >>> logits_processor = LogitsProcessorList(
# |      ...     [
# |      ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
# |      ...     ]
# |      ... )
# |      >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
# |      >>> outputs = model.assisted_decoding(
 #|      ...     input_ids,
 #|      ...     assistant_model=assistant_model,
 #|      ...     logits_processor=logits_processor,
 #|      ...     stopping_criteria=stopping_criteria,




# Define the label list based on your data
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]


class CustomDataset(Dataset):
    def __init__(self, folder_path, tokenizer, label_list, max_length=512, split_ratio=0.8, seed=42):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length
        self.split_ratio = split_ratio
        self.seed = seed

        self._load_data(folder_path)
        self._split_data()

    def _load_data(self, folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as file:
                    for line in file:
                        data = json.loads(line)
                        text = data['text']
                        labels = data['label']
                        tokenized_inputs = self.tokenizer(
                            text,
                            is_split_into_words=True,
                            padding='max_length',
                            truncation=True,
                            max_length=self.max_length,
                            return_tensors='pt'
                        )
                        labels = self.align_labels(labels, tokenized_inputs.input_ids)
                        self.samples.append({
                            'input_ids': tokenized_inputs.input_ids,
                            'attention_mask': tokenized_inputs.attention_mask,
                            'labels': labels
                        })

    def _split_data(self):
        split_index = int(len(self.samples) * self.split_ratio)
        self.train_samples = self.samples[:split_index]
        self.eval_samples = self.samples[split_index:]

    def align_labels(self, labels, input_ids):
        aligned_labels = []
        label_idx = 0
        for i in range(input_ids.size(1)):
            token = self.tokenizer.convert_ids_to_tokens(input_ids[0, i].item())
            if token.startswith("##"):
                aligned_labels.append(-100)
            else:
                if label_idx < len(labels) and i >= labels[label_idx][0] and i <= labels[label_idx][1]:
                    aligned_labels.append(self.label_map.get(labels[label_idx][2], -100))
                else:
                    aligned_labels.append(-100)
                if label_idx < len(labels) and i == labels[label_idx][1]:
                    label_idx += 1
        return torch.tensor(aligned_labels)

    def __len__(self):
        return len(self.train_samples)

    def __getitem__(self, idx):
        sample = self.train_samples[idx]
        return sample

    def get_eval_dataset(self):
        return self.eval_samples


class CustomDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"].squeeze() for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"].squeeze() for feature in features])
        batch["labels"] = torch.stack([feature["labels"].squeeze() for feature in features])

        return batch

tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(label_list))

data_collator = CustomDataCollatorForTokenClassification(tokenizer)
# Initialize dataset
dataset = CustomDataset(folder_path, tokenizer, label_list)

# Get evaluation dataset
eval_dataset = dataset.get_eval_dataset()

# Prepare training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer after defining the model and label_list
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Save the model
model.save_pretrained('./legal_bert_ner_model')
tokenizer.save_pretrained('./legal_bert_ner_model')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


('./legal_bert_ner_model/tokenizer_config.json',
 './legal_bert_ner_model/special_tokens_map.json',
 './legal_bert_ner_model/vocab.txt',
 './legal_bert_ner_model/added_tokens.json')

In [14]:
# evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Get the predicted labels
predictions = trainer.predict(eval_dataset)

# Extract predicted label IDs
predicted_label_ids = np.argmax(predictions.predictions, axis=2)

# Flatten the predictions and labels to calculate accuracy and F1 score
flat_predictions = np.concatenate(predicted_label_ids)
flat_labels = np.concatenate([eval_dataset[i]["labels"].numpy() for i in range(len(eval_dataset))])

# Calculate accuracy
accuracy = accuracy_score(flat_labels, flat_predictions)

# Calculate precision, recall, F1 score
precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_predictions, average='micro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.08447265625
Precision: 0.08447265625
Recall: 0.08447265625
F1 Score: 0.08447265625


In [15]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the device
model.to(device)

# Set the model in evaluation mode
model.eval()

# Iterate over the evaluation dataset and make predictions
predictions = []
for batch in eval_dataset:
    # Move inputs to the appropriate device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Disable gradient calculation
    with torch.no_grad():
        # Forward pass, get logits
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get the predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=-1)
    
    # Append predictions to the list
    predictions.append(predicted_labels.detach().cpu().numpy())

# Convert the list of predictions to a single numpy array
predictions = np.concatenate(predictions)

# Now you have predictions for all samples in the evaluation dataset
print(predictions)

# Define a function to convert integer labels to label names
def convert_labels_to_names(predictions, label_list):
    label_names = []
    for pred in predictions:
        label_names.append([label_list[idx] for idx in pred])
    return label_names

# Convert predicted labels to label names
predicted_label_names = convert_labels_to_names(predictions, label_list)

# Print some examples of predicted labels
for i in range(5):  # Print the first 5 examples
    print("Predicted Labels:", predicted_label_names[i])

[[ 3 11 11 ...  1  1 11]
 [ 3 11 11 ...  1  9  1]
 [ 3 11 11 ...  1  1 11]
 ...
 [ 3 11 11 ...  3  3  1]
 [ 1 11 11 ...  3  1  1]
 [ 3 11 11 ...  3  1 11]]
Predicted Labels: ['RESPONDENT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'RESPONDEN

In [2]:
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

folder_path = os.path.join("content/jsonl_files")

# Define the label list based on your data
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]

# Split the data into train and validation sets
train_files, eval_files = train_test_split(os.listdir(folder_path), test_size=0.2, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, file_list, tokenizer, label_list, max_length=512):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length

        self._load_data(file_list)

    def _load_data(self, file_list):
        for file_name in file_list:
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as file:
                    for line in file:
                        data = json.loads(line)
                        text = data['text']
                        labels = data['label']
                        tokenized_inputs = self.tokenizer(
                            text,
                            is_split_into_words=True,
                            padding='max_length',
                            truncation=True,
                            max_length=self.max_length,
                            return_tensors='pt'
                        )
                        labels = self.align_labels(labels, tokenized_inputs.input_ids)
                        self.samples.append({
                            'input_ids': tokenized_inputs.input_ids,
                            'attention_mask': tokenized_inputs.attention_mask,
                            'labels': labels
                        })

    def align_labels(self, labels, input_ids):
        aligned_labels = []
        label_idx = 0
        for i in range(input_ids.size(1)):
            token = self.tokenizer.convert_ids_to_tokens(input_ids[0, i].item())
            if token.startswith("##"):
                aligned_labels.append(-100)
            else:
                if label_idx < len(labels) and i >= labels[label_idx][0] and i <= labels[label_idx][1]:
                    aligned_labels.append(self.label_map.get(labels[label_idx][2], -100))
                else:
                    aligned_labels.append(-100)
                if label_idx < len(labels) and i == labels[label_idx][1]:
                    label_idx += 1
        return torch.tensor(aligned_labels)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return sample

class CustomDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"].squeeze() for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"].squeeze() for feature in features])
        batch["labels"] = torch.stack([feature["labels"].squeeze() for feature in features])

        return batch

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(label_list))

# Initialize train and validation datasets
train_dataset = CustomDataset(train_files, tokenizer, label_list)
eval_dataset = CustomDataset(eval_files, tokenizer, label_list)
data_collator = CustomDataCollatorForTokenClassification(tokenizer)
# Prepare training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Save the model
model.save_pretrained('./legal_bert_ner_model2')
tokenizer.save_pretrained('./legal_bert_ner_model2')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


('./legal_bert_ner_model2/tokenizer_config.json',
 './legal_bert_ner_model2/special_tokens_map.json',
 './legal_bert_ner_model2/vocab.txt',
 './legal_bert_ner_model2/added_tokens.json')

In [58]:
print(len(label_list))

14


In [3]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Get the predicted labels
predictions = trainer.predict(eval_dataset)

# Extract predicted label IDs
predicted_label_ids = np.argmax(predictions.predictions, axis=2)

# Flatten the predictions and labels to calculate accuracy and F1 score
flat_predictions = np.concatenate(predicted_label_ids)
flat_labels = np.concatenate([eval_dataset[i]["labels"].numpy() for i in range(len(eval_dataset))])

# Calculate accuracy
accuracy = accuracy_score(flat_labels, flat_predictions)

# Calculate precision, recall, F1 score
precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_predictions, average='micro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.045654296875
Precision: 0.045654296875
Recall: 0.045654296875
F1 Score: 0.045654296875


In [4]:
# Get the predicted labels
predictions = trainer.predict(eval_dataset)

# Extract predicted label IDs
predicted_label_ids = np.argmax(predictions.predictions, axis=2)

# Flatten the predictions and labels to calculate accuracy and F1 score
flat_predictions = np.concatenate(predicted_label_ids)
flat_labels = np.concatenate([eval_dataset[i]["labels"].numpy() for i in range(len(eval_dataset))])

# Calculate accuracy
accuracy = accuracy_score(flat_labels, flat_predictions)

# Calculate precision, recall, F1 score
precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_predictions, average='micro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.045654296875
Precision: 0.045654296875
Recall: 0.045654296875
F1 Score: 0.045654296875


In [10]:
training_args = TrainingArguments(output_dir="test_trainer", use_mps_device=True)




In [5]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the device
model.to(device)

# Set the model in evaluation mode
model.eval()

# Iterate over the evaluation dataset and make predictions
predictions = []
for batch in eval_dataset:
    # Move inputs to the appropriate device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Disable gradient calculation
    with torch.no_grad():
        # Forward pass, get logits
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get the predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=-1)
    
    # Append predictions to the list
    predictions.append(predicted_labels.detach().cpu().numpy())

# Convert the list of predictions to a single numpy array
predictions = np.concatenate(predictions)

# Now you have predictions for all samples in the evaluation dataset
print(predictions)

# Define a function to convert integer labels to label names
def convert_labels_to_names(predictions, label_list):
    label_names = []
    for pred in predictions:
        label_names.append([label_list[idx] for idx in pred])
    return label_names

# Convert predicted labels to label names
predicted_label_names = convert_labels_to_names(predictions, label_list)

# Print some examples of predicted labels
for i in range(5):  # Print the first 5 examples
    print("Predicted Labels:", predicted_label_names[i])

u_pred = []
for doc in predicted_label_names:
    for pred in doc:
        if pred not in u_pred:
            u_pred.append(pred)

print(u_pred)

[[11 11 11 ...  1  1  3]
 [11 11 11 ... 11  3  3]
 [11  0 11 ...  1  1  3]
 ...
 [11 11 11 ...  1  1  3]
 [11 11 11 ...  3  3  3]
 [11  0 11 ...  1  1  3]]
Predicted Labels: ['COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'LAWYER', 'LAWYER', 'COU

In [6]:
import json
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("legal_bert_ner_model2")

# Specify the directory containing the model.safetensors file
model_path = "legal_bert_ner_model2"

# Load the model
model = BertForTokenClassification.from_pretrained(model_path)

# Load new data from JSON Lines file
new_texts = []
with open("admin.jsonl", "r") as file:
    for line in file:
        # Load JSON object from each line
        json_obj = json.loads(line)
        # Extract text from JSON object (assuming "text" field)
        new_texts.append(json_obj["text"])

# Iterate through each new text
for text in new_texts:
    # Tokenize the text
    tokenized_inputs = tokenizer(text, is_split_into_words=True, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Model inference
    with torch.no_grad():
        outputs = model(**tokenized_inputs)

    # Get predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=-1)

    # Convert token IDs back to tokens and display along with labels
    tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs.input_ids[0])
    for token, label_idx in zip(tokens, predicted_labels[0]):
        label_value = model.config.id2label[label_idx.item()]
        #print(label_idx)
        print(f"{token}: {label_list[len(label_value.split(_)[-1])]}")



[CLS]: PROVISION
won: ORG
##g: ORG
fo: PROVISION
##ong: ORG
cha: ORG
##i: ORG
v: PROVISION
li: ORG
##n: ORG
ku: ORG
##o: ORG
ha: PROVISION
##o: ORG
[: PROVISION
2005: PROVISION
]: ORG
s: ORG
##gh: PROVISION
##c: ORG
77: PROVISION
case: PROVISION
number: PROVISION
:: PROVISION
boc: PROVISION
265: PROVISION
/: PROVISION
2004: PROVISION
,: PROVISION
sic: PROVISION
658: PROVISION
##0: PROVISION
/: PROVISION
2004: PROVISION
decision: PROVISION
date: PROVISION
:: PROVISION
26: PROVISION
april: PROVISION
2005: ORG
tribunal: PROVISION
/: PROVISION
court: PROVISION
:: PROVISION
high: PROVISION
court: PROVISION
cor: PROVISION
##am: PROVISION
:: PROVISION
andrew: PROVISION
ph: PROVISION
##ang: ORG
boon: PROVISION
leon: PROVISION
##g: ORG
j: PROVISION
##c: ORG
counsel: PROVISION
name: PROVISION
(: PROVISION
s: PROVISION
): PROVISION
:: PROVISION
gan: PROVISION
##es: PROVISION
##h: PROVISION
s: PROVISION
ram: PROVISION
##ana: PROVISION
##than: PROVISION
and: PROVISION
ren: ORG
##uka: ORG
che: ORG
#

# AHHHH

In [17]:
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

folder_path = os.path.join("content/jsonl_files")

# Define the full label names based on your data
full_label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
                   "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
                   "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]

# Split the data into train and validation sets
train_files, val_files = train_test_split(os.listdir(folder_path), test_size=0.2, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, file_list, tokenizer, label_list, max_length=512):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length

        self._load_data(file_list)

    def _load_data(self, file_list):
        for file_name in file_list:
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as file:
                    for line in file:
                        data = json.loads(line)
                        text = data['text']
                        labels = data['label']
                        tokenized_inputs = self.tokenizer(
                            text,
                            is_split_into_words=True,
                            padding='max_length',
                            truncation=True,
                            max_length=self.max_length,
                            return_tensors='pt'
                        )
                        labels = self.align_labels(labels, tokenized_inputs.input_ids)
                        self.samples.append({
                            'input_ids': tokenized_inputs.input_ids.squeeze(0),
                            'attention_mask': tokenized_inputs.attention_mask.squeeze(0),
                            'labels': labels
                        })

    def align_labels(self, labels, input_ids):
        max_length = input_ids.size(1)
        aligned_labels = [-100] * max_length  # Initialize with padding token
        
        for start, end, label in labels:
            for i in range(start, min(end, max_length)):
                aligned_labels[i] = self.label_list.index(label)
                
        return torch.tensor(aligned_labels)


    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return sample


# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(full_label_list))

# Initialize train and validation datasets
train_dataset = CustomDataset(train_files, tokenizer, full_label_list)
eval_dataset = CustomDataset(val_files, tokenizer, full_label_list)
data_collator = DataCollatorForTokenClassification(tokenizer)
# Prepare training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Save the model
model.save_pretrained('./legal_bert_ner_model4')
tokenizer.save_pretrained('./legal_bert_ner_model4')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


('./legal_bert_ner_model4/tokenizer_config.json',
 './legal_bert_ner_model4/special_tokens_map.json',
 './legal_bert_ner_model4/vocab.txt',
 './legal_bert_ner_model4/added_tokens.json')

In [18]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the device
model.to(device)

# Set the model in evaluation mode
model.eval()

# Iterate over the evaluation dataset and make predictions
predictions = []
for batch in eval_dataset:
    # Move inputs to the appropriate device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Disable gradient calculation
    with torch.no_grad():
        # Forward pass, get logits
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get the predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=-1)
    
    # Append predictions to the list
    predictions.append(predicted_labels.detach().cpu().numpy())

# Convert the list of predictions to a single numpy array
predictions = np.concatenate(predictions)

# Now you have predictions for all samples in the evaluation dataset
print(predictions)

# Define a function to convert integer labels to label names
def convert_labels_to_names(predictions, label_list):
    label_names = []
    for pred in predictions:
        label_names.append([label_list[idx] for idx in pred])
    return label_names

# Convert predicted labels to label names
predicted_label_names = convert_labels_to_names(predictions, label_list)

# Print some examples of predicted labels
for i in range(5):  # Print the first 5 examples
    print("Predicted Labels:", predicted_label_names[i])

u_pred = []
for doc in predicted_label_names:
    for pred in doc:
        if pred not in u_pred:
            u_pred.append(pred)

print(u_pred)

ValueError: not enough values to unpack (expected 2, got 1)

In [20]:
import json

file_path = "admin.jsonl"

# Open the JSON file and read its contents
with open(file_path, "r") as file:
    # Read each line (dictionary) in the file
    for line in file:
        # Parse the JSON dictionary
        data = json.loads(line)
        
        
        # Now 'data' contains the contents of each dictionary
        # You can access the data and perform further processing here
        print(data.keys())


dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])
dict_keys(['id', 'text', 'label', 'Comments'])


# PLEASE

In [None]:
# model attempt 1
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

folder_path = os.path.join("content/jsonl_files")
#tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# |      >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
## |      >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
# |      >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
# |      >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
# |      >>> input_prompt = "It might be possible to"
# |      >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
# |      >>> # instantiate logits processors
# |      >>> logits_processor = LogitsProcessorList(
# |      ...     [
# |      ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
# |      ...     ]
# |      ... )
# |      >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
# |      >>> outputs = model.assisted_decoding(
 #|      ...     input_ids,
 #|      ...     assistant_model=assistant_model,
 #|      ...     logits_processor=logits_processor,
 #|      ...     stopping_criteria=stopping_criteria,




# Define the label list based on your data
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]


class CustomDataset(Dataset):
    def __init__(self, folder_path, tokenizer, label_list, max_length=512, split_ratio=0.8, seed=42):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length
        self.split_ratio = split_ratio
        self.seed = seed

        self._load_data(folder_path)
        self._split_data()

    def _load_data(self, folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as file:
                    for line in file:
                        data = json.loads(line)
                        text = data['text']
                        labels = data['label']
                        tokenized_inputs = self.tokenizer(
                            text,
                            is_split_into_words=True,
                            padding='max_length',
                            truncation=True,
                            max_length=self.max_length,
                            return_tensors='pt'
                        )
                        labels = self.align_labels(labels, tokenized_inputs.input_ids)
                        self.samples.append({
                            'input_ids': tokenized_inputs.input_ids,
                            'attention_mask': tokenized_inputs.attention_mask,
                            'labels': labels
                        })

    def _split_data(self):
        split_index = int(len(self.samples) * self.split_ratio)
        self.train_samples = self.samples[:split_index]
        self.eval_samples = self.samples[split_index:]

    def align_labels(self, labels, input_ids):
        aligned_labels = []
        label_idx = 0
        for i in range(input_ids.size(1)):
            token = self.tokenizer.convert_ids_to_tokens(input_ids[0, i].item())
            if token.startswith("##"):
                aligned_labels.append(-100)
            else:
                if label_idx < len(labels) and i >= labels[label_idx][0] and i <= labels[label_idx][1]:
                    aligned_labels.append(self.label_map.get(labels[label_idx][2], -100))
                else:
                    aligned_labels.append(-100)
                if label_idx < len(labels) and i == labels[label_idx][1]:
                    label_idx += 1
        return torch.tensor(aligned_labels)

    def __len__(self):
        return len(self.train_samples)

    def __getitem__(self, idx):
        sample = self.train_samples[idx]
        return sample

    def get_eval_dataset(self):
        return self.eval_samples


class CustomDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"].squeeze() for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"].squeeze() for feature in features])
        batch["labels"] = torch.stack([feature["labels"].squeeze() for feature in features])

        return batch

tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(label_list))

data_collator = CustomDataCollatorForTokenClassification(tokenizer)
# Initialize dataset
dataset = CustomDataset(folder_path, tokenizer, label_list)

# Get evaluation dataset
eval_dataset = dataset.get_eval_dataset()

# Prepare training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer after defining the model and label_list
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Save the model
model.save_pretrained('./legal_bert_ner_model')
tokenizer.save_pretrained('./legal_bert_ner_model')