In [None]:
#!unzip jsonl_files.zip

# From here on out

In [1]:
# model attempt 1
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

folder_path = os.path.join("content/jsonl_files")
#tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# |      >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
## |      >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
# |      >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
# |      >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
# |      >>> input_prompt = "It might be possible to"
# |      >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
# |      >>> # instantiate logits processors
# |      >>> logits_processor = LogitsProcessorList(
# |      ...     [
# |      ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
# |      ...     ]
# |      ... )
# |      >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
# |      >>> outputs = model.assisted_decoding(
 #|      ...     input_ids,
 #|      ...     assistant_model=assistant_model,
 #|      ...     logits_processor=logits_processor,
 #|      ...     stopping_criteria=stopping_criteria,




# Define the label list based on your data
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]


class CustomDataset(Dataset):
    def __init__(self, folder_path, tokenizer, label_list, max_length=512, split_ratio=0.8, seed=42):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length
        self.split_ratio = split_ratio
        self.seed = seed

        self._load_data(folder_path)
        self._split_data()

    def _load_data(self, folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.jsonl'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as file:
                    for line in file:
                        data = json.loads(line)
                        text = data['text']
                        labels = data['label']
                        tokenized_inputs = self.tokenizer(
                            text,
                            is_split_into_words=True,
                            padding='max_length',
                            truncation=True,
                            max_length=self.max_length,
                            return_tensors='pt'
                        )
                        labels = self.align_labels(labels, tokenized_inputs.input_ids)
                        self.samples.append({
                            'input_ids': tokenized_inputs.input_ids,
                            'attention_mask': tokenized_inputs.attention_mask,
                            'labels': labels
                        })

    def _split_data(self):
        split_index = int(len(self.samples) * self.split_ratio)
        self.train_samples = self.samples[:split_index]
        self.eval_samples = self.samples[split_index:]

    def align_labels(self, labels, input_ids):
        aligned_labels = []
        label_idx = 0
        for i in range(input_ids.size(1)):
            token = self.tokenizer.convert_ids_to_tokens(input_ids[0, i].item())
            if token.startswith("##"):
                aligned_labels.append(-100)
            else:
                if label_idx < len(labels) and i >= labels[label_idx][0] and i <= labels[label_idx][1]:
                    aligned_labels.append(self.label_map.get(labels[label_idx][2], -100))
                else:
                    aligned_labels.append(-100)
                if label_idx < len(labels) and i == labels[label_idx][1]:
                    label_idx += 1
        return torch.tensor(aligned_labels)

    def __len__(self):
        return len(self.train_samples)

    def __getitem__(self, idx):
        sample = self.train_samples[idx]
        return sample

    def get_eval_dataset(self):
        return self.eval_samples


class CustomDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"].squeeze() for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"].squeeze() for feature in features])
        batch["labels"] = torch.stack([feature["labels"].squeeze() for feature in features])

        return batch

tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForTokenClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(label_list))

data_collator = CustomDataCollatorForTokenClassification(tokenizer)
# Initialize dataset
dataset = CustomDataset(folder_path, tokenizer, label_list)

# Get evaluation dataset
eval_dataset = dataset.get_eval_dataset()

# Prepare training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer after defining the model and label_list
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()



# Define label list
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]

# Set label mapping
model.config.id2label = {i: label for i, label in enumerate(label_list)}
model.config.label2id = {label: i for i, label in enumerate(label_list)}

# Save the model
model.save_pretrained('./legal_bert_ner_model5')
tokenizer.save_pretrained('./legal_bert_ner_model5')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


('./legal_bert_ner_model5/tokenizer_config.json',
 './legal_bert_ner_model5/special_tokens_map.json',
 './legal_bert_ner_model5/vocab.txt',
 './legal_bert_ner_model5/added_tokens.json')

In [2]:
# evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Get the predicted labels
predictions = trainer.predict(eval_dataset)

# Extract predicted label IDs
predicted_label_ids = np.argmax(predictions.predictions, axis=2)

# Flatten the predictions and labels to calculate accuracy and F1 score
flat_predictions = np.concatenate(predicted_label_ids)
flat_labels = np.concatenate([eval_dataset[i]["labels"].numpy() for i in range(len(eval_dataset))])

# Calculate accuracy
accuracy = accuracy_score(flat_labels, flat_predictions)

# Calculate precision, recall, F1 score
precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_predictions, average='micro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.098388671875
Precision: 0.098388671875
Recall: 0.098388671875
F1 Score: 0.098388671875


In [3]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the device
model.to(device)

# Set the model in evaluation mode
model.eval()

# Iterate over the evaluation dataset and make predictions
predictions = []
for batch in eval_dataset:
    # Move inputs to the appropriate device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Disable gradient calculation
    with torch.no_grad():
        # Forward pass, get logits
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get the predicted labels
    predicted_labels = torch.argmax(outputs.logits, dim=-1)
    
    # Append predictions to the list
    predictions.append(predicted_labels.detach().cpu().numpy())

# Convert the list of predictions to a single numpy array
predictions = np.concatenate(predictions)

# Now you have predictions for all samples in the evaluation dataset
print(predictions)

# Define a function to convert integer labels to label names
def convert_labels_to_names(predictions, label_list):
    label_names = []
    for pred in predictions:
        label_names.append([label_list[idx] for idx in pred])
    return label_names

# Convert predicted labels to label names
predicted_label_names = convert_labels_to_names(predictions, label_list)

# Print some examples of predicted labels
for i in range(5):  # Print the first 5 examples
    print("Predicted Labels:", predicted_label_names[i])

[[ 3 11 11 ...  1  3 11]
 [ 1 11 11 ...  3  1 11]
 [11 11 11 ...  1  3 11]
 ...
 [ 1 11  3 ...  3  3 11]
 [ 1 11  3 ...  1  1 11]
 [ 1 11 11 ...  1  1 11]]
Predicted Labels: ['RESPONDENT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', 'COURT', '

In [4]:
import json
import torch
from transformers import BertForTokenClassification, BertTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

class CustomDataset2(Dataset):
    def __init__(self, file_path, tokenizer, label_list, max_length=512, split_ratio=0.8, seed=42):
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.samples = []
        self.max_length = max_length
        self.split_ratio = split_ratio
        self.seed = seed

        self._load_data(file_path)
        self._split_data()

    def _load_data(self, file_path2):
        with open(file_path2, 'r') as file:
            for line in file:
                data = json.loads(line)
                text = data['text']
                labels = data['label']
                tokenized_inputs = self.tokenizer(
                    text,
                    is_split_into_words=True,
                    padding='max_length',
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt'
                )
                labels = self.align_labels(labels, tokenized_inputs.input_ids)
                self.samples.append({
                    'input_ids': tokenized_inputs.input_ids,
                    'attention_mask': tokenized_inputs.attention_mask,
                    'labels': labels
                })

    def _split_data(self):
        split_index = int(len(self.samples) * self.split_ratio)
        self.train_samples = self.samples[:split_index]
        self.eval_samples = self.samples[split_index:]

    import torch

    def align_labels(self, labels, input_ids):
        aligned_labels = []
        label_idx = 0
        for i in range(input_ids.size(1)):
            token = self.tokenizer.convert_ids_to_tokens(input_ids[0, i].item())
            # Check if the token is a special token (e.g., [CLS], [SEP], [PAD])
            if token.startswith("##") or token in ["[CLS]", "[SEP]"]:
                aligned_labels.append(-100)  # Assign -100 to special tokens
            else:
                # Check if the current position corresponds to a label span
                if label_idx < len(labels) and i >= labels[label_idx][0] and i <= labels[label_idx][1]:
                    aligned_labels.append(self.label_map.get(labels[label_idx][2], -100))  # Assign label or -100 if not found
                else:
                    aligned_labels.append(-100)  # Assign -100 if no label is present for this position
                # Move to the next label span if the end position is reached
                if label_idx < len(labels) and i == labels[label_idx][1]:
                    label_idx += 1
        return torch.tensor([aligned_labels])


    def __len__(self):
        return len(self.train_samples)

    def __getitem__(self, idx):
        sample = self.train_samples[idx]
        return sample

    def get_eval_dataset(self):
        return self.eval_samples

class CustomDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"].squeeze() for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"].squeeze() for feature in features])
        batch["labels"] = torch.stack([feature["labels"].squeeze() for feature in features])

        return batch


# Load the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained("legal_bert_ner_model5")
tokenizer = BertTokenizer.from_pretrained("legal_bert_ner_model5")

data_collator = CustomDataCollatorForTokenClassification(tokenizer)

# Define the file path
file_path = "content/admin.jsonl"

# Define the label list
label_list = ["PRECEDENT", "LAWYER", "JUDGE", "RESPONDENT", "GPE",
              "DATE", "OTHER_PERSON", "PROVISION", "ORG", "PETITIONER",
              "WITNESS", "COURT", "STATUTE", "CASE_NUMBER"]

# Initialize dataset
dataset2 = CustomDataset2(file_path, tokenizer, label_list)
# Print a sample from the evaluation dataset
sample = dataset2[0]
print("Sample:", sample)

# Prepare training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
)
eval_dataset=dataset2.get_eval_dataset()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset2,
    tokenizer=tokenizer,
    data_collator = data_collator,
    eval_dataset=eval_dataset
)


# Train the model
trainer.train()

# Evaluate the model on the evaluation dataset
eval_results = trainer.evaluate(eval_dataset=dataset2.get_eval_dataset())

# Print the evaluation results
print("Evaluation results:", eval_results)

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model')


Sample: {'input_ids': tensor([[  101, 11965,   177,  3849,  4675,  7025,   179,   166,  4473,   184,
          6582,   185,  3378,   185,   126,   638,   128,   163,  6441,   173,
          1147,   256,   394,   119, 12968,  5014,   118,   611,   115,  6457,
          8322,  2311,   118,   611,   273,   254,   119,   556,   692,   638,
          2309,   118,   240,   119,  1026,   240,  5288,  4107,   119,  7028,
          4383,  4300, 24691,  9259,   177,   154,   173,   661,   505,   111,
           163,   112,   119, 10648,   395,   178,   163,  7979,  3847, 20625,
           212,  8436, 20112,  8764, 10177,  2474,   111,  6247,  3272, 15726,
          8764, 10177,  2474,   212,  1523,   112,   217,   207,   272,   120,
         15786, 11872,  5647,   111,  2087,   212,   408,   486,   110,   163,
           648,   112,   217,   207,   408,   486,   276,   119, 11965,   177,
          3849,  4675,  7025,   179,   116,   116,  4473,   184,  6582,   185,
          3378,   185,   966, 

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluation results: {'eval_loss': 1.7395979166030884, 'eval_runtime': 0.1644, 'eval_samples_per_second': 12.168, 'eval_steps_per_second': 6.084, 'epoch': 3.0}


In [5]:
# Get predictions from the model
predictions = trainer.predict(eval_dataset)

# Print the shape of the predictions
print("Predictions shape:", predictions.predictions.shape)

# Print the first few predictions for inspection
print("Sample predictions:", predictions.predictions[:5])


Predictions shape: (2, 512, 14)
Sample predictions: [[[-0.76212335  1.3703938   0.6915548  ...  0.53673005 -0.6069215
    0.81710404]
  [ 0.15071033  0.7018597   0.95319915 ...  0.8818127  -0.78526247
    1.0575448 ]
  [-0.20709898  0.34900543  0.72603923 ...  0.66386414 -0.7663557
    0.9665575 ]
  ...
  [-0.9306531   1.9120032   1.057309   ... -0.0152879  -0.38310352
    0.35696003]
  [-0.6381649   2.0791337   1.4263552  ...  0.03515275 -0.40768763
    0.7192615 ]
  [-0.52926654  0.7358375   0.2904504  ...  0.78621686 -0.18602158
    1.3332641 ]]

 [[-0.5901942   1.033705    0.61574554 ...  0.75337225 -0.65282446
    0.88701206]
  [ 0.14502409  0.5168922   0.742612   ...  1.6869161  -0.8672279
    0.5946497 ]
  [-0.44689894  0.5852126   0.6448316  ...  1.4282548  -1.0369414
    0.63747656]
  ...
  [-0.5909515   1.947429    1.2512305  ...  0.60361916 -0.6016633
    0.99773526]
  [-0.4940542   1.9479268   1.2256385  ...  0.517805   -0.70657283
    0.5760161 ]
  [-0.6150603   0.6197026 

In [6]:
import numpy as np

# Calculate accuracy manually
correct_predictions = np.sum(np.array_equal(true_label, pred_label) for true_label, pred_label in zip(flat_labels, flat_predictions))
accuracy = correct_predictions / len(flat_predictions)
print("Accuracy:", accuracy)


print("Length of flat_labels:", len(flat_labels))
print("Length of flat_predictions:", len(flat_predictions))


Accuracy: 0.098388671875
Length of flat_labels: 4096
Length of flat_predictions: 4096


  correct_predictions = np.sum(np.array_equal(true_label, pred_label) for true_label, pred_label in zip(flat_labels, flat_predictions))


In [7]:
# Print the predictions
print("Predictions:", flat_predictions)
for i in range(5):
    print(flat_predictions[i])


Predictions: [ 3 11 11 ...  1  1 11]
3
11
11
11
11


In [66]:
print(correct_predictions)

0
