In [1]:
from TopicSegmentation import LegalBert, ModifiedStandardDecoder, PaddingMaskLayer
from nltk.tokenize import RegexpTokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import re
from Preprocessing import *

# Instantiate Model Architecture

In [5]:
# Model Configuration
vocab_size = 31000
embedding_dim = 768
num_heads = 8
ff_dim = 1024
dropout_rate = 0.1

decoder = ModifiedStandardDecoder(vocab_size, embedding_dim, num_heads, ff_dim, dropout_rate)

# Inputs to the model
segment_inputs = tf.keras.Input(shape=(None,), name="Segment_Input_Layer") # refers to the tokens fed into one at a time
encoder_outputs = tf.keras.Input(shape=(None, embedding_dim), name="Context_Vector_Input_Layer")

padding_mask_layer = PaddingMaskLayer(num_heads=num_heads, name="Padding_Mask_Layer")
padding_mask = padding_mask_layer(segment_inputs)

outputs = decoder(segment_inputs, encoder_outputs, padding_mask=padding_mask)
model_issues = tf.keras.Model(inputs=[encoder_outputs, segment_inputs], outputs=outputs)

model_issues.summary()

# Preprocess Data

In [6]:
x = preprocess("new_court_cases.csv")
court_cases = x.give_courtcases()
facts = x.give_rfi()

shape of initial dataframe:  (325, 4)
shape of dataframe when null values were dropped:  (325, 4)
shape of dataframe when preprocessed and duplicated values where dropped:  (325, 4)


# Instantiate Legal BERT Preprocessor and Encoder

In [7]:
# Initialize the preprocessor and legal BERT
legal_bert = LegalBert()

# Get context vectors of the whole court cases in batches
bert_output = legal_bert.get_context_vectors(court_cases, batch_size=32)

  return self.fget.__get__(instance, owner)()


In [8]:
bert_output.shape

torch.Size([325, 1024, 768])

# Tokenize and add paddings to each data for each model

In [9]:
tokenizer_issues = legal_bert.tokenizer
# Tokenize the issues segments using LegalBERT tokenizer
tokenized_segments_issues = tokenizer_issues(issues, padding=True, truncation=True, max_length=1024, return_tensors='tf')

# Convert to tensors
input_ids = tokenized_segments_issues['input_ids']

# Shift the input sequences to the right by one position
shifted_segments_issues = np.zeros_like(input_ids.numpy())
shifted_segments_issues[:, 1:] = input_ids[:, :-1]  # Shift right
shifted_segments_issues[:, 0] = tokenizer_issues.cls_token_id  # Use BERT's [CLS] token ID as the start token

# Convert shifted sequences to tensor
shifted_segments_issues = tf.convert_to_tensor(shifted_segments_issues)

# Set the court cases to xtrain
xbert_train = bert_output.detach().numpy()
xtrain = tf.convert_to_tensor(xbert_train, dtype=tf.float32)

# Set the segments to the ytrain and ytrain shifted
ytrain_issues = tf.convert_to_tensor(input_ids, dtype=tf.int32)
ytrain_shifted = tf.convert_to_tensor(shifted_segments_issues, dtype=tf.int32)

In [10]:
print("xtrain shape:", xtrain.shape)  # Should be (batch_size, seq_len, embedding_dim)
print("ytrain_shifted shape:", ytrain_shifted.shape)  # Should be (batch_size, seq_len)
print("ytrain_issues shape:", ytrain_issues.shape)  # Should be (batch_size, seq_len)

xtrain shape: (325, 1024, 768)
ytrain_shifted shape: (325, 1024)
ytrain_issues shape: (325, 1024)


# Train Models

In [11]:
# Compile the models
model_issues.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_issues.fit([xtrain, ytrain_issues], ytrain_shifted, epochs=3)
model_issues.save('facts_seq_to_seq.keras')

Epoch 1/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m822s[0m 72s/step - accuracy: 0.0412 - loss: 12.6687
Epoch 2/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 70s/step - accuracy: 0.0623 - loss: 9.0652
Epoch 3/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m888s[0m 77s/step - accuracy: 0.0403 - loss: 9.3904


In [11]:
class PrintPredictionsCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # Prepare a sample input (tokenized and padded sequence)
        sample_court_case_input = ytrain_issues[:1]  # Shape: (1, sequence_length)
        sample_context_vector_input = xtrain[:1]  # Shape: (1, sequence_length, embedding_dim)
        
        # Make a prediction
        sample_output = self.model.predict([sample_court_case_input, sample_context_vector_input])
        
        # Decode the prediction
        decoded_output = tokenizer_issues.sequences_to_texts(sample_output.argmax(-1))
        
        print(f"\nSample Prediction after epoch {epoch+1}: {decoded_output}")

# Apply the callback during training
model_issues.fit([ytrain_issues, xtrain], ytrain_issues, epochs=3, callbacks=[PrintPredictionsCallback()])


Epoch 1/3


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_5_1/Cast:0", shape=(None, 1024), dtype=float32). Expected shape (None, None, 768), but input has incompatible shape (None, 1024)[0m

Arguments received by Functional.call():
  • inputs=('tf.Tensor(shape=(None, 1024), dtype=int32)', 'tf.Tensor(shape=(None, 1024, 768), dtype=float32)')
  • training=True
  • mask=('None', 'None')