# Import All Modules

In [1]:
import tensorflow as tf
from transformers import AutoTokenizer
import numpy as np
from TopicSegmentation import *
import re
from Preprocessing import *

# Sample text

In [2]:
# Build the project's preprocessing object from the court-cases CSV, then pull
# out the two collections used by the rest of the notebook.
# `preprocess` comes from one of the star imports at the top of the notebook.
x = preprocess("new_court_cases.csv")
court_cases = x.give_courtcases()
# NOTE(review): "rfi" is not expanded anywhere in this notebook — confirm what
# give_rfi() returns; `issues` is not referenced again in the visible cells.
issues = x.give_rfi()

shape of initial dataframe:  (325, 4)
shape of dataframe when null values were dropped:  (325, 4)
shape of dataframe when preprocessed and duplicated values where dropped:  (325, 4)


In [3]:
# Single-element list holding only the first court case; it is kept as a list
# because it is later passed as-is to legal_bert.get_context_vectors(sample).
sample = [court_cases[0]]

# Load Models

In [4]:
# Define a function to load the model with custom objects
def load_model_with_custom_objects(model_path):
    """Load a saved Keras model with the project's custom layers registered.

    Args:
        model_path: Path handed unchanged to ``tf.keras.models.load_model``.

    Returns:
        Whatever ``tf.keras.models.load_model`` returns for ``model_path``.
    """
    # Class names that must be resolvable while the model is deserialized.
    layer_registry = dict(
        ModifiedStandardDecoder=ModifiedStandardDecoder,
        PaddingMaskLayer=PaddingMaskLayer,
    )

    # The registry is only active for the duration of the load call.
    with tf.keras.utils.custom_object_scope(layer_registry):
        return tf.keras.models.load_model(model_path)

# Load the single saved seq-to-seq model used by every inference cell below.
# NOTE(review): the file is named 'facts_seq_to_seq.keras' but the variable is
# called `model_issues` — confirm this is the intended checkpoint.
model_issues = load_model_with_custom_objects('facts_seq_to_seq.keras')






# Initialize the testing data

In [5]:
# Initialize LegalBert instance (class provided by one of the star imports)
legal_bert = LegalBert()

# Preprocess the text: get context vectors from the BERT encoder.
# `sample` is a one-element list, so a single document is encoded here.
context_vectors = legal_bert.get_context_vectors(sample)

# Convert to a tensor for the model input.
# The encoder output is detached and turned into a NumPy array before being
# handed to TensorFlow; note that `context_vectors` is rebound here from the
# encoder's own output object to a float32 TensorFlow tensor.
context_vectors = tf.convert_to_tensor(context_vectors.detach().numpy(), dtype=tf.float32)

# Sanity check on the resulting tensor's shape.
print("Context Vector Shape:", context_vectors.shape)

  return self.fget.__get__(instance, owner)()


Context Vector Shape: (1, 1024, 768)


In [6]:
# Initialize the starting sequence with the tokenizer's [CLS] token.
# The id is read from the tokenizer rather than hard-coded, so no particular
# numeric value is assumed by this cell.
start_token_id = legal_bert.tokenizer.cls_token_id
# Decoder-side token list, seeded with the start token only.
generated_tokens = [start_token_id]
# Upper bound on the number of decoding steps used by the inference cells.
max_length = 50

In [7]:
# Wrap the token list in an outer list so the tensor gets a leading axis of
# size one, then display the shape as the cell output.
decoder_input = tf.convert_to_tensor([generated_tokens], dtype=tf.int32)
decoder_input.shape

TensorShape([1, 1])

In [8]:
# Display the encoder-side tensor shape next to the decoder input shape above.
context_vectors.shape

TensorShape([1, 1024, 768])

# Model Inference

In [29]:
# Disabled draft: the entire cell body is one triple-quoted string literal, so
# running this cell does NOT define generate_text. It is kept only as a record
# of the first greedy-decoding attempt (rebuilds decoder_input from the token
# list on every step and stops on the tokenizer's [SEP] id).
"""# Function to perform inference with modified decoder input handling
def generate_text(context_vectors, start_token_id, max_length):
    # Keep track of generated tokens
    generated_tokens = [start_token_id]
    
    for i in range(max_length):
        # Convert current token list to tensor
        decoder_input = tf.convert_to_tensor([generated_tokens], dtype=tf.int32)

        # Predict the next token
        predictions = model_issues([context_vectors, decoder_input], training=False)  # Use model call directly
        
        # Get the token ID with the highest probability
        next_token_id = np.argmax(predictions[0, -1, :])
        
        # Append the predicted token to the generated sequence
        generated_tokens.append(next_token_id)
        
        # Check for stop condition, such as reaching the [SEP] token
        if next_token_id == legal_bert.tokenizer.sep_token_id:
            break
    
    return generated_tokens"""

In [30]:
# Generate text
# NOTE(review): in this notebook generate_text only exists inside a string
# literal (the cell above), so on a fresh kernel this call raises NameError
# unless the function is defined somewhere outside the visible cells. The
# traceback recorded below comes from an earlier session where it was defined.
generated_sequence = generate_text(context_vectors, start_token_id, max_length)

InvalidArgumentError: Exception encountered when calling Softmax.call().

[1m{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [1,8,2,1024] vs. [1,1,1,2] [Op:AddV2] name: [0m

Arguments received by Softmax.call():
  • inputs=tf.Tensor(shape=(1, 8, 2, 1024), dtype=float32)
  • mask=tf.Tensor(shape=(1, 1, 1, 2), dtype=float32)

In [42]:
# Disabled draft: the whole cell is a single string literal, so running it does
# not define predict_sequence. Kept as a record of an earlier attempt.
# Inside the draft, the `decoder` parameter is never used — the global
# `model_issues` is called instead.
"""# Inference Function
def predict_sequence(encoder_input, decoder, max_length=50):
    # Encode the input sequence to get the context vectors
    context_vectors_pt = legal_bert.get_context_vectors([encoder_input])  # Pass raw text input
    
    # Convert the PyTorch tensor to a NumPy array and then to a TensorFlow tensor
    context_vectors_tf = tf.convert_to_tensor(context_vectors_pt.detach().numpy(), dtype=tf.float32)
    
    # Start with the initial token, usually <s> or equivalent for the model
    start_token_id = legal_bert.tokenizer.cls_token_id
    decoder_input = tf.convert_to_tensor([[start_token_id]], dtype=tf.int32)

    generated_tokens = []

    for _ in range(max_length):
        # Predict the next token
        predictions = model_issues([context_vectors_tf, decoder_input], training=False)  # Use model call directly
        
        # Get the most likely token
        predicted_token = tf.argmax(predictions[:, -1, :], axis=-1)
        
        # Append predicted token to generated sequence
        generated_tokens.append(predicted_token.numpy()[0])
        
        # Check for end of sequence token and break
        if predicted_token.numpy()[0] == legal_bert.tokenizer.sep_token_id:  # Use SEP token ID
            break

        # Convert predicted_token to int32
        predicted_token = tf.cast(predicted_token, tf.int32)
        
        # Update decoder input for the next prediction step
        decoder_input = tf.concat([decoder_input, tf.expand_dims(predicted_token, axis=-1)], axis=1)

    return legal_bert.tokenizer.decode(generated_tokens)"""

In [43]:
# Example usage:
# NOTE(review): the only executable definition of predict_sequence in this
# notebook appears in a later cell, so running the notebook top to bottom on a
# fresh kernel fails here with NameError. The traceback below was recorded in
# an earlier session (execution counts are out of order).
encoder_input = "The quick brown fox jumps over the lazy dog."  # Sample input text
generated_text = predict_sequence(encoder_input, model_issues, max_length=50)
print("Generated Text:", generated_text)


InvalidArgumentError: Exception encountered when calling Softmax.call().

[1m{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [1,8,2,14] vs. [1,1,1,2] [Op:AddV2] name: [0m

Arguments received by Softmax.call():
  • inputs=tf.Tensor(shape=(1, 8, 2, 14), dtype=float32)
  • mask=tf.Tensor(shape=(1, 1, 1, 2), dtype=float32)

In [52]:
# Disabled draft: the whole cell is a single string literal, so running it does
# not define predict_sequence. Kept as a record of the padding-mask experiment.
# Two things to know when reading the draft:
#   * attention_mask is built on every step but never passed to the model call;
#   * the line under "Convert predicted_token to int32" actually casts
#     decoder_input to int64 so it can be concatenated with the argmax result.
"""# Inference Function with Padding Mask Adjustments
def predict_sequence(encoder_input, decoder, max_length=50):
    # Encode the input sequence to get the context vectors
    context_vectors = legal_bert.get_context_vectors([encoder_input])  # Pass raw text input
    
    # Convert context_vectors to TensorFlow tensor and set proper dtype
    context_vectors_tf = tf.convert_to_tensor(context_vectors.detach().numpy(), dtype=tf.float32)
    
    # Start with the initial token, usually [CLS] or equivalent for the model
    start_token_id = legal_bert.tokenizer.cls_token_id
    decoder_input = tf.convert_to_tensor([[start_token_id]], dtype=tf.int32)
    
    # For testing, print initial shapes
    print("Initial context vector shape:", context_vectors_tf.shape)
    print("Initial decoder input shape:", decoder_input.shape)
    
    generated_tokens = []
    
    for step in range(max_length):
        # Generate the mask for the decoder input
        attention_mask = tf.cast(tf.sequence_mask(tf.shape(decoder_input)[1]), dtype=tf.float32)
        
        # Expand dimensions of attention_mask to match required shape
        attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 1)
        
        # Predict the next token using the decoder
        predictions = model_issues([context_vectors_tf, decoder_input], training=False)
        
        # Get the most likely token
        predicted_token = tf.argmax(predictions[:, -1, :], axis=-1)

        # Print the predicted token for debugging
        print(f"Step {step + 1} - Predicted Token ID: {predicted_token.numpy()[0]}")
        
        # Append predicted token to generated sequence
        generated_tokens.append(predicted_token.numpy()[0])
        
        # Check for end of sequence token and break
        if predicted_token.numpy()[0] == legal_bert.tokenizer.sep_token_id:
            break

        # Convert predicted_token to int32
        decoder_input = tf.cast(decoder_input, tf.int64)
        
        # Update decoder input for the next prediction step
        decoder_input = tf.concat([decoder_input, tf.expand_dims(predicted_token, axis=-1)], axis=1)
    
    return legal_bert.tokenizer.decode(generated_tokens)"""

In [19]:
# Inference Function with No Mask in Inference Mode
def predict_sequence(encoder_input, decoder, max_length=50):
    """Greedily decode an output sequence for one raw input text.

    The text is encoded with the notebook-level ``legal_bert`` instance, then
    ``decoder`` is called repeatedly: at each step the highest-scoring token at
    the last decoder position is appended to the decoder input, until the
    tokenizer's [SEP] id is produced or ``max_length`` steps have run.

    Args:
        encoder_input: A single raw text. It is wrapped in a one-element list
            before being passed to ``legal_bert.get_context_vectors``.
        decoder: Callable invoked as
            ``decoder([context_vectors, decoder_input], training=False)``.
            Its result is indexed as ``[:, -1, :]`` and arg-maxed over the
            last axis (presumably batch x positions x vocabulary — confirm
            against the model definition).
        max_length: Maximum number of decoding steps.

    Returns:
        The result of ``legal_bert.tokenizer.decode`` on the generated token
        ids. The start token is not included; a generated [SEP] id, if any, is
        included as the last id passed to ``decode``.
    """
    tokenizer = legal_bert.tokenizer

    # Encode the raw text. The encoder output is detached and converted to
    # NumPy before being handed to TensorFlow, the same way the notebook does
    # when it builds `context_vectors` for the sample document.
    encoder_output = legal_bert.get_context_vectors([encoder_input])
    context_vectors_tf = tf.convert_to_tensor(
        encoder_output.detach().numpy(), dtype=tf.float32
    )

    # Decoder input starts as a (1, 1) tensor holding only the [CLS] id.
    decoder_input = tf.convert_to_tensor([[tokenizer.cls_token_id]], dtype=tf.int32)
    sep_token_id = tokenizer.sep_token_id

    generated_tokens = []

    for _ in range(max_length):
        # Use the decoder that was passed in rather than a notebook global, so
        # the function works with any compatible model.
        predictions = decoder([context_vectors_tf, decoder_input], training=False)

        # Most likely token at the last decoder position.
        predicted_token = tf.argmax(predictions[:, -1, :], axis=-1)

        # Pull the scalar id out once and reuse it for both the append and the
        # stop check (only the first batch element is read).
        predicted_id = int(predicted_token.numpy()[0])
        generated_tokens.append(predicted_id)

        # Stop as soon as the end-of-sequence token is produced.
        if predicted_id == sep_token_id:
            break

        # argmax yields int64; cast so it can be concatenated with the int32
        # decoder input, then append it as a new trailing position.
        predicted_token = tf.cast(predicted_token, tf.int32)
        decoder_input = tf.concat(
            [decoder_input, tf.expand_dims(predicted_token, axis=-1)], axis=1
        )

    return tokenizer.decode(generated_tokens)

In [20]:
# Usage Example
# Runs greedy decoding on the first court case (the only element of `sample`)
# with the loaded model and prints the decoded text.
encoder_input = sample[0]
generated_sequence = predict_sequence(encoder_input, model_issues, max_length=50)
print("Generated Sequence:", generated_sequence)

Generated Sequence: president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president president


In [22]:
# Display the raw input text that produced the generated sequence above.
sample[0]

'section 258 of republic act ( r . a .) no . 7160 , also known as the local government code of 1991 ( lgc ), requires the treasurer of the local government to send the warrant of levy to the delinquent owner of the real property , among others . the term “ delinquent owner ” shall be construed as the person registered as owner of the realty based on the certificate of title , and not on the tax declaration . the failure of the treasurer to send the required notices to the delinquent registered owner of the property shall render void not only the levy , but the consequent public auction and sale of the subject property . this appeal by certiorari filed by the city government of antipolo and the city treasurer of antipolo ( city treasurer ; collectively , petitioners ), seek to reverse and set aside the november 18 , 2016 decision and the october 2 , 2017 order of the regional trial court of antipolo city , branch 99 ( rtc ) in civil case no . 14 - 10486 [ 6 ] which declared the forfeitu