In [2]:
import tensorflow as tf
from transformers import AutoTokenizer
import numpy as np
from TopicSegmentation import *

In [None]:
# Initialize LegalBert instance
legal_bert = LegalBert()

# Load the model
loaded_seq2seq_model = tf.keras.models.load_model('seq2seq_model.h5')

# Sample Data
input_text = "Blue na mama a tambay at FYI Corporation filed a lawsuit on August in Compound"

# Prepare data
tokens legal_bert.tokenizer(input_text, return_tensors='pt', truncation=True, padding=True, max_length=16384)

# Predict
predictions = loaded_seq2seq_model.predict(inputs['input_ids'])

In [None]:
predictions

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import (
    Embedding,
    Dropout,
    LayerNormalization,
    Dense,
    MultiHeadAttention,
)

In [4]:
class ModifiedStandardDecoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super(ModifiedStandardDecoder, self).__init__(**kwargs)
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        self.dropout2 = Dropout(dropout_rate)
        self.masked_self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=dropout_rate)
        self.layer_norm1 = LayerNormalization(epsilon=1e-6)
        self.multihead_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=dropout_rate)
        self.layer_norm2 = LayerNormalization(epsilon=1e-6)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dropout(dropout_rate),
            Dense(embedding_dim)
        ])
        self.layer_norm3 = LayerNormalization(epsilon=1e-6)
        self.dropout3 = Dropout(dropout_rate)
        self.dense = Dense(vocab_size)
        self.softmax = tf.keras.activations.softmax

    def call(self, ruling, issues, facts, encoder_outputs, padding_mask=None):
        # Text embedding layer
        embedded_ruling = self.embedding(ruling)
        embedded_issues = self.embedding(issues)
        embedded_facts = self.embedding(facts)

        # Dropout
        embedded_ruling = self.dropout1(embedded_ruling)
        embedded_issues = self.dropout1(embedded_issues)
        embedded_facts = self.dropout1(embedded_facts)

        # Positional encoding
        embedded_ruling = self.positional_encoding(embedded_ruling)
        embedded_issues = self.positional_encoding(embedded_issues)
        embedded_facts = self.positional_encoding(embedded_facts)

        # Dropout
        embedded_ruling = self.dropout2(embedded_ruling)

        print(embedded_ruling)
        
        # Create look-ahead mask
        seq_len = tf.shape(ruling)[1]
        look_ahead_mask = self.create_look_ahead_mask(seq_len)
        look_ahead_mask = tf.convert_to_tensor(look_ahead_mask, dtype=tf.float32)
        look_ahead_mask = look_ahead_mask[tf.newaxis, tf.newaxis, :, :]  # Shape: (1, 1, seq_len, seq_len)

        # Masked multi-head self-attention
        attention_output = self.masked_self_attention(query=embedded, key=embedded, value=embedded, attention_mask=look_ahead_mask)

        # Layer normalization
        attention_output = self.layer_norm1(attention_output + embedded)

        # Multi-head attention with encoder output
        output = self.multihead_attention(query=attention_output, key=encoder_outputs, value=encoder_outputs, attention_mask=padding_mask)

        # Layer normalization
        output = self.layer_norm2(output + attention_output)

        # Feed-forward network
        ffn_output = self.ffn(output)

        # Layer normalization
        ffn_output = self.layer_norm3(ffn_output + output)

        # Dropout
        ffn_output = self.dropout3(ffn_output)

        # Dense layer
        logits = self.dense(ffn_output)

        # Softmax
        predictions = self.softmax(logits)

        return predictions

    def create_look_ahead_mask(self, size):
        mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask

    def save(self,filename):
        self.model.save('bert_model_{}.h5'.format(filename))

    def get_config(self):
        config = super(ModifiedStandardDecoder, self).get_config()
        config.update({
            "vocab_size": self.embedding.input_dim,
            "embedding_dim": self.embedding.output_dim,
            "num_heads": self.masked_self_attention.num_heads,
            "ff_dim": self.ffn.layers[0].units,
            "dropout_rate": self.dropout1.rate,
        })
        return config

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(max_len, d_model)
        
    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
        # Apply sin to even positions and cos to odd positions
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [5]:
vocab_size = 8000
embedding_dim = 768
num_heads = 8
ff_dim = 512
dropout_rate = 0.1

decoder = ModifiedStandardDecoder(vocab_size, embedding_dim, num_heads, ff_dim, dropout_rate)

annotated_inputs = tf.keras.Input(shape=(None,))
encoder_outputs = tf.keras.Input(shape=(None, embedding_dim))

padding_mask = None  # Add padding mask if necessary

outputs = decoder(annotated_inputs, encoder_outputs, padding_mask=padding_mask)
model = tf.keras.Model(inputs=[annotated_inputs, encoder_outputs], outputs=outputs)

model.summary()


Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)
Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)


In [1]:
from TopicSegmentation import LegalBert, ModifiedStandardDecoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import sys, os
# Example usage
vocab_size = 8000
embedding_dim = 768
num_heads = 8
ff_dim = 512
dropout_rate = 0.1

decoder = ModifiedStandardDecoder(vocab_size, embedding_dim, num_heads, ff_dim, dropout_rate)

annotated_inputs = tf.keras.Input(shape=(None,))
encoder_outputs = tf.keras.Input(shape=(None, embedding_dim))

padding_mask = None  # Add padding mask if necessary

outputs = decoder(annotated_inputs, encoder_outputs, padding_mask=padding_mask)
model = tf.keras.Model(inputs=[annotated_inputs, encoder_outputs], outputs=outputs)

model.summary()


Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)
Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)


In [9]:
"""df_text = ["John Doe a lawyer at ABC Corporation filed a lawsuit on January in New York City", 
           "Mama mo a dentist at DFC Corporation filed a lawsuit on January in New York City"]

ruling = ['John Doe a lawyer at ABC Corporation', 'Mama mo a dentist at DFC Corporation']"""

df = pd.read_csv('court.csv')

df = df.iloc[:,2:]

df.dropna(inplace=True)

court_case = df['court case'].to_list()
ruling = df['rulings'].to_list()

df.head()

court case    0
facts         0
issues        0
rulings       0
dtype: int64


Unnamed: 0,court case,facts,issues,rulings
1,SECOND DIVISION\n[ G.R. No. 101798. ]\nPEOPLE ...,"on or about October 19, 1989 in the municipali...","In this appeal, appellant assigns two errors s...","Accordingly, in light of the foregoing statuto..."
2,G.R. No. L-49823\nTHIRD DIVISION\n[ G.R. No. L...,"On 30 September 1977, the City Court of Manila...",Whether the City Court of Manila has the juris...,This petition is impressed with merit.\r\n\r\n...
3,G.R. No. L-27120\r\n[ G.R. No. L-27120. ]\r\nT...,Petition for certiorari against the order of t...,is it correct to say that the respondent judge...,The declara­tions constitute judicial admissio...
4,THIRD DIVISION\n[ G.R. Nos. 255324 & 255353. A...,"Via this Petition for Review on Certiorari,[1]...","Through the instant Petition, petitioner raise...","On April 22, 2005, respondent filed with the C..."
5,"FIRST DIVISION\n[ G.R. No. 238714. August 30, ...",Respondents claim ownership over a 51.24-squar...,Petitioner raises the sole issue of whether th...,"In its November 4, 2014 Decision, the RTC decr..."


In [None]:
# Initialize the preprocessor and legal BERT
legal_bert = LegalBert()

bert_output = legal_bert.get_context_vectors(court_case)


# Tokenize the segments
tokenizer = Tokenizer(num_words=8000)  # Adjust num_words according to your vocabulary size
tokenizer.fit_on_texts(ruling)
tokenized_segments = tokenizer.texts_to_sequences(ruling)

max_seq_len = bert_output.shape[1]

# Pad the sequences to ensure uniform length
padded_segments = pad_sequences(tokenized_segments, padding='post', maxlen=max_seq_len)

# Convert to tensor
padded_segments = tf.convert_to_tensor(padded_segments)

print(f'Legal BERT shape: {bert_output.shape}')
print(f'Modified Decoder shape: {padded_segments.shape}')

# Ensure xtrain and ytrain have the same batch size
assert bert_output.shape[0] == padded_segments.shape[0]

xtrain = bert_output.detach().numpy()
ytrain = padded_segments

# Compile the model
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit([ytrain, xtrain], ytrain, epochs=5)

model.save('seqtoseq.h5')