In [1]:
from TopicSegmentation import *
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import torch

# Instantiate Model Architecture

In [2]:
# Hyperparameters for the decoder built in the next cell. Every value is
# forwarded to ModifiedStandardDecoder; embedding_dim additionally fixes the
# feature width of the encoder-output Input.
dropout_rate = 0.1
ff_dim = 1_024
num_heads = 8
embedding_dim = 768
vocab_size = 5_000

In [3]:
# Decoder layer configured from the hyperparameter cell above.
decoder = ModifiedStandardDecoder(vocab_size, embedding_dim, num_heads, ff_dim, dropout_rate)

# Symbolic inputs of the functional model:
#   annotated_inputs - one variable-length 1-D sequence per sample
#   encoder_outputs  - one variable-length sequence of embedding_dim-wide vectors per sample
annotated_inputs = tf.keras.Input(shape=(None,))
encoder_outputs = tf.keras.Input(shape=(None, embedding_dim))

# The mask is derived from the first input only.
# NOTE(review): presumably flags zero-padded positions -- confirm in TopicSegmentation.
padding_mask_layer = PaddingMaskLayer()
padding_mask = padding_mask_layer(annotated_inputs)

# Wire both inputs and the mask through the decoder and wrap the result in a Keras model.
outputs = decoder(annotated_inputs, encoder_outputs, padding_mask=padding_mask)
model_issues = tf.keras.Model(inputs=[annotated_inputs, encoder_outputs], outputs=outputs) # model for issues

# Print the layer-by-layer summary of the assembled model.
model_issues.summary()


Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)
Tensor("positional_encoding_1/add:0", shape=(None, None, 768), dtype=float32)


# Load Data

In [4]:
# Read the dataset, discard its first two columns and keep only the rows
# without missing values, as one chained expression (no in-place mutation).
df = (
    pd.read_csv('court.csv')
    .iloc[:, 2:]
    .dropna()
)

# Plain Python lists of the two text columns consumed by the later cells.
court_case = df['court case'].tolist()
issues = df['issues'].tolist()

# Preprocess Data

## Instantiate preprocessor and encoder

In [5]:
# Create the LegalBert wrapper and run it over the list of court-case texts;
# the result is used further down as the encoder-side input of the model.
# NOTE(review): later cells read .shape[1] and call .detach().numpy() on
# bert_output, so it is presumably a torch tensor -- confirm in TopicSegmentation.
legal_bert = LegalBert()

bert_output = legal_bert.get_context_vectors(court_case)

  return self.fget.__get__(instance, owner)()


## Tokenize and pad the target sequences for each model

In [6]:
# Tokenize the issues segments. The tokenizer's word cap is tied to vocab_size
# (instead of a second hard-coded 5000) so the ids it produces cannot drift
# out of step with the vocabulary size the decoder was built with.
tokenizer_issues = Tokenizer(num_words=vocab_size)
tokenizer_issues.fit_on_texts(issues)
tokenized_segments_issues = tokenizer_issues.texts_to_sequences(issues)

# Common sequence length: the second dimension of bert_output, capped at 1024
max_seq_len = min(bert_output.shape[1], 1024)

# Pad the sequences at the end so they all share the same length.
# NOTE(review): `truncating` keeps its default ('pre'), so a sequence longer
# than max_seq_len loses its beginning rather than its end -- confirm intended.
padded_segments_issues = pad_sequences(tokenized_segments_issues, padding='post', maxlen=max_seq_len)

# Convert to tensor
padded_segments_issues = tf.convert_to_tensor(padded_segments_issues)

# .cpu() is a no-op for a tensor already on the CPU and prevents .numpy() from
# raising when bert_output was computed on a GPU.
xtrain = bert_output.detach().cpu().numpy() # every model shares the same x data; only the y data differs
ytrain_issues = padded_segments_issues

# Train Models

In [7]:
# Configure training: RMSprop with a 1e-4 learning rate, sparse categorical
# cross-entropy as the loss, and accuracy as the reported metric.
model_issues.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the issues model for three epochs, then save it to disk.
# NOTE(review): ytrain_issues is supplied both as the first model input and as
# the target, with no shift between them. Unless ModifiedStandardDecoder
# offsets or masks its input internally, the model is shown the very tokens it
# is asked to predict -- confirm against the decoder implementation.
model_issues.fit(
    x=[ytrain_issues, xtrain],
    y=ytrain_issues,
    epochs=3,
)
model_issues.save('issues.keras')

Epoch 1/3
Tensor("functional_2_1/modified_standard_decoder_1/dropout_1_2/stateless_dropout/SelectV2:0", shape=(None, 1024, 768), dtype=float32)
Tensor("functional_2_1/modified_standard_decoder_1/dropout_1_2/stateless_dropout/SelectV2:0", shape=(None, 1024, 768), dtype=float32)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 69s/step - accuracy: 3.9062e-05 - loss: 8.6068
Epoch 2/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 30s/step - accuracy: 0.8707 - loss: 4.0339
Epoch 3/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 29s/step - accuracy: 0.8707 - loss: 1.7135
