In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint


In [None]:

# Fetch the Punkt sentence-tokenizer data that sent_tokenize needs.
# Older NLTK versions load the pickled 'punkt' models, while recent releases
# (3.9+) look for the 'punkt_tab' resource instead, so request both to keep a
# fresh-kernel run working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')


In [None]:

# Load dataset
# Expected columns: document, summary.
# Rows missing either field are dropped up front: empty CSV cells are read as
# NaN (a float), and the text-cleaning step downstream calls string methods
# on these values.
df = (
    pd.read_csv("legal_data.csv")
    .dropna(subset=["document", "summary"])
    .reset_index(drop=True)
)


In [None]:

# Clean text
def clean_text(text):
    """Normalize a piece of text for token-overlap scoring and tokenization.

    The text is lowercased, digits and ASCII punctuation are removed, and
    finally every run of whitespace is collapsed to a single space and the
    ends are trimmed.

    Args:
        text: The input text. ``None`` and float NaN (how a missing CSV cell
            typically arrives from pandas) are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse whitespace LAST: deleting digits/punctuation that sat between
    # two spaces would otherwise leave a double space in the result.
    text = re.sub(r"\s+", " ", text)
    return text.strip()





In [None]:
# Simple Jaccard similarity for soft scoring
def jaccard_score(sent, summary):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on whitespace and compared as sets of unique
    tokens: |shared tokens| / |all tokens|.

    Args:
        sent: Sentence text.
        summary: Summary text to compare against.

    Returns:
        float: Overlap ratio in [0, 1]; 0.0 when neither string has tokens.
    """
    sent_tokens = set(sent.split())
    summary_tokens = set(summary.split())

    all_tokens = sent_tokens.union(summary_tokens)
    # Guard the division: two empty strings share an empty union.
    if not all_tokens:
        return 0.0

    shared_tokens = sent_tokens.intersection(summary_tokens)
    return len(shared_tokens) / len(all_tokens)



In [None]:
# Prepare sentence-score dataset
# Each sample is one cleaned sentence (X) paired with its Jaccard overlap
# against the cleaned reference summary (y).
#
# Sentences are split on the RAW document and cleaned afterwards:
# clean_text() strips punctuation, and sent_tokenize needs that punctuation
# to find sentence boundaries. Cleaning first would yield a single
# "sentence" per document.
X, y = [], []

for raw_doc, raw_summary in zip(df['document'], df['summary']):
    # Skip rows with missing / non-text fields (e.g. NaN from empty cells).
    if not isinstance(raw_doc, str) or not isinstance(raw_summary, str):
        continue

    summary = clean_text(raw_summary)

    for raw_sent in sent_tokenize(raw_doc):
        sent = clean_text(raw_sent)
        if not sent:
            # Nothing left after cleaning (sentence was only digits/punctuation).
            continue
        X.append(sent)
        y.append(jaccard_score(sent, summary))



In [None]:
# Tokenize
# Fit the vocabulary on the sentences, encode them as integer sequences,
# and pad at the end ('post') to a fixed length of 50.
# NOTE(review): `truncating` is left at its library default — confirm that is
# the intended side to cut for sentences longer than 50 tokens.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=20000)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(
    X_seq,
    padding='post',
    maxlen=50,
)

# Targets as a NumPy array for training.
y = np.array(y)

In [None]:
# Train-test split
# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split the same across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_pad,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:

# Model (regression)
# Embedding -> global max pooling -> dense hidden layer -> single sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=64, input_length=50))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
# Sigmoid keeps the predicted score between 0 and 1.
model.add(Dense(1, activation='sigmoid'))



In [None]:
# Regression setup: optimize mean squared error, report mean absolute error.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)



In [None]:
# Checkpoint to save best model
# Only overwrite the saved file when validation MAE reaches a new minimum.
checkpoint = ModelCheckpoint(
    "sentence_scoring_model.keras",
    monitor='val_mae',
    mode='min',
    save_best_only=True,
)


In [None]:

# Train
# 20 epochs, batches of 64, evaluated on the held-out split after each epoch;
# the checkpoint callback handles saving.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=20,
    callbacks=[checkpoint],
)
