## Importing our base model t-5 small

In [27]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [28]:
# Disabling WANDB
import os
os.environ["WANDB_MODE"] = "disabled"


In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Load the dataset from Hugging Face
dataset = load_dataset('joelniklaus/legal_case_document_summarization')

# Filter the dataset to include only the relevant dataset name ('IN-Abs' in this case)
filtered_train = dataset['train'].filter(lambda x: 'IN-Abs' in x['dataset_name'])

# Remove the 'dataset_name' column to clean the dataset
filtered_train = filtered_train.remove_columns(['dataset_name'])

# Convert to pandas DataFrame for splitting into train/val
df_train = filtered_train.to_pandas()

# Split the data into training and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)

# Convert the pandas DataFrame back into Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Print the first few rows to check
print(train_dataset[:5])


Repo card metadata block was not found. Setting CardData to empty.


{'judgement': ['ivil Appeal No. 357 of 1957.\nAppeal by special leave from the judgment and order dated February 28, 1956, of the Allahabad High Court (Lucknow Bench) in Misc.\nCase No. 4 of 1955 and Civil Revision No. 189 of 1955, arising out of the order dated August 6, 1955 of the Civil Judge, Sitapur in Suit No. 16 of 1953.\nVidya Sagar, for the appellant.\nIqbal Ahmad, section N. Andley and Rameshwar Nath, for the respondent.\nJanuary 22.\nThe following Judgment of the Court was delivered by SARKAR J.\nThe respondent, a scheduled bank, sued the appellant in the court of the Civil Judge, Sitapore in Uttar Pradesh, for the recovery of money due under an instrument of mortgage.\nThe appellant contested the suit on several grounds one of which was that he was entitled to relief under the Uttar Pradesh Zamindar \'s Debt Reduction Act (U.P. XV of 1953) which reduced the amount recoverable on a debt as defined in it.\nNow a debt was defined in the Act in these terms: 2(f): "debt" means a

In [31]:
# Define the preprocess function for tokenizing the inputs and targets
def preprocess_function(examples):
    # Get the inputs (judgement) and targets (summary)
    inputs = examples["judgement"]
    targets = examples["summary"]

    # Tokenize the inputs and targets, with truncation and padding
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Use the tokenizer as the target tokenizer for the summary (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Add the labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [32]:
# Apply the preprocessing function to the train and validation datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Optional: remove the 'text' column if it's no longer needed for training
tokenized_train = tokenized_train.remove_columns(['judgement', 'summary'])
tokenized_val = tokenized_val.remove_columns(['judgement', 'summary'])

Map:   0%|          | 0/5624 [00:00<?, ? examples/s]



Map:   0%|          | 0/1406 [00:00<?, ? examples/s]

In [33]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs', # logging directory
)



In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.1022,2.696011
2,2.8465,2.647023
3,2.7944,2.63258


TrainOutput(global_step=2109, training_loss=2.8860309685842296, metrics={'train_runtime': 867.7461, 'train_samples_per_second': 19.443, 'train_steps_per_second': 2.43, 'total_flos': 2283486874435584.0, 'train_loss': 2.8860309685842296, 'epoch': 3.0})

In [36]:
trainer.save_model("./final_model")