## Clause Identification

In [1]:
from transformers import AutoTokenizer, AutoModelForPreTraining
import pandas as pd
from datasets import Dataset

# Fetch the Legal-BERT tokenizer together with its pre-training checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Remove every pandas display cap (column width, row count, column count) so
# long clause text is rendered without truncation.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the master clause table and, in the same step, swap the '.pdf' substring
# of every Filename entry for '.txt' (literal match, not a regex).
file_path = "Datasets/master_clauses.csv"
df = pd.read_csv(file_path).assign(
    Filename=lambda frame: frame['Filename'].str.replace('.pdf', '.txt', regex=False)
)

## Tokenizing Sentence for Terms and Conditions

In [None]:
from transformers import AutoTokenizer
import pandas as pd
import ast

# Step 1: Ensure all column values are lists
def ensure_list(value):
    """Coerce a single cell value into a Python list.

    Parameters
    ----------
    value : object
        A cell read from the clause table: an actual list, a string that may
        hold the textual form of a list (e.g. "['a', 'b']"), any other
        scalar, or a missing value (None / NaN).

    Returns
    -------
    list
        * ``value`` itself when it already is a list;
        * the parsed list when ``value`` is a string literal of a list
          (leading/trailing whitespace around the brackets is tolerated);
        * ``[value]`` for any other non-null value, including strings that
          merely look like a list but cannot be parsed;
        * ``[]`` for None / NaN.
    """
    if isinstance(value, list):
        return value  # Already a list

    if isinstance(value, str):
        # CSV cells can carry stray whitespace around the bracketed literal,
        # so the bracket test is done on a stripped copy.
        candidate = value.strip()
        if candidate.startswith('[') and candidate.endswith(']'):
            try:
                parsed = ast.literal_eval(candidate)
            except (ValueError, SyntaxError):
                return [value]  # Not a valid literal: keep the raw string
            # Guarantee the documented return type whatever the literal held.
            return parsed if isinstance(parsed, list) else [parsed]
        return [value]  # Ordinary string: wrap as a single-item list

    if pd.notnull(value):
        return [value]  # Non-null scalar: wrap as a single-item list

    return []  # None / NaN become an empty list

# Clause categories whose extracted text is collected below and labelled 1.
columns_to_include = [
    'Termination For Convenience', 'Post-Termination Services',
    'Renewal Term', 'Notice Period To Terminate Renewal',
    'Change Of Control', 'Liquidated Damages', 'Anti-Assignment',
]

# Tokenizer that belongs to the Legal-BERT checkpoint used in this notebook.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Work on an explicit copy of the selected columns. Assigning into a bare
# `df[...]` selection is what raises pandas' SettingWithCopyWarning, because
# pandas cannot tell whether the write is meant for `df` or for the selection.
df_to_flatten = df[columns_to_include].copy()

# Normalise every cell to a Python list so the flattening step can iterate it.
for column in df_to_flatten.columns:
    df_to_flatten[column] = df_to_flatten[column].apply(ensure_list)

# Step 2: explode the per-document clause lists into one record per clause,
# remembering which category column each clause came from.
flattened_rows = [
    {'Category': category, 'text': clause}
    for _, record in df_to_flatten.iterrows()
    for category in df_to_flatten.columns
    for clause in record[category]
]
flattened_df = pd.DataFrame(flattened_rows)

# A clause that is itself a list is joined into one space-separated string.
flattened_df['text'] = flattened_df['text'].apply(
    lambda entry: entry if not isinstance(entry, list) else ' '.join(entry)
)

# Keep only the clauses that contain something besides whitespace.
has_content = flattened_df['text'].str.strip() != ""
flattened_df = flattened_df[has_content]

# Positive examples: the clause text plus a constant label of 1.
df_positives = pd.DataFrame(flattened_df['text']).assign(labels=1)

# Show the first few positive examples.
df_positives.head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_flatten[column] = df_to_flatten[column].apply(ensure_list)


Unnamed: 0,text,labels
0,"This Agreement is accepted by Company in the State of Nevada and shall be governed by and construed in accordance with the laws thereof, which laws shall prevail in the event of any conflict.",1
1,"Company shall not specify the business practices of MA, nor regulate the manner in which MA shall operate its business, provided that MA (a) conducts business in a manner that reflects favorably at all times on the Technology sold and the good name, goodwill and reputation of Company and its affiliates<omitted>",1
2,"This Agreement is subject to all laws, regulations, license conditions and decisions of the Canadian Radio-television and Telecommunications Commission (""CRTC"") municipal, provincial and federal governments or other authorities which are applicable to Rogers and/or Licensor, and which are now in force or hereafter adopted (""Applicable Law"").",1
3,This Agreement shall be governed by laws of the Province of Ontario and the federal laws of Canada applicable therein.,1
4,"All questions with respect to the construction of this Agreement, and the rights and liabilities of the Parties hereto, shall be governed by the laws of the State of Florida.",1


## Columns to train against (the negative-class labels)

In [4]:
# Metadata columns that never hold clause text.
exclude_columns = [
    'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date',
    "Filename", "Document Name", "Document Name-Answer",
]
# Every "-Answer" column is dropped as well, along with the positive categories.
dynamic_exclude_columns = [name for name in df.columns if "-Answer" in name]
all_columns_to_exclude = set(exclude_columns + dynamic_exclude_columns + columns_to_include)
df_negatives = df.drop(columns=all_columns_to_exclude)

# Normalise every remaining cell to a Python list.
for column in df_negatives.columns:
    df_negatives[column] = df_negatives[column].apply(ensure_list)

# Step 2: explode the clause lists into one record per clause.
flattened_rows = [
    {'Category': category, 'text': clause}
    for _, record in df_negatives.iterrows()
    for category in df_negatives.columns
    for clause in record[category]
]
flattened_df = pd.DataFrame(flattened_rows)

# A clause that is itself a list is joined into one space-separated string.
flattened_df['text'] = flattened_df['text'].apply(
    lambda entry: entry if not isinstance(entry, list) else ' '.join(entry)
)

# Keep only the clauses that contain something besides whitespace.
has_content = flattened_df['text'].str.strip() != ""
flattened_df = flattened_df[has_content]

# Negative examples: the clause text plus a constant label of 0.
df_negatives = pd.DataFrame(flattened_df['text']).assign(labels=0)

# Confirm the resulting column layout.
df_negatives.columns

Index(['text', 'labels'], dtype='object')

## Merging the datasets after labelling

In [5]:
# Stack positives on top of negatives under a fresh 0..n-1 index and keep only
# the two columns the model consumes.
merged_df = pd.concat([df_positives, df_negatives], ignore_index=True)[['text', 'labels']]

merged_df.shape

(8783, 2)

## Train and Test Splits

In [6]:
import random
# Hand-rolled 80/20 random split (stands in for sklearn's train_test_split).
random.seed(42)  # Fixed seed so the same rows are drawn on every run
total_rows = len(merged_df)
train_indices = random.sample(range(total_rows), int(0.8 * total_rows))  # 80% train
test_indices = list(set(range(total_rows)) - set(train_indices))         # Remaining 20% test

# Materialise both splits, each with its own fresh 0..n-1 index.
train_df, test_df = (
    merged_df.iloc[positions].reset_index(drop=True)
    for positions in (train_indices, test_indices)
)

In [7]:
# Display the first rows of the training split as a quick sanity check.
train_df.head()

Unnamed: 0,text,labels
0,"In the event of any termination of the Project by University, (a) University agrees to complete Phase I and II of the Project, and (b) ArTara will continue to provide annual funding until the completion of Phase II.",0
1,"Company agrees that no other Distributor will be appointed in any other state as a Distributor unless it is either the Company or Distributor, save and except for the state of Florida.",1
2,This Agreement shall automatically terminate in the event the Management Agreement is assigned or otherwise terminated.,0
3,"You agree that, at our option, you will sell to us any or all your assets used to operate the Franchised Business (including equipment, fixtures, furnishings, Delivery Vehicles, supplies, and inventory) that we ask in writing to purchase. 16.2.1. The purchase price for such items will be equal to your depreciated cost (determined below) or fair market value, whichever is less. The cost will be determined based upon a five (5) year straight-line depreciation of original costs. For equipment that is five (5) or more years old, the parties agree that fair market value will be deemed to be ten percent (10%) of the equipment's original cost. The fair market value of tangible assets must be determined without reference to good will, going-concern value, or other intangible assets. Page 32 of 39\n\nSource: PF HOSPITALITY GROUP INC., 10-12G, 9/23/2015\n\n\n\n\n\n16.2.2. We may exercise this option by delivering a notice of intent to purchase to you within 30 days after the expiration or termination of this Agreement. During that 30-day period, you agree not to dispose of, transfer, or otherwise hinder our ability to exercise our rights with respect to your assets. 16.2.3. If we exercise our option to purchase, we may setoff all amounts due to us under this Agreement and the cost of the appraisal (if any), against any payment due to you. 16.2.4. If we do not exercise our rights to purchase your Delivery Vehicle(s), you must immediately make such modifications or alterations to the Delivery Vehicle(s) that may be needed to remove any Proprietary Marks and to otherwise distinguish the appearance of the vehicle(s) from those used by other Restaurants.",0
4,"Without the prior written consent of the other party, neither party shall assign or transfer any of its rights or obligations hereunder, in whole or in part, to any third party, and any purported assignment without such prior written consent shall be null and void and of no force and effect; except that notice, but no consent shall be required for such assignment or transfer in connection with an internal reorganization or sale of the transferring party, including by merger or other business combination, or a sale of substantially all of the assets of the transferring party.",0


## Augmenting Train and Test Separately
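Each split is augmented on its own, after the split has been made, so that paraphrases of a training clause cannot leak into the test set.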

In [8]:

# Report the split sizes; the "Validation" figure is the row count of test_df.
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(test_df)}")


Training samples: 7026
Validation samples: 1757


In [None]:
from transformers import pipeline
from datasets import Dataset
import pandas as pd

# Load the paraphrasing pipeline: a text2text-generation pipeline backed by the
# "t5-small" checkpoint, placed on device 0.
# NOTE(review): the prompts sent to this pipeline start with "paraphrase: ";
# it is not evident from this notebook that stock t5-small was trained on such
# a prefix — confirm the generated text really is a paraphrase.
# NOTE(review): device=0 presumably means the first GPU; confirm one is available.
paraphraser = pipeline("text2text-generation", model="t5-small", device=0)

# Define a function for paraphrasing
def paraphrase_batch(batch, num_return_sequences=3, num_beams=3):
    """
    Paraphrase a batch of texts using the Hugging Face pipeline.

    Parameters
    ----------
    batch : mapping
        Must provide ``batch['text']`` (sequence of strings) and
        ``batch['labels']`` (sequence of labels, aligned with the texts).
    num_return_sequences : int, default 3
        Number of generated candidates requested for each input text.
    num_beams : int, default 3
        Beam width forwarded to the generation call.

    Returns
    -------
    dict
        ``{'text': [...], 'labels': [...]}`` where every generated candidate
        carries the label of the input it was generated from. Both lists have
        the same length. An empty batch yields two empty lists.

    Raises
    ------
    ValueError
        If the pipeline output is not a list, or does not contain exactly one
        entry per input text.
    """
    texts = batch['text']
    labels = batch['labels']

    # Nothing to generate: skip the model call (the result would otherwise be
    # indexed at position 0 below and fail on an empty list).
    if len(texts) == 0:
        return {'text': [], 'labels': []}

    inputs = [f"paraphrase: {text}" for text in texts]
    paraphrased_results = paraphraser(
        inputs,
        max_length=128,
        num_return_sequences=num_return_sequences,
        num_beams=num_beams,
        truncation=True
    )

    if not isinstance(paraphrased_results, list):
        raise ValueError("Unexpected paraphrased_results structure.")

    # Normalise to "one list of candidates per input": the pipeline may hand
    # back a nested list (several candidates per input) or a flat list of
    # dicts (a single candidate per input).
    if paraphrased_results and isinstance(paraphrased_results[0], list):
        candidates_per_input = paraphrased_results
    else:
        candidates_per_input = [[result] for result in paraphrased_results]

    if len(candidates_per_input) != len(labels):
        raise ValueError("Unexpected paraphrased_results structure.")

    # Pair each candidate with its source label. Counting the candidates that
    # were actually returned (rather than assuming num_return_sequences) keeps
    # texts and labels aligned in every case.
    paraphrased_texts = []
    repeated_labels = []
    for label, candidates in zip(labels, candidates_per_input):
        for candidate in candidates:
            paraphrased_texts.append(candidate['generated_text'])
            repeated_labels.append(label)

    return {'text': paraphrased_texts, 'labels': repeated_labels}

# Wrap both splits as Hugging Face Datasets so paraphrasing can run in batches.
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)

batch_size = 16  # Adjust based on your GPU capacity

# paraphrase_batch returns more rows than it receives. A batched map may only
# change the row count when every original column is dropped from the output,
# hence remove_columns; the function's own 'text'/'labels' become the result.
paraphrased_dataset_train = dataset_train.map(
    lambda batch: paraphrase_batch(batch, num_return_sequences=3),
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset_train.column_names,
)
paraphrased_dataset_test = dataset_test.map(
    lambda batch: paraphrase_batch(batch, num_return_sequences=3),
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset_test.column_names,
)

# The mapped datasets already hold one string and one label per row, so they
# convert to DataFrames directly.
df_augmented_train = paraphrased_dataset_train.to_pandas()[['text', 'labels']]
df_augmented_test = paraphrased_dataset_test.to_pandas()[['text', 'labels']]

# Append augmented data to the original DataFrames.
# NOTE: this rebinds train_df / test_df, so running the cell a second time
# paraphrases (and appends) the already augmented data again. Re-run the split
# cell first if this cell has to be executed more than once.
train_df = pd.concat([train_df, df_augmented_train], ignore_index=True)
test_df = pd.concat([test_df, df_augmented_test], ignore_index=True)



Map:   1%|          | 48/7026 [00:47<1:53:17,  1.03 examples/s]

In [None]:

# Report the split sizes again; the "Validation" figure is the row count of test_df.
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(test_df)}")


Training samples: 6148
Validation samples: 2635


## Model Training and Tokenization

In [None]:

from transformers import AutoTokenizer

# Load the tokenizer that matches the Legal-BERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
# Define a tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens (a length chosen
    in this notebook, not a limit of the model).

    Parameters
    ----------
    examples : mapping
        Batch providing ``examples["text"]``.

    Returns
    -------
    Whatever the tokenizer call returns for that batch.
    """
    encoding_options = {
        "padding": "max_length",   # Pad every sequence to max_length
        "truncation": True,        # Cut sequences longer than max_length
        "max_length": 128,         # Fixed sequence length used for training
    }
    return tokenizer(examples["text"], **encoding_options)


# Wrap each split as a Hugging Face Dataset and discard rows whose text is None.
hf_dataset_train = Dataset.from_pandas(train_df).filter(
    lambda example: example["text"] is not None
)
hf_dataset_test = Dataset.from_pandas(test_df).filter(
    lambda example: example["text"] is not None
)

# Tokenize in batches, then drop the raw text column from the result.
tokenized_train_dataset = hf_dataset_train.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_test_dataset = hf_dataset_test.map(tokenize_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])

# Names under which the Trainer cells pick the datasets up.
train_dataset, val_dataset = tokenized_train_dataset, tokenized_test_dataset


Filter: 100%|██████████| 7026/7026 [00:00<00:00, 293520.65 examples/s]
Filter: 100%|██████████| 1757/1757 [00:00<00:00, 220211.93 examples/s]
Map: 100%|██████████| 7026/7026 [00:02<00:00, 2936.78 examples/s]
Map: 100%|██████████| 1757/1757 [00:00<00:00, 2887.00 examples/s]


In [None]:
from transformers import AutoModelForSequenceClassification

# Legal-BERT checkpoint with a two-label (0 / 1) sequence-classification head,
# moved onto the GPU straight after loading.
model = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased", num_labels=2
).to("cuda")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
# Fine-tuning configuration.
# Early stopping is intentionally not configured here: `callbacks` is an
# argument of `Trainer`, not of `TrainingArguments`, and passing it to this
# constructor fails with an unexpected-keyword TypeError. Supply
# `callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]` when the
# Trainer is built instead.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,                     # Evaluate every 500 steps
    save_steps=500,                     # Save every 500 steps
    learning_rate=3e-5,                 # Adjusted learning rate
    per_device_train_batch_size=16,     # Smaller batch size to fit GPU memory
    gradient_accumulation_steps=2,      # Effective batch size = 32
    per_device_eval_batch_size=16,
    num_train_epochs=10,                # More epochs for better training
    weight_decay=0.01,
    save_total_limit=4,
    fp16=True,                          # Mixed precision for better GPU utilization
    disable_tqdm=False,
    log_level="error",
    lr_scheduler_type="cosine",         # Cosine annealing for better convergence
    warmup_steps=500,                   # Gradual learning rate warmup
    load_best_model_at_end=True,        # Needed for early stopping on the Trainer
    max_grad_norm=1.0,                  # Limit gradient norm for stability
)




In [None]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
# Fine-tuning configuration.
# NOTE(review): this cell repeats the TrainingArguments definition of the
# preceding cell; one of the two can be deleted.
# Early stopping is intentionally not configured here: `callbacks` is an
# argument of `Trainer`, not of `TrainingArguments`, and passing it to this
# constructor fails with an unexpected-keyword TypeError. Supply
# `callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]` when the
# Trainer is built instead.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,                     # Evaluate every 500 steps
    save_steps=500,                     # Save every 500 steps
    learning_rate=3e-5,                 # Adjusted learning rate
    per_device_train_batch_size=16,     # Smaller batch size to fit GPU memory
    gradient_accumulation_steps=2,      # Effective batch size = 32
    per_device_eval_batch_size=16,
    num_train_epochs=10,                # More epochs for better training
    weight_decay=0.01,
    save_total_limit=4,
    fp16=True,                          # Mixed precision for better GPU utilization
    disable_tqdm=False,
    log_level="error",
    lr_scheduler_type="cosine",         # Cosine annealing for better convergence
    warmup_steps=500,                   # Gradual learning rate warmup
    load_best_model_at_end=True,        # Needed for early stopping on the Trainer
    max_grad_norm=1.0,                  # Limit gradient norm for stability
)




In [None]:
from transformers import EarlyStoppingCallback, Trainer

# Build the Trainer used for fine-tuning. Callbacks are a Trainer argument, so
# early stopping (patience of 3 evaluations) is registered here; it relies on
# training_args having load_best_model_at_end=True.
trainer = Trainer(
    model=model,                       # Pretrained model
    args=training_args,                # Training arguments
    train_dataset=train_dataset,       # Training dataset
    eval_dataset=val_dataset,          # Validation dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
print(len(val_dataset))

1757


In [None]:
# Start fine-tuning the model with the Trainer configured above.
trainer.train()

  1%|          | 11/1100 [01:19<2:07:03,  7.00s/it]

KeyboardInterrupt: 

In [None]:

# Directory that receives the fine-tuned model and its tokenizer.
# NOTE(review): this saves whatever weights `model` currently holds; they are
# presumably the best checkpoint only if training ran to completion with
# load_best_model_at_end=True — confirm before relying on it.
save_directory = 'final_model_terms_and_conditions'
# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
import evaluate
import numpy as np
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# below reads it as a module-level name.
accuracy_metric = evaluate.load("accuracy")

# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Parameters
    ----------
    eval_pred : pair
        Unpacks into ``(logits, labels)``.

    Returns
    -------
    dict
        ``{"accuracy": <value reported by the module-level accuracy metric>}``.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    metric_result = accuracy_metric.compute(
        predictions=predicted_classes, references=references
    )
    return {"accuracy": metric_result["accuracy"]}

# Arguments for the evaluation-only Trainer created in the next cell.
training_args = TrainingArguments(
    output_dir="./final_model_terms_and_conditions",  # Where results are written
    logging_dir="./logs",                             # Where logs are written
    per_device_eval_batch_size=16,                    # Evaluation batch size
    evaluation_strategy="epoch",                      # Evaluate at the end of every epoch
)


In [None]:
# Load the saved model for evaluation
from transformers import AutoModelForSequenceClassification

# Evaluate the checkpoint this notebook itself writes
# ('final_model_terms_and_conditions'). The previous path,
# "final_confidentiality_and_NDA", is never written anywhere in this notebook,
# so the reported accuracy would have belonged to a different model.
model = AutoModelForSequenceClassification.from_pretrained("final_model_terms_and_conditions")

# Create a Trainer object for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,  # Validation dataset
    compute_metrics=compute_metrics,
)

# Evaluate the model
results = trainer.evaluate()

# Print the validation accuracy; the Trainer reports the "accuracy" entry of
# compute_metrics under the key 'eval_accuracy'.
print(f"Validation Accuracy: {results['eval_accuracy']:.4f}")
