In [None]:
!pip install pandas numpy transformers scikit-learn tensorflow flask



In [None]:
import pandas as pd

# Read the clause dataset from CSV into a DataFrame
data_path = "/legal_docs.csv"
df = pd.read_csv(data_path)

# Preview the top of the frame as a quick sanity check
print("First 5 rows of the dataset:", df.head(), sep="\n")


First 5 rows of the dataset:
   Unnamed: 0                                        clause_text  clause_type  \
0           0                      Make any Investments, except:  investments   
1           1   No more than 45% of the “value” (as defined i...  investments   
2           2              Make or hold any Investments, except:  investments   
3           3   The SubAdviser is hereby authorized and direc...  investments   
4           4   Make any advance, loan, extension of credit (...  investments   

   totalwords  totalletters  
0         4.0          30.0  
1        76.0         460.0  
2         6.0          38.0  
3       228.0        1474.0  
4        52.0         329.0  


In [None]:
# Overall dimensions of the loaded frame
num_rows = df.shape[0]
num_columns = df.shape[1]
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

# Column labels
print("\nColumn Names:", df.columns, sep="\n")

# Per-column dtype and non-null summary (df.info() prints directly)
print("\nDataset Info:")
df.info()

Number of Rows: 21187, Number of Columns: 5

Column Names:
Index(['Unnamed: 0', 'clause_text', 'clause_type', 'totalwords',
       'totalletters'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21187 entries, 0 to 21186
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    21187 non-null  int64  
 1   clause_text   21144 non-null  object 
 2   clause_type   21187 non-null  object 
 3   totalwords    21161 non-null  float64
 4   totalletters  21161 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 827.7+ KB


In [None]:
# Count NaN / None entries per column
missing_values = df.isna().sum()
print("\nMissing Values in Each Column:", missing_values, sep="\n")


Missing Values in Each Column:
Unnamed: 0       0
clause_text     43
clause_type      0
totalwords      26
totalletters    26
dtype: int64


In [None]:
import re

# Function to clean clause_text
def clean_text(text):
    """Normalise a single clause string for modelling.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed to
    a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw clause. Anything that is not a ``str`` (e.g. the NaN
            pandas uses for missing cells) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""  # Missing / non-string values become an empty string

    text = text.lower()
    # Remove punctuation *before* collapsing whitespace: deleting a symbol
    # that sits between two spaces ("a - b") would otherwise leave a double
    # space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Drop the leading/trailing space that the raw clauses frequently carry.
    return text.strip()

# Add a cleaned copy of every clause next to the raw text
df['cleaned_clause_text'] = df['clause_text'].map(clean_text)

# Show raw vs. cleaned text side by side for the first rows
print("\nSample Cleaned Text:", df[['clause_text', 'cleaned_clause_text']].head(), sep="\n")


Sample Cleaned Text:
                                         clause_text  \
0                      Make any Investments, except:   
1   No more than 45% of the “value” (as defined i...   
2              Make or hold any Investments, except:   
3   The SubAdviser is hereby authorized and direc...   
4   Make any advance, loan, extension of credit (...   

                                 cleaned_clause_text  
0                        make any investments except  
1   no more than 45 of the value as defined in se...  
2                make or hold any investments except  
3   the subadviser is hereby authorized and direc...  
4   make any advance loan extension of credit by ...  


In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the clause_type strings into integer class ids
label_encoder = LabelEncoder()
df['clause_label'] = label_encoder.fit_transform(df['clause_type'])

# Class-name -> integer-id lookup, kept so predictions can be interpreted later
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("\nLabel Mapping:", label_mapping, sep="\n")
df.head()


Label Mapping:
{'Assignment': 0, 'Confidentiality': 1, 'Counterparts': 2, 'Definitions': 3, 'Entire': 4, 'Governing': 5, 'Headings': 6, 'Indemnification': 7, 'Insurance': 8, 'Miscellaneous': 9, 'NOW': 10, 'Notices': 11, 'Representations': 12, 'Severability': 13, 'Termination': 14, 'WHEREAS': 15, 'base-salary': 16, 'board': 17, 'capitalization': 18, 'compensation': 19, 'conversion_of_shares': 20, 'dividends': 21, 'employee_benefits': 22, 'esop': 23, 'financing': 24, 'foreign_investors': 25, 'grant': 26, 'grant_of_option': 27, 'interest': 28, 'investment-company-act': 29, 'investment_company': 30, 'investments': 31, 'loans': 32, 'ownership_of_shares': 33, 'payment': 34, 'payment_terms': 35, 'private_equity': 36, 'seed': 37, 'shares': 38, 'stock_option': 39, 'taxes': 40, 'vesting': 41}


Unnamed: 0.1,Unnamed: 0,clause_text,clause_type,totalwords,totalletters,cleaned_clause_text,clause_label
0,0,"Make any Investments, except:",investments,4.0,30.0,make any investments except,31
1,1,No more than 45% of the “value” (as defined i...,investments,76.0,460.0,no more than 45 of the value as defined in se...,31
2,2,"Make or hold any Investments, except:",investments,6.0,38.0,make or hold any investments except,31
3,3,The SubAdviser is hereby authorized and direc...,investments,228.0,1474.0,the subadviser is hereby authorized and direc...,31
4,4,"Make any advance, loan, extension of credit (...",investments,52.0,329.0,make any advance loan extension of credit by ...,31


In [None]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned clause text; the target is the encoded clause type
X = df['cleaned_clause_text']
y = df['clause_label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining Samples: {len(X_train)}, Testing Samples: {len(X_test)}")


Training Samples: 16949, Testing Samples: 4238


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Tokenizer and model are loaded from the same pre-trained T5 checkpoint
checkpoint = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Prepare the data for T5
def preprocess_for_t5(input_texts, labels):
    """Build text-to-text training pairs for T5.

    Args:
        input_texts: Iterable of clause texts.
        labels: Iterable of class labels, one per text.

    Returns:
        A ``(inputs, targets)`` tuple of lists: each input is the text
        prefixed with ``"classify: "`` and each target is the label rendered
        as a string.
    """
    inputs = list(map(lambda text: f"classify: {text}", input_texts))
    targets = list(map(str, labels))
    return inputs, targets

# Build the prefixed prompts and string targets for both splits; each call
# returns an (inputs, targets) pair of lists.
train_inputs, train_targets = preprocess_for_t5(X_train, y_train)
test_inputs, test_targets = preprocess_for_t5(X_test, y_test)

In [None]:
def tokenize_texts(input_texts, target_texts):
    """Tokenize prompts and targets with the notebook-level ``tokenizer``.

    Both lists are encoded with the same settings: truncation enabled,
    padding enabled, a 512-token maximum length, and PyTorch tensors as the
    return type.

    Args:
        input_texts: List of prompt strings.
        target_texts: List of target strings.

    Returns:
        A ``(inputs, targets)`` tuple holding the two tokenizer outputs.
    """
    shared_options = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")
    encoded_inputs = tokenizer(input_texts, **shared_options)
    encoded_targets = tokenizer(target_texts, **shared_options)
    return encoded_inputs, encoded_targets

# Tokenize each split in one go; every call returns the encoded prompts and
# the encoded targets for that split.
train_encodings, train_labels = tokenize_texts(train_inputs, train_targets)
test_encodings, test_labels = tokenize_texts(test_inputs, test_targets)

In [None]:
import torch
# Create PyTorch datasets
class ClauseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized prompts with tokenized targets.

    Args:
        encodings: Mapping of field name -> indexable batch (for example the
            tokenizer output for the prompts). Every field is indexed per
            example in ``__getitem__``.
        labels: Tokenizer output for the targets. Must expose ``input_ids``;
            when it also exposes a tensor ``attention_mask``, padded label
            positions are replaced by ``IGNORE_INDEX``.

    Padding positions in the labels are masked so that the loss is computed
    only on real target tokens. The mask is applied to a copy, so
    ``labels.input_ids`` itself is left untouched and can still be decoded.
    """

    # Label value used to mark positions that should not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        label_ids = labels.input_ids
        label_mask = getattr(labels, "attention_mask", None)
        # Only mask when both pieces are tensors; otherwise keep the ids as-is
        # (same behaviour as before for inputs without a mask).
        if torch.is_tensor(label_ids) and torch.is_tensor(label_mask):
            # masked_fill returns a new tensor; the caller's ids are not modified.
            label_ids = label_ids.masked_fill(label_mask == 0, self.IGNORE_INDEX)
        self.labels = label_ids

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of the encoding fields plus ``labels``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Wrap the tokenized prompts and targets of each split in a ClauseDataset;
# these are later passed to the Trainer as train and eval datasets.
train_dataset = ClauseDataset(train_encodings, train_labels)
test_dataset = ClauseDataset(test_encodings, test_labels)


In [None]:
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./results",               # model checkpoints are written here
    logging_dir="./logs",                 # log files are written here
    # --- optimisation ---
    num_train_epochs=10,                  # passes over the training set
    per_device_train_batch_size=8,        # examples per device per step
    learning_rate=2e-5,                   # optimizer learning rate
    weight_decay=0.01,                    # optimizer weight decay
    # --- logging / evaluation / checkpoint cadence ---
    logging_steps=10,                     # log every 10 steps
    evaluation_strategy="epoch",          # evaluate once per epoch
    save_strategy="epoch",                # checkpoint once per epoch
    load_best_model_at_end=True,          # reload the best checkpoint when training ends
)



In [None]:
# Bundle the model, its training configuration and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model using the Trainer configured above. The call is the
# cell's last expression, so its return value is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2799,0.268947


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2119, training_loss=0.9013540689649308, metrics={'train_runtime': 2289.5895, 'train_samples_per_second': 7.403, 'train_steps_per_second': 0.925, 'total_flos': 1.032122728710144e+16, 'train_loss': 0.9013540689649308, 'epoch': 1.0})

In [None]:
# Write the model and its tokenizer into the same directory
save_dir = "fine_tuned_t5_clause_classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('fine_tuned_t5_clause_classifier/tokenizer_config.json',
 'fine_tuned_t5_clause_classifier/special_tokens_map.json',
 'fine_tuned_t5_clause_classifier/spiece.model',
 'fine_tuned_t5_clause_classifier/added_tokens.json')

In [None]:
# Run the Trainer's evaluation loop on the eval dataset and show the metrics dict
eval_results = trainer.evaluate()
print("\nEvaluation Results:", eval_results, sep="\n")


Evaluation Results:
{'eval_loss': 0.4045300781726837, 'eval_runtime': 118.4254, 'eval_samples_per_second': 35.786, 'eval_steps_per_second': 4.475, 'epoch': 1.0}


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Function to generate predictions using the model
def generate_predictions(encodings, batch_size=8, invalid_label=-1):
    """Generate an integer class prediction for every encoded example.

    Uses the notebook-level ``model`` and ``tokenizer``. Examples are sent
    through ``model.generate`` in slices of ``batch_size`` with gradient
    tracking disabled, and each generated sequence is decoded and parsed as
    an integer label.

    Args:
        encodings: Mapping of field name -> tensor; must contain
            ``'input_ids'``. Every field is sliced per batch and moved to the
            model's device.
        batch_size: Number of examples per generation call.
        invalid_label: Value recorded when the decoded text is not a valid
            integer (e.g. empty or non-numeric output), so that one bad
            generation counts as a wrong prediction instead of aborting the
            whole run with ``ValueError``.

    Returns:
        A list of ints, one per example, in input order.
    """
    model.eval()  # Inference mode (disables dropout etc.)
    predictions = []
    num_examples = len(encodings['input_ids'])

    # Run in batches to avoid out-of-memory errors
    for start in range(0, num_examples, batch_size):
        batch_encodings = {
            key: val[start:start + batch_size].to(model.device)
            for key, val in encodings.items()
        }

        # No gradients are needed for generation
        with torch.no_grad():
            outputs = model.generate(**batch_encodings)

        for output in outputs:
            decoded = tokenizer.decode(output, skip_special_tokens=True)
            try:
                predictions.append(int(decoded))
            except ValueError:
                # The generated text is free-form; fall back rather than crash.
                predictions.append(invalid_label)

    return predictions

# Predicted class ids for the held-out split
predictions = generate_predictions(test_encodings)

# Ground-truth class ids, recovered by decoding the tokenized targets back to integers
true_labels = list(
    map(
        lambda label: int(tokenizer.decode(label, skip_special_tokens=True)),
        test_labels.input_ids,
    )
)

# Accuracy plus support-weighted precision / recall / F1
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels,
    predictions,
    average='weighted',
)

print(
    "\nEvaluation Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)



Evaluation Metrics:
Accuracy: 0.8814
Precision: 0.8613
Recall: 0.8732
F1 Score: 0.8621
