In [3]:
# !pip install faiss-cpu pandas numpy scikit-learn sentence-transformers transformers joblib --quiet
import torch
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

In [4]:
# =======================
# 1. Load dataset
# =======================
csv_path = "/content/drive/MyDrive/nda_models/Final_NDA_with_Augmented.csv"
data = pd.read_csv(csv_path, encoding="ISO-8859-1")

# Rename & clean
data = data.rename(columns={
    "clean_sentence": "SENTENCE",
    "Classification_Category": "CATEGORY"
})
data = data[["SENTENCE", "CATEGORY"]].dropna()
data["CATEGORY"] = data["CATEGORY"].str.strip().str.lower()

# Normalize labels
category_map = {
    "confidentiality obligation": "confidentiality obligations",
    "confidentiality obligations": "confidentiality obligations",
    "signatures": "signatures",
    "signature": "signatures",
    "governing law": "governing law",
    "remedies": "remedies",
    "non-competition": "non-competition",
    "non competition": "non-competition",
    "privacy/security": "privacy/security",
    "limitation of liability": "limitation of liability",
    "non-solicitation": "non-solicitation",
    "indemnification": "indemnification"
}
data["CATEGORY"] = data["CATEGORY"].map(category_map)
data = data.dropna(subset=["CATEGORY"]).reset_index(drop=True)

# Encode labels
label2id = {label: idx for idx, label in enumerate(sorted(data["CATEGORY"].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
data["label"] = data["CATEGORY"].map(label2id)

print(f"Loaded {len(data)} samples | {len(label2id)} classes")
print("Label mapping:", label2id)

# Split
train_data, test_data = train_test_split(
    data, test_size=0.3, stratify=data["label"], random_state=42
)

# =======================
# 2. Generate embeddings (safe for Colab)
# =======================
print("Generating embeddings (MiniLM, batch_size=16)...")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device.upper()}")

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

train_embeddings = embedder.encode(
    train_data["SENTENCE"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=16
)

test_embeddings = embedder.encode(
    test_data["SENTENCE"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=16
)

# =======================
# 3. FAISS index
# =======================
d = train_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(train_embeddings)
print(f"FAISS index built with {index.ntotal} embeddings (dimension={d})")

# =======================
# 4. T5 model for clause refinement
# =======================
print("Loading T5-small model (for paraphrasing)...")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
generator = T5ForConditionalGeneration.from_pretrained("t5-small")

# =======================
# 5. Random Forest + Grid Search
# =======================
X_train, y_train = train_embeddings, train_data["CATEGORY"].tolist()

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

rf = RandomForestClassifier(class_weight="balanced", random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

print("Best parameters found:", grid_search.best_params_)

# =======================
# 6. Evaluation
# =======================
rf_preds = best_rf.predict(test_embeddings)
rf_acc = accuracy_score(test_data["CATEGORY"], rf_preds)
print(f"\nRandom Forest Test Accuracy: {rf_acc * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_data["CATEGORY"], rf_preds))

# =======================
# 7. Retrieval + Refinement functions
# =======================
def retrieve_similar_clause(query, k=1):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_embedding, k)
    clause = train_data.iloc[I[0][0]]["SENTENCE"]
    category = train_data.iloc[I[0][0]]["CATEGORY"]
    return clause, category, D[0][0]

def refine_clause(clause):
    input_text = f"paraphrase: {clause} </s>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = generator.generate(input_ids, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# =======================
# 8. Demo
# =======================
# example_query = "The parties agree to maintain confidentiality of all proprietary data."
# similar_clause, similar_category, distance = retrieve_similar_clause(example_query)
# refined_clause = refine_clause(similar_clause)

# print("\nExample Query:", example_query)
# print(f"Most similar clause: {similar_clause}")
# print(f"Category: {similar_category} | Distance: {distance:.4f}")
# print(f"Refined version: {refined_clause}")

# =======================
# 9. Save model & FAISS index
# =======================
faiss.write_index(index, "/content/drive/MyDrive/nda_models/faiss_index.ndx")

import joblib
joblib.dump(best_rf, "/content/drive/MyDrive/nda_models/rf_nda_model.pkl")



Loaded 6258 samples | 7 classes
Label mapping: {'confidentiality obligations': 0, 'governing law': 1, 'indemnification': 2, 'non-competition': 3, 'non-solicitation': 4, 'remedies': 5, 'signatures': 6}
Generating embeddings (MiniLM, batch_size=16)...
Using device: CPU


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/274 [00:00<?, ?it/s]

Batches:   0%|          | 0/118 [00:00<?, ?it/s]

FAISS index built with 4380 embeddings (dimension=384)
Loading T5-small model (for paraphrasing)...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters found: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

Random Forest Test Accuracy: 94.09%

Classification Report:
                             precision    recall  f1-score   support

confidentiality obligations       0.94      0.99      0.97      1527
              governing law       0.98      0.79      0.88        77
            indemnification       1.00      0.78      0.88        27
            non-competition       1.00      0.31      0.47        36
           non-solicitation       1.00      0.58      0.73        26
                   remedies       1.00      0.76      0.86       121
                 signatures       0.84      0.77      0.80        64

                   accuracy                           0.94      1878
                  macro avg       0.97      0.71      0.80      1878
               weighted avg       0.94      0.94      0.94      1878



['/content/drive/MyDrive/nda_models/rf_nda_model.pkl']