In [1]:
# !pip install xgboost seaborn --quiet
# ============================================
# NDA Clause Classification Baselines with Class Weights
# ============================================


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Data preparation: load the labelled sentences, normalise the label text,
# make a stratified train/test split, build TF-IDF features and derive
# balanced class weights from the training labels.
#
# Alternative input used earlier (kept for reference):
# df = pd.read_csv("Classification-Results/Final_Classification_gemini_2.5_flash_6000_samples.csv")
#
# NOTE(review): the file name suggests augmented rows are already included.
# If augmentation happened before this split, near-duplicates of a sentence
# could land in both train and test -- confirm against the data pipeline.
df = (
    pd.read_csv("Final_NDA_with_Augmented.csv")
    .loc[:, ["clean_sentence", "Classification_Category"]]
    .dropna()
    .assign(
        # Lower-case, trim and collapse runs of whitespace so that spelling
        # variants of the same label fall into one class.
        Classification_Category=lambda frame: (
            frame["Classification_Category"]
            .str.lower()
            .str.strip()
            .str.replace(r"\s+", " ", regex=True)
        )
    )
)

label_counts = df["Classification_Category"].value_counts()
print("Distribution")
print(label_counts)

X, y = df["clean_sentence"], df["Classification_Category"]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=38,
    stratify=y,
)

# TF-IDF over unigrams and bigrams. The vocabulary is learned from the
# training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=2,
    ngram_range=(1, 2),
    lowercase=True,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# "balanced" class weights computed from the training labels, exposed both as
# an array (aligned with `classes`) and as a label -> weight mapping.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}
print("\nClass Weights:")
print(class_weight_dict)

Distribution
Classification_Category
confidentiality obligations    5090
remedies                        405
governing law                   256
signatures                      213
privacy & security              119
non-competition                 119
indemnification                  89
non-solicitation                 86
indirect damages waiver          85
Name: count, dtype: int64

Class Weights:
{'confidentiality obligations': np.float64(0.14104842985000157), 'governing law': np.float64(2.807572936064556), 'indemnification': np.float64(8.105734767025089), 'indirect damages waiver': np.float64(8.375925925925927), 'non-competition': np.float64(6.054886211512717), 'non-solicitation': np.float64(8.375925925925927), 'privacy & security': np.float64(6.054886211512717), 'remedies': np.float64(1.7695618153364632), 'signatures': np.float64(3.372856077554064)}


In [3]:
# SVM (baseline)
# Linear SVM on the TF-IDF features, weighted with the per-class weights
# computed from the training labels.
#
# random_state is pinned (same seed as the train/test split): the dual
# coordinate-descent solver shuffles the samples, so an unseeded LinearSVC can
# yield a slightly different model on every run, and this model is persisted
# to disk further down in the notebook.
svm = LinearSVC(class_weight=class_weight_dict, random_state=38)
svm.fit(X_train_tfidf, y_train)
pred_svm = svm.predict(X_test_tfidf)

print("\n=== SVM ===")
print(classification_report(y_test, pred_svm))

# Confusion matrix with rows/columns in the order given by `classes`.
cm = confusion_matrix(y_test, pred_svm, labels=classes)
# Optional heatmap of the confusion matrix (currently disabled):
# plt.figure(figsize=(5,3))
# sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=classes, yticklabels=classes)
# plt.title("Confusion Matrix - SVM with Class Weight")
# plt.show()


=== SVM ===
                             precision    recall  f1-score   support

confidentiality obligations       0.98      0.99      0.98      1527
              governing law       0.99      0.92      0.95        77
            indemnification       1.00      0.96      0.98        27
    indirect damages waiver       1.00      1.00      1.00        25
            non-competition       0.94      0.83      0.88        36
           non-solicitation       1.00      0.92      0.96        26
         privacy & security       0.77      0.67      0.72        36
                   remedies       0.96      0.91      0.94       121
                 signatures       0.87      0.97      0.92        64

                   accuracy                           0.97      1939
                  macro avg       0.95      0.91      0.93      1939
               weighted avg       0.97      0.97      0.97      1939



In [4]:
# Naive Bayes
# The per-class weights are expanded to one weight per training row and
# handed to fit() as sample_weight.
sample_weights = y_train.map(class_weight_dict).to_numpy()

nb = MultinomialNB().fit(X_train_tfidf, y_train, sample_weight=sample_weights)
pred_nb = nb.predict(X_test_tfidf)

nb_report = classification_report(y_test, pred_nb)
print("\n=== Naive Bayes Classification ===")
print(nb_report)



=== Naive Bayes Classification ===
                             precision    recall  f1-score   support

confidentiality obligations       1.00      0.81      0.89      1527
              governing law       0.82      0.97      0.89        77
            indemnification       0.32      0.89      0.48        27
    indirect damages waiver       0.80      0.96      0.87        25
            non-competition       0.46      1.00      0.63        36
           non-solicitation       0.54      1.00      0.70        26
         privacy & security       0.44      0.83      0.58        36
                   remedies       0.74      0.97      0.84       121
                 signatures       0.43      1.00      0.60        64

                   accuracy                           0.84      1939
                  macro avg       0.62      0.94      0.72      1939
               weighted avg       0.92      0.84      0.86      1939



In [5]:
# XGBoost
from sklearn.preprocessing import LabelEncoder

# The model is trained on integer label codes; the encoder is fitted on the
# training labels and reused for the test labels and for decoding predictions.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# From here on `classes` refers to the encoder's class list.
classes = le.classes_

# One weight per training row, looked up from the class-weight mapping.
weights = y_train.map(class_weight_dict).to_numpy()

xgb_model = xgb.XGBClassifier(
    eval_metric="mlogloss",
    num_class=len(classes),
    objective="multi:softprob",
)
xgb_model.fit(X_train_tfidf, y_train_enc, sample_weight=weights)

# Predictions come back as integer codes; map them to label strings so the
# report can be computed against the original string labels in y_test.
pred_xgb = xgb_model.predict(X_test_tfidf)
pred_xgb_labels = le.inverse_transform(pred_xgb)

print("\n=== XGBoost Classification ===")
print(classification_report(y_test, pred_xgb_labels))




=== XGBoost Classification ===
                             precision    recall  f1-score   support

confidentiality obligations       0.97      0.98      0.97      1527
              governing law       0.99      0.94      0.96        77
            indemnification       0.93      1.00      0.96        27
    indirect damages waiver       1.00      0.96      0.98        25
            non-competition       0.94      0.81      0.87        36
           non-solicitation       1.00      0.85      0.92        26
         privacy & security       0.71      0.67      0.69        36
                   remedies       0.93      0.93      0.93       121
                 signatures       0.75      0.86      0.80        64

                   accuracy                           0.96      1939
                  macro avg       0.91      0.89      0.90      1939
               weighted avg       0.96      0.96      0.96      1939



In [6]:
import os
import joblib
import json

# Output folders for the persisted models and their companion artifacts.
SVM_DIR = "nda_flask/models/shallow_svm"
XGB_DIR = "nda_flask/models/shallow_xgboost"


def _dump_shared_artifacts(target_dir):
    """Write the artifacts both models share into ``target_dir``.

    Saves the fitted TF-IDF vectorizer, the label encoder and the class list
    (as JSON). Both models were trained on the same vectorizer output, so the
    same objects are stored next to each of them.
    """
    joblib.dump(vectorizer, f"{target_dir}/tfidf_vectorizer.pkl")
    joblib.dump(le, f"{target_dir}/label_encoder.pkl")
    with open(f"{target_dir}/classes.json", "w") as f:
        json.dump(list(classes), f, indent=2)


for target_dir in (SVM_DIR, XGB_DIR):
    os.makedirs(target_dir, exist_ok=True)

# ====== Save SVM model ======
# The SVM was fitted on string labels, so the label encoder stored alongside
# it is optional for inference.
joblib.dump(svm, f"{SVM_DIR}/svm_model.pkl")
_dump_shared_artifacts(SVM_DIR)
print("SVM model saved to:", SVM_DIR)

# ====== Save XGBoost model ======
# The booster is written via save_model(); the estimator's constructor
# parameters are pickled separately.
xgb_model.save_model(f"{XGB_DIR}/xgboost_model.json")
_dump_shared_artifacts(XGB_DIR)
joblib.dump(xgb_model.get_params(), f"{XGB_DIR}/xgb_best_params.pkl")
print("XGBoost model saved to:", XGB_DIR)


SVM model saved to: nda_flask/models/shallow_svm
XGBoost model saved to: nda_flask/models/shallow_xgboost


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random Forest baseline on the same TF-IDF features.
# Labels are encoded to integer codes with an encoder fitted on the training
# labels; the same encoder decodes the predictions afterwards.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)
classes = le.classes_

# Per-row weights looked up from class_weight_dict; class imbalance is handled
# through sample_weight, which is why class_weight stays None below.
weights = y_train.map(class_weight_dict).to_numpy()

rf_model = RandomForestClassifier(
    random_state=42,
    class_weight=None,
    max_depth=None,
    n_estimators=300,
)
rf_model.fit(X_train_tfidf, y_train_enc, sample_weight=weights)

# Integer predictions are mapped back to the label strings for the report.
pred_rf = rf_model.predict(X_test_tfidf)
pred_rf_labels = le.inverse_transform(pred_rf)

print("\n=== Random Forest Classification ===")
print(classification_report(y_test, pred_rf_labels))



=== Random Forest Classification ===
                             precision    recall  f1-score   support

confidentiality obligations       0.93      1.00      0.96      1527
              governing law       1.00      0.86      0.92        77
            indemnification       0.96      1.00      0.98        27
    indirect damages waiver       1.00      0.96      0.98        25
            non-competition       0.91      0.28      0.43        36
           non-solicitation       1.00      0.69      0.82        26
         privacy & security       1.00      0.17      0.29        36
                   remedies       0.99      0.74      0.85       121
                 signatures       0.87      0.75      0.81        64

                   accuracy                           0.93      1939
                  macro avg       0.96      0.72      0.78      1939
               weighted avg       0.94      0.93      0.92      1939

