In [3]:
# !pip install google-generativeai

import pandas as pd
import google.generativeai as genai
from tqdm import tqdm
import time
import random


In [6]:
file_path = "Classification-Results/Final_Classification_gemini_2.5_flash_6000_samples.csv"
df = pd.read_csv(file_path)
df["Classification_Category"] = df["Classification_Category"].str.lower().str.strip()
print(df["Classification_Category"].value_counts(normalize=False))


Classification_Category
confidentiality obligations    5090
remedies                        405
governing law                   256
signatures                      213
privacy & security              119
non-competition                  33
non-solicitation                  8
indemnification                   5
indirect damages waiver           2
Name: count, dtype: int64


In [10]:
label_definitions = {
    "confidentiality obligations": """
Describes which information is considered confidential or not, 
as well as exceptions to confidentiality and related obligations.
This is a standard section and must be retained in NDAs.
""",

    "remedies": """
This section describes what each party can do if the other party violates the NDA.
Instead of flagging if the terms are found, this will flag if the terms are NOT found,
as sometimes there is content missing from this section that should be included
(e.g. a party's ability to seek an immediate court injunction).
""",

    "privacy": """
Sometimes a company will include language that each party must comply with certain requirements 
to protect Personal Data. This should be flagged and removed, since those privacy requirements 
should be fully covered in a separate Data Processing Addendum, not in an NDA.
""",

    "indirect damages waiver": """
A non-standard section that, if found, should be flagged for removal. 
It states that one or both parties' liability for indirect or consequential damages is limited or waived.
This is not a standard NDA section and should always be removed.
""",

    "non-competition": """
A section stating that one or both parties will not compete against or interfere with each other's business.
This is not appropriate for NDAs, as they are early-stage documents and should not impose such restrictions.
This section should always be flagged or removed.
""",

    "non-solicitation": """
A section stating that one or both parties agree not to solicit or hire the other party's employees,
contractors, or consultants for a certain period of time.
This is also considered a 'bad' provision and should always be flagged or removed from NDAs.
""",

    "indemnification": """
A section that requires one or both parties to 'defend', 'indemnify', or 'hold harmless' the other party
from certain claims or legal liabilities. This is non-standard for NDAs and should always be flagged for removal.
""",

    "governing law": """
Specifies which jurisdiction's law governs the agreement and where disputes will be resolved
(e.g., any lawsuit must take place in New York and apply New York law).
This is a standard clause, although certain jurisdictions (e.g., Texas, non-U.S., Italy) may be flagged as non-standard.
""",
}


In [None]:
# definition = label_definitions.get(label.lower(), "No specific definition provided.")

# prompt = f"""
# You are an experienced contract attorney specializing in NDA (Non-Disclosure Agreement) drafting and analysis.

# Your task:
# - Generate {n_variations} new, diverse, legally valid variations of the following NDA clause.
# - Each variation must preserve the same **legal intent, enforceability, and compliance meaning** as the original.
# - You may rephrase, expand, or simplify the clause using professional legal English.
# - Avoid introducing new obligations or removing legal meaning.
# - Avoid trivial word swaps; instead, vary the **structure, tone, or legal phrasing** (e.g., restructure conditions, add/remove qualifiers, change active/passive voice, merge/split sentences).
# - Keep each version concise (1–3 sentences), grammatically correct, and in formal NDA style.

# Clause category: {label.upper()}
# Category definition:
# \"\"\"{definition}\"\"\"

# Original clause:
# \"\"\"{sentence}\"\"\"

# Output format:
# Each variation on a new line, numbered (1), (2), (3), etc.
# """


In [13]:
small_classes = [
    "indemnification",
    "non-competition",
    "non-solicitation",
    "indirect damages waiver",
]

genai.configure(api_key="AIzaSyBoWBFVTEsZMi_LLx4tPSkSOvlzej0ZmE8")
model = genai.GenerativeModel("gemini-2.5-flash")

def augment_clause(sentence, label, n_variations=3):
    definition = label_definitions.get(label.lower(), "No specific definition provided.")
    prompt = f"""
    You are an experienced corporate contract lawyer specializing in drafting and reviewing NDAs (Non-Disclosure Agreements). 
    Your goal is to generate several new, diverse, and legally valid versions of the following clause.

    Task:
    - Produce {n_variations} distinct variations of this clause.
    - Each variation must preserve the same legal intent, obligations, and enforceability as the original.
    - Avoid trivial word swaps or surface-level paraphrasing.

    Stylistic requirements:
    - Use formal legal English, consistent with NDA drafting style.
    - Keep each version concise (1-3 sentences), should not be to long.
    - Maintain clarity, consistency, and grammatical correctness.

    Clause category: {label.upper()}
    Category definition: {definition}

    Original clause:
    \"\"\"{sentence}\"\"\"

    Output format:
    Output each variation as a separate numbered line.
    """
    try:
        response = model.generate_content(prompt)
        raw_output = response.text
        if not raw_output:
            return []
        lines = [x.strip("-•1234567890. ").strip() for x in raw_output.split("\n") if len(x.strip()) > 20]
        return list(set(lines))[:n_variations]
    except Exception as e:
        print(f"Error for {label}: {e}")
        return []

augmented_rows = []

target_total = 120
n_variations = 3

for label in small_classes:
    subset = df[df["Classification_Category"] == label]
    if len(subset) == 0:
        continue
    print(f"\nAugmenting class: {label} ({len(subset)} samples)...")
    target_count = max(0, target_total - len(subset))
    repeats = max(1, target_count // (len(subset) * n_variations))
    for _, row in tqdm(subset.iterrows(), total=len(subset)):
        sentence = row["clean_sentence"]
        for _ in range(repeats):
            new_samples = augment_clause(sentence, label, n_variations=n_variations)
            for new_text in new_samples:
                augmented_rows.append({
                    "clean_sentence": new_text,
                    "Classification_Category": label,
                    "source_file": "AI_AUGMENTED_GEMINI"
                })
        time.sleep(1.2)

aug_df = pd.DataFrame(augmented_rows)
aug_df = aug_df.drop_duplicates(subset=["clean_sentence"])
aug_df = aug_df.groupby("Classification_Category", group_keys=False).apply(lambda x: x.sample(n=min(len(x), 120), random_state=42))
# aug_df.to_csv("Augmented_NDA_Clauses_cleaned.csv", index=False)
# print(aug_df["Classification_Category"].value_counts())
# merged_df = pd.concat([df, aug_df], ignore_index=True)
# merged_df.to_csv("Final_NDA_with_Augmented.csv", index=False)
# print("Total new samples generated:", len(aug_df))

# merged = pd.concat([df, aug_df])
# print("\n New class distribution:")
# print(merged["Classification_Category"].value_counts())



Augmenting class: indemnification (5 samples)...


100%|██████████| 5/5 [06:25<00:00, 77.11s/it]



Augmenting class: non-competition (33 samples)...


100%|██████████| 33/33 [07:16<00:00, 13.22s/it]



Augmenting class: non-solicitation (8 samples)...


100%|██████████| 8/8 [06:12<00:00, 46.55s/it]



Augmenting class: indirect damages waiver (2 samples)...


100%|██████████| 2/2 [05:05<00:00, 152.69s/it]
  aug_df = aug_df.groupby("Classification_Category", group_keys=False).apply(lambda x: x.sample(n=min(len(x), 120), random_state=42))


In [16]:
merged_df = pd.concat([df, aug_df], ignore_index=True)

# merged_df.to_csv("Final_NDA_with_Augmented.csv", index=False)

print("Combined dataset saved as: Final_NDA_with_Augmented.csv")
print("New class distribution after augmentation:")
print(merged_df["Classification_Category"].value_counts(normalize=False))
# print("Class percentage distribution:")
# print((merged_df["Classification_Category"].value_counts(normalize=True) * 100).round(2))

Combined dataset saved as: Final_NDA_with_Augmented.csv
New class distribution after augmentation:
Classification_Category
confidentiality obligations    5090
remedies                        405
governing law                   256
signatures                      213
privacy & security              119
non-competition                 119
indemnification                  89
non-solicitation                 86
indirect damages waiver          85
Name: count, dtype: int64


In [17]:
merged_df.to_csv("Final_NDA_with_Augmented.csv", index=False)