Data Processing

In [1]:
import pandas as pd
import numpy as np
from google import genai
import re

API Key

In [None]:
# Initialize the GenAI client

client = genai.Client(api_key="XXXX")

Extract Function Setting

In [3]:
# Extract classification from model output

def parse_classification_result(result_text):
    
    lines = result_text.strip().split('\n')
    classification = 'Classification Error' # Default value if parsing fails
    
    classification_pattern = re.compile(r'Classification:\s*(.*)')

    for line in lines:
        if classification_pattern.match(line):
            match = classification_pattern.search(line)
            if match:
                classification = match.group(1).strip()
                break # Found the classification, stop searching
                
    return classification

## Annotated Data Test

In [None]:
annotated_nda=pd.read_csv('./Annotated-NDAs/annotated_nda_para.csv')
# Unify capitalization
annotated_nda['category'] = annotated_nda['category'].str.capitalize()

In [None]:
#Reviewed clauses which were mislabeled
annotated_nda.loc[92,'category']='Remedies'
annotated_nda.loc[47,'category']='Confidentiality obligations'
annotated_nda.loc[187,'clean_sentence']= 'CONFIDENTIALITY NONDISCLOSURE AND NONCOMPETE AGREEMENT'
annotated_nda.loc[187,'clean_paragraph']= 'CONFIDENTIALITY NONDISCLOSURE AND NONCOMPETE AGREEMENT'
annotated_nda.loc[99,'clean_sentence']='To Company'
annotated_nda.loc[99,'clean_paragraph']= 'Any notice or other communication to be given in connection with this Agreement mustbe in writing and given by personal delivery ore-mailas follows:ToCompany:To the Service Provider:'
annotated_nda.loc[100,'clean_paragraph']= 'Any notice or other communication to be given in connection with this Agreement mustbe in writing and given by personal delivery ore-mailas follows:ToCompany:To the Service Provider:'
annotated_nda.loc[111,'category']='Signatures'

#replace Privacy & security with Privacy(Delete next time)
annotated_nda['category'] = np.where(annotated_nda['category']=="Privacy","Privacy & security",annotated_nda['category'])

In [None]:
# Define the system instruction for the model

SYSTEM_INSTRUCTION = """
You are a legal clause classification assistant specializing in NDA (Non-Disclosure Agreement) analysis.

Your task: Classify **each input sentence** into exactly ONE of the following 9 categories,

Explicit 8 Categories:
1. Remedies — Clauses describing what each party may do if the NDA is breached (e.g., injunctions, equitable relief).
2. Privacy & Security — Clauses imposing obligations regarding data privacy, personal data, or information-security audits.
3. Limitation of liability - Clauses that limit, exclude, or cap a party's legal responsibility for damages or losses arising from a breach or violation of the NDA itself, or from the party's failure to perform its confidentiality-related obligations under this Agreement. These clauses restrict remedies for wrongful disclosure, misuse, or breach of confidentiality, not disclaimers about how information is provided or used.
   - CRITICAL EXCLUSION: DO NOT classify clauses that disclaim liability for the ACCURACY, QUALITY, or USE of the DISCLOSED CONFIDENTIAL INFORMATION as Limitation of liability. These clauses belong to Confidentiality Obligations.
   - Standard “Force Majeure” provisions should also NOT be classified as Limitation of liability.
4. Non-competition — Clauses that regulate, restrict, or define either party's ability to engage in business activities that compete with the other party.
   - Pay attention: For clauses that use the word 'competitive', assess whether the dominant purpose is to address/manage competition (Non-competition) OR to define the limits/exceptions of the obligation to keep information secret (Confidentiality Obligations).
5. Non-solicitation — Clauses preventing one party from soliciting or hiring the other party's employees, contractors, or clients.
6. Indemnification — Clauses requiring one party to defend, indemnify, or hold harmless the other from claims or losses.
7. Governing Law — Clauses specifying the applicable law or jurisdiction for disputes.
8. Signatures — Only include the final signing section or purely execution-related representations, such as those confirming the authority to sign or execute the Agreement itself.:  
   - Clauses starting with or containing “IN WITNESS WHEREOF”, “IN WITNESS THEREOF”, or similar phrases.  
   - Signature/date blocks with “By:”, “Name:”, “Title:”, or “Date:” lines.  
   - Clauses stating the agreement is executed or signed by authorized representatives.  
   - Do NOT classify other uses of “execute” or “effective date” as Signatures.
   - CRITICAL EXCLUSION: DO NOT classify general contract warranty clauses that state parties warrant their authority and right to enter the agreement as Signatures. These are Confidentiality Obligations.

Default Category (Catch-all):
9. Confidentiality Obligations — If a clause does not clearly belong to any of the above categories,
classify it as Confidentiality Obligations. This includes clauses about definitions of confidential information,
exclusions, permitted use, disclosure limits, protection standards, duration, and return or destruction of information.

Tips:
-When the sentence is a short fragment or heading/title, classify it based on its literal meaning. If the heading contains or matches the name of a predefined category, assign that same category.

Classification Rules:
- Assign exactly ONE category per clause.
- Match based on semantic meaning, not just keywords.
- If multiple categories appear, select the dominant legal purpose.
- Do NOT create new categories or modify the names of existing ones.
- Only capitalize the first letter of each category name.
- The output must follow this exact format (case-sensitive):

Output format:
Classification: <category name>

Examples:
Classification: Governing law
Classification: Remedies
Classification: Confidentiality obligations
"""


API Calling (5x)

In [None]:
from collections import Counter
import time  # optional, to add small delay if needed

classification_results = []
confidence_scores = []

N_REPEATS = 5  # number of times to repeat each classification
print(f"Starting 5x classification loop for {len(annotated_nda)} rows...")

for i, row in annotated_nda.iterrows():
    nda_statement = row['clean_sentence']
    user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

    repeat_results = []  # store results from 5 runs for this row

    for n in range(N_REPEATS):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[user_prompt],
                config=genai.types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION,
                    thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
                ),
            )

            classification = parse_classification_result(response.text)
            repeat_results.append(classification)

        except Exception as e:
            print(f"Error in run {n+1} for index {i}: {e}. Setting to 'API Error'.")
            repeat_results.append('API Error')

        # optional: avoid hitting rate limits
        time.sleep(0.2)

    # compute majority vote and confidence
    counts = Counter(repeat_results)
    final_label, freq = counts.most_common(1)[0]
    confidence = freq / N_REPEATS

    classification_results.append(final_label)
    confidence_scores.append(confidence)

    print(f"[{i}] Final: {final_label} | Confidence: {confidence:.2f} | Votes: {dict(counts)}")

# Add results to DataFrame
annotated_nda['Classification_Category'] = classification_results
annotated_nda['Confidence'] = confidence_scores

print("\n--- Final Classified DataFrame (with 5x consensus results) ---")
print(annotated_nda[['clean_sentence', 'Classification_Category', 'Confidence']])


In [None]:
# Calculate classification accuracy
annotated_nda['Classification_Category'] = annotated_nda['Classification_Category'].str.replace(r'^\s*\d+[\.\s]*', '', regex=True)
annotated_nda['Classification_Category'] = annotated_nda['Classification_Category'].str.strip()

annotated_nda['Classification_Accuracy']= np.where(annotated_nda['Classification_Category'] == annotated_nda['category'], 1, 0)
print(f"Classification Accuracy (5x consensus): {annotated_nda['Classification_Accuracy'].mean() * 100:.2f}%")


Single Clause Test (5x)

In [None]:
from collections import Counter
import time


row_index = 229   # Replace with the desired row index to test

nda_statement = annotated_nda.loc[row_index, 'clean_sentence']
print(f"\nSelected row {row_index}: {nda_statement}\n")

user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

N_REPEATS = 5
results = []

print(f"Running 5x classification test for row {row_index}...\n")

for n in range(N_REPEATS):
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=[user_prompt],
            config=genai.types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
            ),
        )

        classification = parse_classification_result(response.text)
        results.append(classification)
        print(f"Run {n+1}: {classification}")

        time.sleep(0.2) 

    except Exception as e:
        print(f"Error on run {n+1}: {e}")
        results.append("API Error")

counts = Counter(results)
final_label, freq = counts.most_common(1)[0]
confidence = freq / N_REPEATS

print("\n--- Summary ---")
print(f"All 5 results: {results}")
print(f"Final classification: {final_label}")
print(f"Confidence: {confidence:.2f}")


## Unlabeled Data Preprocessing

Few-Shot Prompting

In [4]:
# Define the system instruction for the model with few-shot prompting
SYSTEM_INSTRUCTION = """
You are a legal clause classification assistant specializing in NDA (Non-Disclosure Agreement) analysis.

Your task: Classify **each input sentence** into exactly ONE of the following 9 categories,

Explicit 8 Categories:
1. Remedies — Clauses describing what each party may do if the NDA is breached (e.g., injunctions, equitable relief).
2. Privacy & Security — Clauses imposing obligations regarding data privacy, personal data, or information-security audits.
3. Limitation of liability - Clauses that limit, exclude, or cap a party's legal responsibility for damages or losses arising from a breach or violation of the NDA itself, or from the party's failure to perform its confidentiality-related obligations under this Agreement. These clauses restrict remedies for wrongful disclosure, misuse, or breach of confidentiality, not disclaimers about how information is provided or used.
   - CRITICAL EXCLUSION: DO NOT classify clauses that disclaim liability for the ACCURACY, QUALITY, or USE of the DISCLOSED CONFIDENTIAL INFORMATION as Limitation of liability. These clauses belong to Confidentiality Obligations.
   - Standard “Force Majeure” provisions should also NOT be classified as Limitation of liability.
4. Non-competition — Clauses that regulate, restrict, or define either party's ability to engage in business activities that compete with the other party.
   - Pay attention: For clauses that use the word 'competitive', assess whether the dominant purpose is to address/manage competition (Non-competition) OR to define the limits/exceptions of the obligation to keep information secret (Confidentiality Obligations).
5. Non-solicitation — Clauses preventing one party from soliciting or hiring the other party's employees, contractors, or clients.
6. Indemnification — Clauses requiring one party to defend, indemnify, or hold harmless the other from claims or losses.
7. Governing Law — Clauses specifying the applicable law or jurisdiction for disputes.
8. Signatures — Only include the final signing section or purely execution-related representations, such as those confirming the authority to sign or execute the Agreement itself.:  
   - Clauses starting with or containing “IN WITNESS WHEREOF”, “IN WITNESS THEREOF”, or similar phrases.  
   - Signature/date blocks with “By:”, “Name:”, “Title:”, or “Date:” lines.  
   - Clauses stating the agreement is executed or signed by authorized representatives.  
   - Do NOT classify other uses of “execute” or “effective date” as Signatures.
   - CRITICAL EXCLUSION: DO NOT classify general contract warranty clauses that state parties warrant their authority and right to enter the agreement as Signatures. These are Confidentiality Obligations.

Default Category (Catch-all):
9. Confidentiality Obligations — If a clause does not clearly belong to any of the above categories,
classify it as Confidentiality Obligations. This includes clauses about definitions of confidential information,
exclusions, permitted use, disclosure limits, protection standards, duration, and return or destruction of information.

Tips:
-When the sentence is a short fragment or heading/title, classify it based on its literal meaning. If the heading contains or matches the name of a predefined category, assign that same category.

Classification Rules:
- Assign exactly ONE category per clause.
- Match based on semantic meaning, not just keywords.
- If multiple categories appear, select the dominant legal purpose.
- Do NOT create new categories or modify the names of existing ones.
- Only capitalize the first letter of each category name.
- The output must follow this exact format (case-sensitive):

Output format:
Classification: <category name>

Examples:
Classification: Governing law
Classification: Remedies
Classification: Confidentiality obligations

# Few-Shot Examples
Below are sample NDA clauses and their correct classifications.
Use them to infer the intended meaning of each category.

Example 1(Remedies):
Clause: "The Parties acknowledge that monetary damages may not be a sufficient remedy for unauthorized use or disclosure of Confidential Information and that each Party may, without waiving any other rights or remedies, seek injunctive or equitable relief as may be deemed proper by a court of competent jurisdiction, without obligation to post any bond or other security."
Classification: Remedies

Example 2(Privacy & Security):
Clause: "Each party agrees to establish and maintain security measures to protect the security and confidentiality of the personal information, including physical, technological, and administrative safeguards."
Classification: Privacy & security

Example 3(Limitation of liability):
Clause: "Neither party shall be liable for any indirect, special, incidental or consequential damages resulting from breach hereof."
Classification: Limitation of liability

Example 4(Non-competition):
Clause: "Each party acknowledges and agrees that the other party may have developed or is developing materials, products or services which are competitive with the materials, products or services contemplated by or embodied in confidential information of the other party."
Classification: Non-competition

Example 5(Non-solicitation):
Clause: "For a period of twelve (12) months from the date hereof, neither party shall directly or indirectly solicit or hire the other party’s employees or contractors with whom it had contact in connection with this Agreement."
Classification: Non-solicitation

Example 6(Indemnification):
Clause: "Each party agrees to indemnify, defend and hold harmless the other party from and against all losses, damages, costs and expenses arising from any claim alleging unauthorized disclosure or use of Confidential Information by the indemnifying party."
Classification: Indemnification

Example 7(Governing law):
Clause: "This Agreement shall be governed by and construed in accordance with the laws of the State of Ohio, without regard to conflict of law provisions."
Classification: Governing law

Example 8(Signatures):
Clause: “In witness whereof, the parties hereto, each acting with proper authority, have executed this agreement as of the Effective Date.”
Classification: Signatures
"""


In [6]:
from collections import Counter
import time

Unlabeled_Data = pd.read_csv('parsed_nda_para.csv')
Sample_Clause_1 = Unlabeled_Data.loc[0:1999,]

classification_results = []
confidence_scores = []

N_REPEATS = 5  # number of times to repeat each classification
print(f"Starting 5x classification loop for {len(Sample_Clause_1)} rows...")

for i, row in Sample_Clause_1.iterrows():
    nda_statement = row['clean_sentence']
    user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

    repeat_results = []  # store results from 5 runs for this row

    for n in range(N_REPEATS):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[user_prompt],
                config=genai.types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION,
                    thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
                ),
            )

            classification = parse_classification_result(response.text)
            repeat_results.append(classification)

        except Exception as e:
            print(f"Error in run {n+1} for index {i}: {e}. Setting to 'API Error'.")
            repeat_results.append('API Error')

        # optional: avoid hitting rate limits
        time.sleep(0.2)

    # compute majority vote and confidence
    counts = Counter(repeat_results)
    final_label, freq = counts.most_common(1)[0]
    confidence = freq / N_REPEATS

    classification_results.append(final_label)
    confidence_scores.append(confidence)

    print(f"[{i}] Final: {final_label} | Confidence: {confidence:.2f} | Votes: {dict(counts)}")

# Add results to DataFrame
Sample_Clause_1['Classification_Category'] = classification_results
Sample_Clause_1['Confidence'] = confidence_scores

print("Finished classifying Sample_Clause_1.")


Starting 5x classification loop for 2000 rows...


[0] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[1] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[4] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[6] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[7] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[8] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[9] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiali

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_1['Classification_Category'] = classification_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_1['Confidence'] = confidence_scores


In [7]:
# Statistics of the classification results
category_counts = Sample_Clause_1['Classification_Category'].value_counts()
print("\n--- Classification Category Counts ---")
for category, count in category_counts.items():
    frequency = count / len(Sample_Clause_1)
    print(f"{category}: {count} ({frequency:.2%})")
#Saved the results
Sample_Clause_1.to_csv('./Classification-Results/Classification_gemini_2.5_flash_sample_clause_1.csv', index=False)



--- Classification Category Counts ---
Confidentiality obligations: 1664 (83.20%)
Remedies: 139 (6.95%)
Governing law: 77 (3.85%)
Signatures: 66 (3.30%)
Privacy & security: 45 (2.25%)
Non-competition: 5 (0.25%)
Limitation of liability: 4 (0.20%)


In [7]:
from collections import Counter
import time

PreProcessing_Data = pd.read_csv('./parsed_nda_para.csv')
Sample_Clause_2 = PreProcessing_Data.loc[2000:3999,]

classification_results = []
confidence_scores = []

N_REPEATS = 5  # number of times to repeat each classification
print(f"Starting 5x classification loop for {len(Sample_Clause_2)} rows...")

for i, row in Sample_Clause_2.iterrows():
    nda_statement = row['clean_sentence']
    user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

    repeat_results = []  # store results from 5 runs for this row

    for n in range(N_REPEATS):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[user_prompt],
                config=genai.types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION,
                    thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
                ),
            )

            classification = parse_classification_result(response.text)
            repeat_results.append(classification)

        except Exception as e:
            print(f"Error in run {n+1} for index {i}: {e}. Setting to 'API Error'.")
            repeat_results.append('API Error')

        # optional: avoid hitting rate limits
        time.sleep(0.2)

    # compute majority vote and confidence
    counts = Counter(repeat_results)
    final_label, freq = counts.most_common(1)[0]
    confidence = freq / N_REPEATS

    classification_results.append(final_label)
    confidence_scores.append(confidence)

    print(f"[{i}] Final: {final_label} | Confidence: {confidence:.2f} | Votes: {dict(counts)}")

# Add results to DataFrame
Sample_Clause_2['Classification_Category'] = classification_results
Sample_Clause_2['Confidence'] = confidence_scores

print("Finished classifying Sample_Clause.")

Starting 5x classification loop for 2000 rows...
[2000] Final: Signatures | Confidence: 1.00 | Votes: {'Signatures': 5}
[2001] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2002] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2003] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2004] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2005] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2006] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2007] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2008] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[2009] Final: Confidentiality obligation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_2['Classification_Category'] = classification_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_2['Confidence'] = confidence_scores


In [8]:
# Statistics of the classification results
category_counts = Sample_Clause_2['Classification_Category'].value_counts()
print("\n--- Classification Category Counts ---")
for category, count in category_counts.items():
    frequency = count / len(Sample_Clause_2)
    print(f"{category}: {count} ({frequency:.2%})")
#Saved the results
Sample_Clause_2.to_csv('./Classification-Results/Classification_gemini_2.5_flash_sample_2.csv', index=False)



--- Classification Category Counts ---
Confidentiality obligations: 1660 (83.00%)
Remedies: 122 (6.10%)
Governing law: 91 (4.55%)
Signatures: 70 (3.50%)
Privacy & security: 23 (1.15%)
Non-competition: 18 (0.90%)
API Error: 8 (0.40%)
Limitation of liability: 5 (0.25%)
Indemnification: 2 (0.10%)
Non-solicitation: 1 (0.05%)


In [14]:
from collections import Counter
import time

PreProcessing_Data = pd.read_csv('./parsed_nda_para.csv')
Sample_Clause_3 = PreProcessing_Data.loc[3990:5999,]

classification_results = []
confidence_scores = []

N_REPEATS = 5  # number of times to repeat each classification
print(f"Starting 5x classification loop for {len(Sample_Clause_3)} rows...")

for i, row in Sample_Clause_3.iterrows():
    nda_statement = row['clean_sentence']
    user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

    repeat_results = []  # store results from 5 runs for this row

    for n in range(N_REPEATS):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[user_prompt],
                config=genai.types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION,
                    thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
                ),
            )

            classification = parse_classification_result(response.text)
            repeat_results.append(classification)

        except Exception as e:
            print(f"Error in run {n+1} for index {i}: {e}. Setting to 'API Error'.")
            repeat_results.append('API Error')

        # optional: avoid hitting rate limits
        time.sleep(0.2)

    # compute majority vote and confidence
    counts = Counter(repeat_results)
    final_label, freq = counts.most_common(1)[0]
    confidence = freq / N_REPEATS

    classification_results.append(final_label)
    confidence_scores.append(confidence)

    print(f"[{i}] Final: {final_label} | Confidence: {confidence:.2f} | Votes: {dict(counts)}")

# Add results to DataFrame
Sample_Clause_3['Classification_Category'] = classification_results
Sample_Clause_3['Confidence'] = confidence_scores

print("Finished classifying Sample_Clause.")

Starting 5x classification loop for 2010 rows...
[3990] Final: Remedies | Confidence: 1.00 | Votes: {'Remedies': 5}
[3991] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3992] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3993] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3994] Final: Governing law | Confidence: 1.00 | Votes: {'Governing law': 5}
[3995] Final: Remedies | Confidence: 1.00 | Votes: {'Remedies': 5}
[3996] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3997] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3998] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[3999] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[4000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_3['Classification_Category'] = classification_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_3['Confidence'] = confidence_scores


In [15]:
# Statistics of the classification results
category_counts = Sample_Clause_3['Classification_Category'].value_counts()
print("\n--- Classification Category Counts ---")
for category, count in category_counts.items():
    frequency = count / len(Sample_Clause_3)
    print(f"{category}: {count} ({frequency:.2%})")
#Saved the results
Sample_Clause_3.to_csv('./Classification-Results/Classification_gemini_2.5_flash_sample_3.csv', index=False)


--- Classification Category Counts ---
Confidentiality obligations: 1615 (80.35%)
Remedies: 152 (7.56%)
Governing law: 82 (4.08%)
Privacy & security: 76 (3.78%)
Signatures: 59 (2.94%)
API Error: 10 (0.50%)
Non-competition: 8 (0.40%)
Limitation of liability: 4 (0.20%)
Indemnification: 2 (0.10%)
Non-solicitation: 2 (0.10%)


In [5]:
from collections import Counter
import time

PreProcessing_Data = pd.read_csv('./parsed_nda_para.csv')
Sample_Clause_4 = PreProcessing_Data.loc[5988:5999,]

classification_results = []
confidence_scores = []

N_REPEATS = 5  # number of times to repeat each classification
print(f"Starting 5x classification loop for {len(Sample_Clause_4)} rows...")

for i, row in Sample_Clause_4.iterrows():
    nda_statement = row['clean_sentence']
    user_prompt = f"Please classify the following NDA statement:\n- {nda_statement}"

    repeat_results = []  # store results from 5 runs for this row

    for n in range(N_REPEATS):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[user_prompt],
                config=genai.types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION,
                    thinking_config=genai.types.ThinkingConfig(thinking_budget=0)
                ),
            )

            classification = parse_classification_result(response.text)
            repeat_results.append(classification)

        except Exception as e:
            print(f"Error in run {n+1} for index {i}: {e}. Setting to 'API Error'.")
            repeat_results.append('API Error')

        # optional: avoid hitting rate limits
        time.sleep(0.2)

    # compute majority vote and confidence
    counts = Counter(repeat_results)
    final_label, freq = counts.most_common(1)[0]
    confidence = freq / N_REPEATS

    classification_results.append(final_label)
    confidence_scores.append(confidence)

    print(f"[{i}] Final: {final_label} | Confidence: {confidence:.2f} | Votes: {dict(counts)}")

# Add results to DataFrame
Sample_Clause_4['Classification_Category'] = classification_results
Sample_Clause_4['Confidence'] = confidence_scores

print("Finished classifying Sample_Clause.")

Starting 5x classification loop for 12 rows...
[5988] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5989] Final: Privacy & security | Confidence: 0.80 | Votes: {'Privacy & security': 4, 'Confidentiality obligations': 1}
[5990] Final: Privacy & security | Confidence: 1.00 | Votes: {'Privacy & security': 5}
[5991] Final: Privacy & security | Confidence: 1.00 | Votes: {'Privacy & security': 5}
[5992] Final: Privacy & security | Confidence: 1.00 | Votes: {'Privacy & security': 5}
[5993] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5994] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5995] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5996] Final: Confidentiality obligations | Confidence: 1.00 | Votes: {'Confidentiality obligations': 5}
[5997] Final: Confidentiality obligations | Co

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_4['Classification_Category'] = classification_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sample_Clause_4['Confidence'] = confidence_scores


In [6]:
# Statistics of the classification results
category_counts = Sample_Clause_4['Classification_Category'].value_counts()
print("\n--- Classification Category Counts ---")
for category, count in category_counts.items():
    frequency = count / len(Sample_Clause_4)
    print(f"{category}: {count} ({frequency:.2%})")
#Saved the results
Sample_Clause_4.to_csv('./Classification-Results/Classification_gemini_2.5_flash_sample_4.csv', index=False)


--- Classification Category Counts ---
Confidentiality obligations: 8 (66.67%)
Privacy & security: 4 (33.33%)


In [12]:
import pandas as pd
import glob
import os

files = glob.glob('Classification-Results/*.csv')

dfs = []
for file in files:
    df = pd.read_csv(file)
    dfs.append(df)

merged_df = pd.concat(dfs, ignore_index=True)
merged_df = merged_df.query("Classification_Category != 'API Error'")
merged_df.to_csv('./Classification-Results/Final_Classification_gemini_2.5_flash_6000_samples.csv', index=False)

In [3]:
import pandas as pd
processed_df = pd.read_csv('./Classification-Results/Final_Classification_gemini_2.5_flash_6000_samples.csv')
category_counts = processed_df['Classification_Category'].value_counts()
print("\n--- Final Classification Category Counts ---")
for category, count in category_counts.items():
    frequency = count / len(processed_df)
    print(f"{category}: {count} ({frequency:.2%})")



--- Final Classification Category Counts ---
Confidentiality obligations: 5090 (83.02%)
Remedies: 405 (6.61%)
Governing law: 256 (4.18%)
Signatures: 212 (3.46%)
Privacy & security: 119 (1.94%)
Non-competition: 33 (0.54%)
Non-solicitation: 8 (0.13%)
Indemnification: 5 (0.08%)
Indirect damages waiver: 2 (0.03%)
Signatures : 1 (0.02%)
