In [4]:
# Import necessary libraries
import openai
import json
import re
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from lib.openai_client import OpenAIClient

In [5]:
# Cache locations: one folder per kind of artifact this notebook produces
xml_cache_dir = './cache/xml'              # generated XML objects
json_cache_dir = './cache/json'            # generated JSON objects
contract_cache_dir = './cache/contracts'   # contracts reconstructed from XML/JSON
result_cache_dir = './cache/results'       # similarity scores per mode

# Create any cache folder that does not exist yet (no error if it already does)
for directory in (xml_cache_dir, json_cache_dir, contract_cache_dir, result_cache_dir):
    os.makedirs(directory, exist_ok=True)

In [6]:
# Load prompts
def load_prompts(mode):
    """Read the two prompt templates that belong to one serialization format.

    Args:
        mode: 'xml' or 'json'.

    Returns:
        A tuple (generation_prompt, reconstruction_prompt) holding the full
        text of the contract->object prompt file and the object->contract
        prompt file, in that order.

    Raises:
        ValueError: if mode is neither 'xml' nor 'json'.
    """
    # Reject unknown modes before touching the filesystem
    if mode != 'xml' and mode != 'json':
        raise ValueError("Invalid mode. Use 'xml' or 'json'.")

    if mode == 'xml':
        prompt_paths = (
            './prompts/generate_xml_from_contract.txt',
            './prompts/generate_contract_from_xml.txt',
        )
    else:
        prompt_paths = (
            './prompts/generate_json_from_contract.txt',
            './prompts/generate_contract_from_json.txt',
        )

    # Read generation prompt first, reconstruction prompt second
    loaded = []
    for prompt_path in prompt_paths:
        with open(prompt_path, 'r') as file:
            loaded.append(file.read())

    generation_prompt, reconstruction_prompt = loaded
    return generation_prompt, reconstruction_prompt

# Load both prompt pairs once at module level; generate_object and
# generate_contract_from_object read these globals to build their requests.
xml_generation_prompt, xml_reconstruction_prompt = load_prompts('xml')
json_generation_prompt, json_reconstruction_prompt = load_prompts('json')

In [7]:
# Utility functions
def load_contracts():
    """List the contract files available for processing.

    Returns:
        The names (not full paths) of every entry in './contracts' whose name
        ends with '.txt', sorted in ascending order.
    """
    contract_dir = './contracts'
    return sorted(
        entry for entry in os.listdir(contract_dir) if entry.endswith('.txt')
    )

def cleanup_contract(contract_text):
    """Strip model scaffolding from a reconstructed contract.

    The substitutions run in this order:
      1. remove <scratchpad>...</scratchpad> sections together with their
         content (may span several lines);
      2. remove every remaining <...> span that sits on a single line;
      3. remove lines that consist only of '---' (plus surrounding whitespace);
      4. collapse runs of blank lines into exactly one blank line.

    Args:
        contract_text: raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # (pattern, replacement, flags) - order matters, see docstring
    substitutions = (
        (r'<scratchpad>.*?</scratchpad>', '', re.DOTALL),
        (r'<.*?>', '', 0),
        (r'^\s*---\s*$', '', re.MULTILINE),
        (r'\n\s*\n', '\n\n', 0),
    )

    cleaned = contract_text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def cleanup_generated_object(text, mode):
    """Extract the tagged output section from a model response.

    Args:
        text: full response text.
        mode: 'xml' selects the <xml_output> tag; any other value selects
            <json_output>.

    Returns:
        The first <tag>...</tag> section (tags included, surrounding
        whitespace stripped). If no such section exists, a warning is printed
        and the original text is returned unchanged.
    """
    if mode == 'xml':
        tag = 'xml_output'
    else:
        tag = 'json_output'

    # DOTALL so the tagged section may span multiple lines
    found = re.search(f'<{tag}>.*?</{tag}>', text, re.DOTALL)
    if found is None:
        print(f"Warning: No <{tag}> tags found. Returning original text.")
        return text
    return found.group(0).strip()

def get_cache_filename(base_dir, original_file, extension):
    """Build the cache path that corresponds to a contract file.

    Only a trailing '.txt' is swapped for `extension`; a '.txt' that appears
    earlier in the name is left alone. If the name does not end in '.txt',
    `extension` is appended so that different extensions never map to the
    same cache file.

    Args:
        base_dir: directory the cache file lives in.
        original_file: contract file name or path; only its base name is used.
        extension: suffix for the cache file, e.g. '.xml' or
            '_reconstructed_json.txt'.

    Returns:
        The joined path `base_dir/<stem><extension>`.
    """
    base_name = os.path.basename(original_file)
    source_suffix = '.txt'
    if base_name.endswith(source_suffix):
        stem = base_name[:-len(source_suffix)]
    else:
        stem = base_name
    return os.path.join(base_dir, stem + extension)

In [8]:
# OpenAI client setup: a single shared client instance. The generation helpers
# call reset_context/add_message/get_response on it and create_embedding calls
# get_embedding. NOTE(review): credentials/config are presumably resolved inside
# lib.openai_client.OpenAIClient - confirm there.
openaiclient = OpenAIClient()

In [9]:
# Main processing functions
def generate_object(contract_text, mode):
    """Ask the model to convert a contract into an XML or JSON object.

    Args:
        contract_text: text substituted for the {{CONTRACT}} placeholder.
        mode: 'xml' uses the XML generation prompt; any other value uses the
            JSON generation prompt.

    Returns:
        The model response after cleanup_generated_object has extracted the
        tagged output section.
    """
    if mode == 'xml':
        template = xml_generation_prompt
    else:
        template = json_generation_prompt

    # Each request starts from a reset client context
    openaiclient.reset_context()
    filled_prompt = template.replace("{{CONTRACT}}", contract_text)
    openaiclient.add_message("user", filled_prompt)
    raw_reply = openaiclient.get_response()
    return cleanup_generated_object(raw_reply, mode)

def generate_contract_from_object(object_text, mode):
    """Ask the model to rebuild a contract from its XML or JSON object.

    Args:
        object_text: text substituted for the {{XML_DOCUMENT}} or
            {{JSON_DOCUMENT}} placeholder of the reconstruction prompt.
        mode: 'xml' uses the XML reconstruction prompt and placeholder; any
            other value uses the JSON ones.

    Returns:
        The model response after cleanup_contract has removed tags and
        normalized blank lines.
    """
    if mode == 'xml':
        template = xml_reconstruction_prompt
        label = 'XML'
    else:
        template = json_reconstruction_prompt
        label = 'JSON'

    # Doubled braces are literal: this yields e.g. "{{XML_DOCUMENT}}"
    placeholder = f"{{{{{label}_DOCUMENT}}}}"

    openaiclient.reset_context()
    openaiclient.add_message("user", template.replace(placeholder, object_text))
    raw_reply = openaiclient.get_response()
    return cleanup_contract(raw_reply)

def process_contract(contract_file, mode):
    """Return the XML/JSON object for a contract, generating it if not cached.

    Args:
        contract_file: file name inside './contracts'.
        mode: 'xml' caches under xml_cache_dir; any other value caches under
            json_cache_dir. Also used as the cache file extension.

    Returns:
        The cached or newly generated object text, or None if reading the
        contract, generating the object, or writing the cache raised.
    """
    if mode == 'xml':
        cache_dir = xml_cache_dir
    else:
        cache_dir = json_cache_dir
    cache_file = get_cache_filename(cache_dir, contract_file, f'.{mode}')

    if not os.path.exists(cache_file):
        print(f"Generating new {mode.upper()} for {contract_file}")
        try:
            source_path = os.path.join('./contracts', contract_file)
            with open(source_path, 'r') as source:
                contract_text = source.read()

            generated_object = generate_object(contract_text, mode)

            # Persist so later runs skip the model call
            with open(cache_file, 'w') as target:
                target.write(generated_object)
        except Exception as e:
            # Best effort: report and let the caller skip this contract
            print(f"Error processing {contract_file}: {str(e)}")
            return None
        return generated_object

    print(f"Using cached {mode.upper()} for {contract_file}")
    with open(cache_file, 'r') as cached:
        return cached.read()

def reconstruct_contract(object_text, original_contract_file, mode):
    """Return the contract rebuilt from an object, generating it if not cached.

    Args:
        object_text: XML/JSON object text to reconstruct from.
        original_contract_file: name of the source contract; determines the
            cache file name inside contract_cache_dir.
        mode: format label, embedded in the cache file name and passed on to
            generate_contract_from_object.

    Returns:
        The cached or newly generated reconstructed contract text.
    """
    cache_file = get_cache_filename(
        contract_cache_dir, original_contract_file, f'_reconstructed_{mode}.txt'
    )

    if not os.path.exists(cache_file):
        print(f"Generating new contract from {mode.upper()} for {original_contract_file}")
        reconstructed_contract = generate_contract_from_object(object_text, mode)

        # Persist so later runs skip the model call
        with open(cache_file, 'w') as target:
            target.write(reconstructed_contract)
        return reconstructed_contract

    print(f"Using cached reconstructed contract for {original_contract_file}")
    with open(cache_file, 'r') as cached:
        return cached.read()

In [10]:
# Embedding and comparison functions
def create_embedding(text):
    """Return the embedding of `text` as produced by the shared OpenAI client.

    Thin wrapper around openaiclient.get_embedding.
    NOTE(review): the result is presumably a 1-D sequence of floats, since
    compare_embeddings wraps it in a list for cosine_similarity - confirm
    against lib.openai_client.
    """
    return openaiclient.get_embedding(text)

def compare_embeddings(emb1, emb2):
    """Return the cosine similarity between two embedding vectors.

    Each vector is wrapped in a list so it is passed to cosine_similarity as
    a single-row input; the only entry of the resulting matrix is returned.
    """
    similarity_matrix = cosine_similarity([emb1], [emb2])
    return similarity_matrix[0][0]

def save_results(similarities, mode):
    """Write the similarity scores for one mode to the results cache as JSON.

    Args:
        similarities: iterable of numeric similarity scores.
        mode: format label; the file is named 'similarities_<mode>.json'
            inside result_cache_dir.
    """
    # Convert to built-in floats before opening the file: json cannot encode
    # NumPy scalars that do not subclass float (e.g. float32), and failing
    # after open(..., 'w') would leave a truncated results file behind.
    serializable = [float(score) for score in similarities]

    cache_file = os.path.join(result_cache_dir, f"similarities_{mode}.json")
    with open(cache_file, 'w') as f:
        json.dump(serializable, f)

In [11]:
# Main process
def process_contracts(mode):
    """Run the round-trip experiment for every contract in one format.

    For each file from load_contracts(): obtain the XML/JSON object, rebuild
    the contract from it, embed both the original and the rebuilt text and
    record their cosine similarity. Contracts whose object generation failed
    (process_contract returned None) are skipped.

    Args:
        mode: 'xml' or 'json'.

    Returns:
        List of similarity scores, one per successfully processed contract,
        in processing order. The same list is saved via save_results.
    """
    similarities = []

    for contract_file in load_contracts():
        print(f"Processing {contract_file}")

        # Original contract text
        source_path = os.path.join('./contracts', contract_file)
        with open(source_path, 'r') as f:
            original_contract = f.read()

        # XML/JSON object (cached or freshly generated)
        generated_object = process_contract(contract_file, mode)
        if generated_object is None:
            continue

        # Round trip back to prose
        reconstructed_contract = reconstruct_contract(generated_object, contract_file, mode)

        # Embed original first, then reconstruction, and compare
        similarity = compare_embeddings(
            create_embedding(original_contract),
            create_embedding(reconstructed_contract),
        )
        similarities.append(similarity)

        print(f"Similarity for {contract_file}: {similarity:.4f}")

    save_results(similarities, mode)
    return similarities

In [12]:
# Run the full round-trip experiment for both formats, XML first and then JSON.
# Each call returns the per-contract similarity list used by the results cell
# and writes it to the results cache via save_results.
print("Processing contracts using XML...")
xml_similarities = process_contracts('xml')

print("\nProcessing contracts using JSON...")
json_similarities = process_contracts('json')

Processing contracts using XML...
Processing agreement_01.txt
Generating new XML for agreement_01.txt
Generating new contract from XML for agreement_01.txt
Similarity for agreement_01.txt: 0.3078
Processing lease_01.txt
Generating new XML for lease_01.txt
Generating new contract from XML for lease_01.txt
Similarity for lease_01.txt: 0.3523
Processing media_01.txt
Generating new XML for media_01.txt
Generating new contract from XML for media_01.txt
Similarity for media_01.txt: 0.3909
Processing ngo_01.txt
Generating new XML for ngo_01.txt
Generating new contract from XML for ngo_01.txt
Similarity for ngo_01.txt: 0.2984
Processing professional_01.txt
Generating new XML for professional_01.txt
Generating new contract from XML for professional_01.txt
Similarity for professional_01.txt: 0.4575
Processing support_01.txt
Generating new XML for support_01.txt
Generating new contract from XML for support_01.txt
Similarity for support_01.txt: 0.3391

Processing contracts using JSON...
Processing

In [13]:
# Analyze and display results
def print_results(similarities, mode):
    """Print mean, minimum and maximum similarity for one mode.

    Args:
        similarities: sequence of similarity scores; may be empty.
        mode: format label, printed upper-cased in the header.

    An empty sequence (e.g. every contract failed to process) is reported
    explicitly instead of raising from np.min/np.max.
    """
    print(f"\nResults for {mode.upper()}:")
    if len(similarities) == 0:
        print("No similarity scores available.")
        return
    print(f"Average similarity: {np.mean(similarities):.4f}")
    print(f"Minimum similarity: {np.min(similarities):.4f}")
    print(f"Maximum similarity: {np.max(similarities):.4f}")

print_results(xml_similarities, 'xml')
print_results(json_similarities, 'json')

# Compare XML and JSON performance.
# An empty score list has no mean; represent it as NaN explicitly rather than
# letting np.mean emit a RuntimeWarning.
xml_mean = np.mean(xml_similarities) if len(xml_similarities) else float('nan')
json_mean = np.mean(json_similarities) if len(json_similarities) else float('nan')

print("\nComparison:")
if np.isnan(xml_mean) or np.isnan(json_mean):
    # Every ordering comparison with NaN is False, so without this guard the
    # code would fall through and claim the two formats performed equally well.
    print("Comparison unavailable: at least one mode has no valid similarity scores")
elif xml_mean > json_mean:
    # Reported figure is the absolute difference of the means times 100
    # (percentage points), not a relative improvement.
    print(f"XML outperformed JSON by {(xml_mean - json_mean) * 100:.2f}%")
elif json_mean > xml_mean:
    print(f"JSON outperformed XML by {(json_mean - xml_mean) * 100:.2f}%")
else:
    print("XML and JSON performed equally well")


Results for XML:
Average similarity: 0.3577
Minimum similarity: 0.2984
Maximum similarity: 0.4575

Results for JSON:
Average similarity: 0.2929
Minimum similarity: 0.2418
Maximum similarity: 0.3645

Comparison:
XML outperformed JSON by 6.48%
