In [1]:
!pip install -q -r requirements.txt

In [2]:
import openai
import json
import re
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
from lib.openai_client import OpenAIClient

In [4]:
# Cache locations: generated JSON, reconstructed contracts, and similarity results.
json_cache_dir = './cache/json'
contract_cache_dir = './cache/contracts_json'
result_cache_dir = './cache/results_json'

# Create each cache directory up front; directories that already exist are left alone.
for cache_dir in (json_cache_dir, contract_cache_dir, result_cache_dir):
    os.makedirs(cache_dir, exist_ok=True)



In [5]:
# Load the prompt
# Template used by generate_json_object; its {{JSON_SCHEMA}} and {{CONTRACT}}
# placeholders are substituted there.
with open('./prompts/generate_json_from_contract.txt', 'r') as file:
    json_generation_prompt = file.read()

# Template used by generate_contract_from_json; its {{JSON_SCHEMA}} and
# {{JSON_DOCUMENT}} placeholders are substituted there.
with open('./prompts/generate_contract_from_json.txt', 'r') as file:
    contract_generation_prompt = file.read()

# Load the JSON schema
# Kept as raw text (not parsed) because it is pasted verbatim into the prompts.
with open('schema/contract_schema.json', 'r') as file:
    json_schema = file.read()

In [6]:
def load_contracts(contract_dir='./contracts'):
    """Return the sorted filenames of the ``.txt`` contracts in ``contract_dir``.

    Args:
        contract_dir: Directory to scan. Defaults to ``'./contracts'``, the
            location that used to be hard-coded, so existing zero-argument
            calls behave exactly as before.

    Returns:
        list[str]: Bare filenames (not full paths) that end in ``.txt``,
        sorted in ascending order.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``contract_dir`` does not exist).
    """
    # Only the filenames are returned; callers join them with the directory.
    return sorted(name for name in os.listdir(contract_dir) if name.endswith('.txt'))



In [7]:
def cleanup_contract(contract_text):
    """Strip tag scaffolding from a generated contract and normalise its spacing.

    Applied in order: delete every ``<scratchpad>...</scratchpad>`` section,
    delete any remaining ``<...>`` tag, delete ``---`` separator lines,
    collapse runs of blank lines into a single blank line, and trim the ends.

    Args:
        contract_text: Raw contract text.

    Returns:
        str: The cleaned contract text.
    """
    # (pattern, flags) pairs; every match is deleted, in this order.
    removals = (
        # Whole scratchpad section; DOTALL lets it span several lines.
        (r'<scratchpad>.*?</scratchpad>', re.DOTALL),
        # Any other <...> tag (only the tag itself, confined to one line).
        (r'<.*?>', 0),
        # Separator lines made of '---'; adjacent whitespace is consumed too.
        (r'^\s*---\s*$', re.MULTILINE),
    )

    cleaned = contract_text
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Squash whitespace-only gaps down to one blank line, then trim both ends.
    return re.sub(r'\n\s*\n', '\n\n', cleaned).strip()

In [8]:
def cleanup_json(json_text):
    """Return the first ``<contract>...</contract>`` span found in ``json_text``.

    The returned text is the whole match, i.e. it still includes the
    ``<contract>`` and ``</contract>`` tags themselves. When no such span
    exists a warning is printed and the input is returned unchanged.

    NOTE(review): the tags are kept in the result; confirm that is intended
    if callers expect bare JSON.

    Args:
        json_text: Text that may contain a ``<contract>`` section.

    Returns:
        str: The matched span (tags included), or ``json_text`` itself.
    """
    tagged = re.search(r'<contract>.*?</contract>', json_text, re.DOTALL)

    if tagged is None:
        # Nothing to extract: fall back to the text as given.
        print("Warning: No <contract> tags found in the JSON. Returning original text.")
        return json_text

    return tagged.group(0).strip()

In [9]:
# Load contract files
# load_contracts() yields the sorted bare filenames (not paths) of the .txt
# files in ./contracts; generate_data() iterates over this module-level list.
contract_files = load_contracts()

In [10]:
# Shared module-level client. The generation helpers below call its
# reset_context / add_message / get_response methods, and create_embeddings
# calls get_embedding.
openaiclient = OpenAIClient()

In [11]:
def get_cache_filename(base_dir, original_file, extension):
    """Build the cache path for ``original_file`` inside ``base_dir``.

    The directory part of ``original_file`` is dropped, a trailing ``.txt``
    (if present) is removed, and ``extension`` is appended.

    Args:
        base_dir: Cache directory the result should live in.
        original_file: Filename or path of the source contract.
        extension: Suffix to append, e.g. ``'.json'`` or ``'_new.json'``.

    Returns:
        str: ``base_dir`` joined with the derived cache filename.
    """
    base_name = os.path.basename(original_file)

    # Strip only a *trailing* '.txt'. str.replace would rewrite every '.txt'
    # occurring in the name, and would leave names without '.txt' with no
    # suffix at all, so different cache kinds would collide on the same path.
    if base_name.endswith('.txt'):
        base_name = base_name[:-len('.txt')]

    return os.path.join(base_dir, base_name + extension)


In [12]:
def generate_json_object(contract_text, schema):
    """Ask the model to turn ``contract_text`` into JSON text.

    The JSON-generation prompt template has its ``{{JSON_SCHEMA}}`` and
    ``{{CONTRACT}}`` placeholders filled in and is sent as a single user
    message on a freshly reset client context.

    Args:
        contract_text: Contract text substituted for ``{{CONTRACT}}``.
        schema: Schema text substituted for ``{{JSON_SCHEMA}}``.

    Returns:
        str: The text between ``<json_output>`` and ``</json_output>`` passed
        through ``cleanup_json``; if either tag is missing, a warning is
        printed and the whole stripped response is passed through
        ``cleanup_json`` instead.
    """
    open_tag, close_tag = "<json_output>", "</json_output>"

    openaiclient.reset_context()
    filled_prompt = json_generation_prompt.replace("{{JSON_SCHEMA}}", schema)
    filled_prompt = filled_prompt.replace("{{CONTRACT}}", contract_text)
    openaiclient.add_message("user", filled_prompt)
    response = openaiclient.get_response()

    # Without both tags there is nothing to slice out: use the full response.
    if open_tag not in response or close_tag not in response:
        print("Warning: <json_output> tags not found in the response. Returning full response.")
        return cleanup_json(response.strip())

    # Text after the first opening tag, cut at the first closing tag.
    after_open = response.split(open_tag)[1]
    payload = after_open.split(close_tag)[0].strip()
    return cleanup_json(payload)


In [13]:
def generate_contract_from_json(json_document, schema):
    """Ask the model to write a contract from ``json_document``.

    The contract-generation prompt template has its ``{{JSON_SCHEMA}}`` and
    ``{{JSON_DOCUMENT}}`` placeholders filled in and is sent as a single user
    message on a freshly reset client context.

    Args:
        json_document: JSON text substituted for ``{{JSON_DOCUMENT}}``.
        schema: Schema text substituted for ``{{JSON_SCHEMA}}``.

    Returns:
        str: The model response with surrounding whitespace stripped.
    """
    openaiclient.reset_context()

    # Fill the placeholders in the same order as listed here.
    filled_prompt = contract_generation_prompt
    for placeholder, value in (("{{JSON_SCHEMA}}", schema), ("{{JSON_DOCUMENT}}", json_document)):
        filled_prompt = filled_prompt.replace(placeholder, value)

    openaiclient.add_message("user", filled_prompt)
    return openaiclient.get_response().strip()

In [14]:
def process_contract_to_json(contract_file, schema):
    """Return the JSON text for ``contract_file``, generating it on a cache miss.

    Args:
        contract_file: Path of the contract text file to read.
        schema: Schema text forwarded to ``generate_json_object``.

    Returns:
        str | None: The cached or newly generated JSON text, or ``None`` when
        reading, generating or caching raised an exception (the error is
        printed rather than re-raised).
    """
    cache_file = get_cache_filename(json_cache_dir, contract_file, '.json')

    if not os.path.exists(cache_file):
        print(f"Generating new JSON for {contract_file}")
        try:
            with open(contract_file, 'r') as source:
                contract_text = source.read()

            json_object = generate_json_object(contract_text, schema)

            # Persist the result so later runs can skip the model call.
            with open(cache_file, 'w') as sink:
                sink.write(json_object)
        except Exception as e:
            # Best effort: report the failure and let the caller skip this contract.
            print(f"Error processing {contract_file}: {str(e)}")
            return None
        return json_object

    print(f"Using cached JSON for {contract_file}")
    with open(cache_file, 'r') as cached:
        return cached.read()


In [15]:
def generate_new_json_from_reconstructed(reconstructed_contract, schema, original_contract_file):
    """Return JSON text generated from a reconstructed contract, using the cache.

    The cache entry lives in the JSON cache directory under the original
    contract's name with a ``_new.json`` suffix.

    Args:
        reconstructed_contract: Contract text to convert.
        schema: Schema text forwarded to ``generate_json_object``.
        original_contract_file: Name of the source contract; determines the
            cache filename and appears in the progress messages.

    Returns:
        str: The cached or newly generated JSON text.
    """
    cache_file = get_cache_filename(json_cache_dir, original_contract_file, '_new.json')

    if not os.path.exists(cache_file):
        print(f"Generating new JSON from reconstructed contract for {original_contract_file}")
        new_json_object = generate_json_object(reconstructed_contract, schema)

        # Store the fresh result for subsequent runs.
        with open(cache_file, 'w') as sink:
            sink.write(new_json_object)
        return new_json_object

    print(f"Using cached new JSON for {original_contract_file}")
    with open(cache_file, 'r') as cached:
        return cached.read()

In [16]:
def process_json_to_contract(json_object, original_contract_file, schema):
    """Return the contract reconstructed from ``json_object``, using the cache.

    On a cache miss the contract is generated, passed through
    ``cleanup_contract`` and written to the contract cache directory under the
    original contract's name with a ``_reconstructed.txt`` suffix.

    Args:
        json_object: JSON text to turn back into a contract.
        original_contract_file: Name of the source contract; determines the
            cache filename and appears in the progress messages.
        schema: Schema text forwarded to ``generate_contract_from_json``.

    Returns:
        str: The cached or newly generated (and cleaned) contract text.
    """
    cache_file = get_cache_filename(contract_cache_dir, original_contract_file, '_reconstructed.txt')

    if not os.path.exists(cache_file):
        print(f"Generating new contract from JSON for {original_contract_file}")

        # Generate, then clean before caching so the cache holds the cleaned text.
        reconstructed_contract = cleanup_contract(generate_contract_from_json(json_object, schema))

        with open(cache_file, 'w') as sink:
            sink.write(reconstructed_contract)
        return reconstructed_contract

    print(f"Using cached reconstructed contract for {original_contract_file}")
    with open(cache_file, 'r') as cached:
        return cached.read()

In [17]:
def compare_embeddings(original_embeddings, new_embeddings):
    """Compute the cosine similarity of each original/new embedding pair.

    Pairs are formed positionally with ``zip``, so any surplus entries in the
    longer sequence are ignored.

    Args:
        original_embeddings: Sequence of embedding vectors.
        new_embeddings: Sequence of embedding vectors, paired by position.

    Returns:
        list: One similarity score per pair, in input order.
    """
    # cosine_similarity works on 2-D inputs, so each vector is wrapped in a
    # one-row list and the single cell of the 1x1 result is picked out.
    return [
        cosine_similarity([before], [after])[0][0]
        for before, after in zip(original_embeddings, new_embeddings)
    ]

In [18]:
def create_embeddings(jsons):
    """Request an embedding from the shared client for every item in ``jsons``.

    Args:
        jsons: Iterable of texts to embed.

    Returns:
        list: The embeddings returned by ``openaiclient.get_embedding``, in
        the same order as the input.
    """
    # The loop variable is deliberately not called `json`, to avoid shadowing
    # the imported module of that name inside this function.
    return [openaiclient.get_embedding(document) for document in jsons]

In [19]:
def save_results(similarities):
    """Write the similarity scores to ``similarities.json`` in the results cache.

    Args:
        similarities: Iterable of numeric similarity scores.

    Returns:
        None. The scores are written as a JSON array to
        ``<result_cache_dir>/similarities.json``, replacing any existing file.
    """
    cache_file = os.path.join(result_cache_dir, "similarities.json")

    # Coerce to built-in floats first: json.dumps raises TypeError for NumPy
    # scalars that are not float subclasses (e.g. float32). For plain floats
    # and float64 the serialised text is unchanged.
    json_string = json.dumps([float(score) for score in similarities])

    with open(cache_file, 'w') as f:
        f.write(json_string)


In [20]:
def generate_data():
    """Run the contract -> JSON -> contract -> JSON round trip for every contract.

    Iterates over the module-level ``contract_files``. For each contract the
    original JSON is produced, a contract is reconstructed from it, and a new
    JSON is produced from the reconstruction. A 50-character preview of each
    stage is printed. Contracts whose original JSON could not be produced are
    skipped with a message.

    Returns:
        tuple[list, list]: ``(original_jsons, new_jsons)``, holding one entry
        per successfully processed contract, in matching order.
    """
    contract_dir = './contracts'
    original_jsons, new_jsons = [], []

    for contract_file in contract_files:
        source_path = os.path.join(contract_dir, contract_file)

        # Stage 1: original contract -> JSON (None signals a failure).
        original_json = process_contract_to_json(source_path, json_schema)
        if original_json is None:
            print(f"Skipping processing for {contract_file} due to JSON generation error.")
            continue

        original_json = cleanup_json(original_json)
        print(f"Original JSON for {contract_file}:")
        print(original_json[:50] + "...")
        print("\n")
        original_jsons.append(original_json)

        # Stage 2: JSON -> reconstructed contract.
        reconstructed_contract = process_json_to_contract(original_json, contract_file, json_schema)
        print(f"Reconstructed contract for {contract_file}:")
        print(reconstructed_contract[:50] + "...")
        print("\n")

        # Stage 3: reconstructed contract -> new JSON.
        new_json = cleanup_json(
            generate_new_json_from_reconstructed(reconstructed_contract, json_schema, contract_file)
        )
        print(f"New JSON generated from reconstructed contract for {contract_file}:")
        print(new_json[:50] + "...")
        print("\n")
        new_jsons.append(new_json)

    return original_jsons, new_jsons

In [21]:
# Generate data
original_jsons, new_jsons = generate_data()

# Create embeddings
print("Creating embeddings for original JSONs...")
original_embeddings = create_embeddings(original_jsons)
print("Creating embeddings for new JSONs...")
new_embeddings = create_embeddings(new_jsons)

# Compare embeddings
print("Comparing embeddings...")
similarities = compare_embeddings(original_embeddings, new_embeddings)

# Print results
# The index counts successfully processed contracts, so it can differ from a
# contract's position in contract_files when some contracts were skipped.
for i, similarity in enumerate(similarities):
    print(f"Similarity for contract {i+1}: {similarity:.4f}")

if similarities:
    print(f"\nAverage similarity: {np.mean(similarities):.4f}")
    print(f"Minimum similarity: {np.min(similarities):.4f}")
    print(f"Maximum similarity: {np.max(similarities):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence, so the summary
    # is skipped when no contract was processed.
    print("\nNo contracts were processed; skipping summary statistics.")
save_results(similarities)
print("Similarities saved in cache.")

Using cached JSON for ./contracts/agreement_01.txt
Original JSON for agreement_01.txt:
{
  "contract": {
    "parties": [
      {
       ...


Using cached reconstructed contract for agreement_01.txt
Reconstructed contract for agreement_01.txt:
**CONTRACT AGREEMENT**

This Contract Agreement (“...


Using cached new JSON for agreement_01.txt
New JSON generated from reconstructed contract for agreement_01.txt:
{
  "contract": {
    "parties": [
      {
       ...


Using cached JSON for ./contracts/employment_01.txt
Original JSON for employment_01.txt:
{
  "contract": {
    "parties": [
      {
       ...


Using cached reconstructed contract for employment_01.txt
Reconstructed contract for employment_01.txt:
**EMPLOYMENT CONTRACT**

**This Employment Contrac...


Using cached new JSON for employment_01.txt
New JSON generated from reconstructed contract for employment_01.txt:
{
  "contract": {
    "parties": [
      {
       ...


Using cached JSON for ./contracts/lease_01.txt
Original 