In [73]:
import ast
import json
import os
import re

import anthropic
from dotenv import load_dotenv
from supabase import create_client


In [74]:
# Load environment configuration before any client is built; the Supabase
# settings are read from os.environ further down.
# (python-dotenv: presumably reads a local .env file — confirm its location.)
# As the last expression of the cell, the call's return value is displayed.
load_dotenv()

True

In [75]:
# Anthropic API client used for every messages.create() call in this notebook.
# No api_key is passed explicitly; the SDK is expected to pick it up from the
# environment (ANTHROPIC_API_KEY per the SDK docs — confirm it is set in .env).
client = anthropic.Anthropic()

In [76]:
# Connect to Supabase using the URL and service key taken from the environment.
supabase = create_client(
    os.environ["SUPABASE_URL"],
    os.environ["SUPABASE_SERVICE_KEY"],
)

# Fetch every declassification rule whose level is not "U"
# (presumably "U" = unclassified, i.e. nothing to redact — confirm).
response = (
    supabase.table("declass_rules")
    .select("rule, grouping, level, content")
    .neq("level", "U")
    .execute()
)

# Rows as returned by the query; each carries rule, grouping, level and content.
rules = response.data

In [77]:
# Words of page 1 of the sample document, as (num, text) records.
response2 = (
    supabase.table("words")
    .select("num,text")
    .eq("doc_id", "511f609a-8e71-483d-8f20-9438d9e1b8df")
    .eq("page", 1)
    .execute()
)

# Display the fetched rows (last expression of the cell).
response2.data

[{'num': 0, 'text': 'Declassified'},
 {'num': 1, 'text': 'and'},
 {'num': 2, 'text': 'Approved'},
 {'num': 3, 'text': 'For'},
 {'num': 4, 'text': 'Release'},
 {'num': 5, 'text': '2012/05/23'},
 {'num': 6, 'text': ':'},
 {'num': 7, 'text': 'CIA-RDP99-01448R000402200001-0'},
 {'num': 8, 'text': 'TRANSCRIPT'},
 {'num': 9, 'text': 'FOR'},
 {'num': 10, 'text': 'PUBLIC'},
 {'num': 11, 'text': 'AFFAIRS'},
 {'num': 12, 'text': 'STAFF'},
 {'num': 13, 'text': '--'},
 {'num': 14, 'text': 'station'},
 {'num': 15, 'text': 'WAMU-FM'},
 {'num': 16, 'text': 'ne'},
 {'num': 17, 'text': 'PROGRAM'},
 {'num': 18, 'text': 'The'},
 {'num': 19, 'text': 'Diane'},
 {'num': 20, 'text': 'Rehm'},
 {'num': 21, 'text': 'Show'},
 {'num': 22, 'text': 'city'},
 {'num': 23, 'text': 'Washington,'},
 {'num': 24, 'text': 'D.c.'},
 {'num': 25, 'text': 'DATE'},
 {'num': 26, 'text': 'November'},
 {'num': 27, 'text': '22,'},
 {'num': 28, 'text': '1993'},
 {'num': 29, 'text': '10:05'},
 {'num': 30, 'text': 'AM'},
 {'num': 31, 

In [78]:
# Words of page 2 of the same document, as (num, text) records.
response3 = (
    supabase.table("words")
    .select("num,text")
    .eq("doc_id", "511f609a-8e71-483d-8f20-9438d9e1b8df")
    .eq("page", 2)
    .execute()
)

# Keep only the text of each record, in the order the query returned them.
words_page2 = [row["text"] for row in response3.data]

In [79]:
# System-prompt template for the redaction task. It is rendered later with
# str.format(), so single-brace fields ({RULES}, {EXAMPLE_RULE_n},
# {EXAMPLE_LEVEL_n}) are placeholders, while doubled braces ({{ and }}) become
# literal braces in the final prompt.
# NOTE(review): the requested output format uses unquoted keys (start:, end:,
# rule:, ...), so the model's answer is neither valid JSON nor a Python literal;
# that is why a plain eval() of the extracted block fails further down.
prompt_string = """
You are tasked with generating a list of redactions for a given list of words based on a set of rules. Your goal is to identify sections of text that need to be redacted according to the provided rules and output a list of non-overlapping redaction intervals. The user will provide the list of words, the rules are provided in this system prompt.

Here is the list of rules to apply:
<rules>
{RULES}
</rules>

Follow these steps to complete the task:

1. Read through the list of words carefully. Consider that these words may form complete sentences or parts of sentences.

2. Review each rule in the provided list. Pay attention to the rule number, grouping, level, and content description.

3. For each rule, scan the list of words to identify any sections that match or closely relate to the rule's content description.

4. When you find a match, determine the start and end indices for the section that should be redacted. The start index should be the index of the first word to be redacted, and the end index should be the index of the last word to be redacted.

5. Keep in mind that you are processing one page at a time. There may be text preceding the beginning or following the end of the given list of words. When deciding on redactions near the beginning or end of the list, consider what the likely context would be and redact accordingly.

6. Ensure that your redaction intervals do not overlap. If you find overlapping intervals, merge them into a single, larger interval that covers all the words that need to be redacted.

7. Create a list of redaction objects. Each object should have the following properties:
   - start: the index of the first word to be redacted (integer, minimum 0)
   - end: the index of the last word to be redacted (integer, maximum len(words)-1)
   - rule: the exact rule identifier from the rules list that triggered this redaction (use the "rule" field from the matching rule)
   - level: the exact level specified in the matching rule (use the "level" field from the matching rule)
   - reasoning: a brief explanation of why this section matches the rule (string)

8. Sort the list of redaction objects by the start index in ascending order.

Output your final list of redactions in the following format:
<redactions>
[{{start:index(int),end:index(int),rule:rule_id_from_rules,level:level_from_rules,reasoning:"brief explanation"}},
{{start:index(int),end:index(int),rule:rule_id_from_rules,level:level_from_rules,reasoning:"brief explanation"}},
{{start:index(int),end:index(int),rule:rule_id_from_rules,level:level_from_rules,reasoning:"brief explanation"}},
...]
</redactions>

Here's an example of what your output might look like:
<example>
<redactions>
[{{start:0,end:5,rule:"{EXAMPLE_RULE_1}",level:"{EXAMPLE_LEVEL_1}",reasoning:"This section contains personal identifying information matching rule 1"}},
{{start:10,end:15,rule:"{EXAMPLE_RULE_2}",level:"{EXAMPLE_LEVEL_2}",reasoning:"This content describes sensitive company operations prohibited by rule 2"}},
{{start:20,end:25,rule:"{EXAMPLE_RULE_3}",level:"{EXAMPLE_LEVEL_3}",reasoning:"These words match keywords associated with proprietary technology in rule 3"}}]
</redactions>
</example>

Remember:
- NO OVERLAPPING INTERVALS in your final output
- Use ONLY the exact rule identifiers and levels as they appear in the rules list
- The start index should be at minimum 0
- The end index should be at maximum len(words)-1
"""

In [80]:
# Use the first three fetched rules as the concrete identifiers/levels shown in
# the prompt's worked example, so the example matches the real rule format
# (e.g. rule "3.3.1-28" with level "S").
example_rule_1, example_level_1 = rules[0]["rule"], rules[0]["level"]
example_rule_2, example_level_2 = rules[1]["rule"], rules[1]["level"]
example_rule_3, example_level_3 = rules[2]["rule"], rules[2]["level"]

# Render the template. RULES receives the Python list itself, so the prompt
# contains its str() representation.
final_prompt = prompt_string.format_map(
    {
        "RULES": rules,
        "EXAMPLE_RULE_1": example_rule_1,
        "EXAMPLE_LEVEL_1": example_level_1,
        "EXAMPLE_RULE_2": example_rule_2,
        "EXAMPLE_LEVEL_2": example_level_2,
        "EXAMPLE_RULE_3": example_rule_3,
        "EXAMPLE_LEVEL_3": example_level_3,
    }
)

In [81]:
# Sanity check: show the fully rendered system prompt before it is sent.
print(final_prompt)


You are tasked with generating a list of redactions for a given list of words based on a set of rules. Your goal is to identify sections of text that need to be redacted according to the provided rules and output a list of non-overlapping redaction intervals. The user will provide the list of words, the rules are provided in this system prompt.

Here is the list of rules to apply:
<rules>
</rules>

Follow these steps to complete the task:

1. Read through the list of words carefully. Consider that these words may form complete sentences or parts of sentences.

2. Review each rule in the provided list. Pay attention to the rule number, grouping, level, and content description.

3. For each rule, scan the list of words to identify any sections that match or closely relate to the rule's content description.

4. When you find a match, determine the start and end indices for the section that should be redacted. The start index should be the index of the first word to be redacted, and the e

In [82]:
# Page 1 request. Reuses the `anthropic` import and the `client` created at the
# top of the notebook instead of re-importing and rebuilding them here.
# Note: `response` previously held the Supabase rules query result; from this
# cell on it refers to the Anthropic message for page 1.
response = client.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=2048,
    system=[
        {
            "type": "text",
            "text": final_prompt,
            # Mark the (large, rule-bearing) system prompt as cacheable so that
            # later requests with the same prompt can read it from the cache.
            "cache_control": {"type": "ephemeral"},
        }
    ],
    # The user turn is the page's word records ({"num", "text"}) as JSON text.
    messages=[{"role": "user", "content": json.dumps(response2.data)}],
)

# Token accounting, including cache creation/read counters.
print(response.usage.model_dump_json())

{"cache_creation_input_tokens":7187,"cache_read_input_tokens":0,"input_tokens":5394,"output_tokens":292}


In [83]:
print(response.content[0].text)

I'll analyze the text to identify sections that need to be redacted according to the provided rules.

<redactions>
[{start:7,end:7,rule:"3.7.3-8",level:"S",reasoning:"Contains a CIA reference number (CIA-RDP99-01448R000402200001-0) which is a technical identifier that could reveal system information"},
{start:145,end:154,rule:"3.4.1-4",level:"C",reasoning:"Names of Washington Post staff writers involved in examining intelligence documents, which reveals general information about intelligence activities and methods"},
{start:166,end:171,rule:"3.2-2",level:"C",reasoning:"References to previously closed files on the assassination being made available, revealing general information about sensitive data sets"},
{start:176,end:191,rule:"3.5.1-6",level:"C",reasoning:"Contains general information about U.S. leaders' decisions related to intelligence matters that could damage foreign relations"},
{start:379,end:379,rule:"3.7.3-8",level:"S",reasoning:"Contains a CIA reference number (CIA-RDP99-0

In [84]:
# Page 2 request with the same cached system prompt; the usage printed below
# reports the cache-read token count.
# Caution: this rebinds `response2`, which until now held the Supabase result
# for page 1 (used as input by the page-1 request). Re-running that earlier
# request after this cell will therefore fail unless the page-1 query is
# re-run first.
response2 = client.messages.create(
    messages=[{"role": "user", "content": json.dumps(response3.data)}],
    system=[
        {
            "cache_control": {"type": "ephemeral"},
            "text": final_prompt,
            "type": "text",
        }
    ],
    max_tokens=2048,
    model="claude-3-7-sonnet-20250219",
)

print(response2.usage.model_dump_json())

{"cache_creation_input_tokens":0,"cache_read_input_tokens":7187,"input_tokens":6009,"output_tokens":226}


In [89]:
# Pull the text between the <redactions> tags out of the page-1 model reply.
# re.DOTALL lets the non-greedy group span the newlines between entries.
redactions_match = re.search(
    r'<redactions>\s*(.*?)\s*</redactions>', response.content[0].text, re.DOTALL
)
# Fail loudly when the tags are missing (e.g. a truncated reply); otherwise
# `redactions_only` would stay unbound, or silently keep a value from a
# previous run of this cell.
if redactions_match is None:
    raise ValueError("No <redactions>...</redactions> block found in the model response")
redactions_only = redactions_match.group(1)

In [95]:
# Failed experiment kept for the record: the extracted text uses unquoted keys
# (start:, end:, ...), so Python evaluates `start` as a variable name and raises
# the NameError shown below.
# NOTE(review): eval() executes whatever the model returned — avoid on
# untrusted output.
eval(redactions_only)

NameError: name 'start' is not defined

In [96]:
def convert_to_json(input_string):
    """Parse a JavaScript-style list of objects into Python data.

    The model emits objects with unquoted keys, e.g.
    ``[{start:7,end:7,rule:"3.7.3-8",level:"S",reasoning:"..."}]``.
    This function quotes the bare keys and parses the result.

    Despite the name, the return value is Python data, not JSON text.

    Args:
        input_string: Text of the list, with or without the outer ``[`` ``]``
            and with optional surrounding whitespace.

    Returns:
        A list of the parsed objects (dicts for the redaction format above).

    Raises:
        ValueError or SyntaxError: If the text is not made solely of literals
            (strings, numbers, lists, dicts, ...). Nothing in the input is
            ever executed.
    """
    # Quoted strings are matched first and copied through unchanged, so text
    # such as ', note:' inside a reasoning string is never mistaken for a key.
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\])*"'                # double-quoted string
        r"|'(?:\\.|[^'\\])*'"               # single-quoted string
        r'|([{,])\s*([a-zA-Z0-9_]+)\s*:'    # bare key after '{' or ','
    )

    def quote_key(match):
        # Group 2 is only set for the bare-key alternative.
        if match.group(2) is None:
            return match.group(0)
        return f'{match.group(1)}"{match.group(2)}":'

    quoted = token_pattern.sub(quote_key, input_string)

    # Drop surrounding whitespace before removing the outer brackets; otherwise
    # the brackets survive and the result is wrapped in an extra list.
    inner = quoted.strip().strip('[]')

    # literal_eval accepts literals only, unlike eval(), which would run any
    # expression the model happened to return.
    return ast.literal_eval(f"[{inner}]")

In [97]:
# Parse the page-1 redactions; as the last expression of the cell, the
# resulting list of dicts is displayed below.
convert_to_json(redactions_only)

[{'start': 7,
  'end': 7,
  'rule': '3.7.3-8',
  'level': 'S',
  'reasoning': 'Contains a CIA reference number (CIA-RDP99-01448R000402200001-0) which is a technical identifier that could reveal system information'},
 {'start': 145,
  'end': 154,
  'rule': '3.4.1-4',
  'level': 'C',
  'reasoning': 'Names of Washington Post staff writers involved in examining intelligence documents, which reveals general information about intelligence activities and methods'},
 {'start': 166,
  'end': 171,
  'rule': '3.2-2',
  'level': 'C',
  'reasoning': 'References to previously closed files on the assassination being made available, revealing general information about sensitive data sets'},
 {'start': 176,
  'end': 191,
  'rule': '3.5.1-6',
  'level': 'C',
  'reasoning': "Contains general information about U.S. leaders' decisions related to intelligence matters that could damage foreign relations"},
 {'start': 379,
  'end': 379,
  'rule': '3.7.3-8',
  'level': 'S',
  'reasoning': 'Contains a CIA refer