In [50]:
from mistralai import Mistral
import requests
import numpy as np
import os
from getpass import getpass
from dotenv import load_dotenv
from google import genai


In [18]:
api_key = os.environ["MISTRAL_API_KEY"]

client = Mistral(api_key=api_key)

original_version_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/2014-2018_network_television_code_v13.pdf"
amendment_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/changes/2018MOA-TV-National-Code_0.pdf"

def get_markdown_from_pdf(file_path):

    uploaded_pdf = client.files.upload(
        file={
            "file_name": file_path,
            "content": open(file_path, "rb"),
        },
        purpose="ocr"
    )  
    signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        }
    )

    # Create single markdown string from ocr response
    markdown_text = ""
    for page in ocr_response.pages:
        markdown_text += page.markdown

    return markdown_text



In [38]:
from pydantic import BaseModel
from typing import Optional


# Do API call to Claude to ask a question based on markdown context
system_prompt = """You are a specialized assistant focused on SAG-AFTRA agreements and contracts. 
Your task is to analyze the provided context and question, then return a structured JSON response.
provide accurate, well-referenced answers based solely on the provided context from official SAG-AFTRA documents.


Question:
{question}

Context provided:
{context}

Please provide your response in the following JSON format:
{{
    "relevant_text": "The exact text from the agreement that is relevant to the question",
    "citations": {{
        "agreement_name": "Name of the agreement",
        "section": "Section number/title",
        "paragraph": "Paragraph/Item number",
        "page": "Page number (if available)"
    }},
    "answer": "Clear and concise answer to the question based on the relevant text"
}}
"""

def call_claude(prompt):
    # Import the Anthropic client
    from anthropic import Anthropic
    
    # Initialize the client with your API key
    # Make sure to set your API key as an environment variable
    anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
    
    # Create a message and get the response from Claude
    message = anthropic.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=1000,
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )
    
    # Return the response content
    return message.content


def call_gpt4(prompt):
    # Import the OpenAI client
    from openai import OpenAI
    
    # Initialize the client with your API key
    # Make sure to set your API key as an environment variable
    client = OpenAI()  # api_key=os.getenv('OPENAI_API_KEY')

    # Create a chat completion and get the response from GPT-4
    response = client.chat.completions.create(
        model="gpt-4o",  # or "gpt-4" depending on your needs
        messages=[{
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": prompt
        }],
        temperature=0.1,  # Lower temperature for more precise responses
        max_tokens=2000
    )
    
    # Return the response content
    return response.choices[0].message.content


class Citation(BaseModel):
    agreement_name: str
    section: str
    paragraph: Optional[str] = None
    page: Optional[int] = None

class SAGResponse(BaseModel):
    relevant_text: str
    citations: Citation
    answer: str


def call_gemini_api(prompt):
    # Configure the API
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    
    # Load Gemini Pro model
    model = genai.GenerativeModel('models/gemini-2.0-flash') #Â gemini-2.5-pro-preview-03-25')
    
    # Configure generation parameters for structured output
    generation_config = {
        "temperature": 0.0,  # Lower temperature for more precise responses
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": 20000,
    }
    try:
        # Generate response
        response = model.generate_content(
            prompt,
            generation_config=generation_config,
            config={
                'response_mime_type': 'application/json',
                'response_schema': SAGResponse,
            }
        )
        try:
            parsed_response: SAGResponse = response.parsed
            return parsed_response
        except Exception as e:
            print(f"Error parsing JSON response: {e}")
            return {
                "error": "Failed to parse response as JSON",
                "raw_response": response.text
            }
                
    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        return {
            "error": str(e),
            "raw_response": None
        }

In [39]:
load_dotenv(".env")

# Data
initial_version_markdown_text = get_markdown_from_pdf(original_version_file_path)
amendment_markdown_text = get_markdown_from_pdf(amendment_file_path)

In [40]:
question =  "What was the effective rate on October 2016 for a 40-minute single program performance?"

prompt = create_sag_prompt(initial_version_markdown_text, question)
response = call_gemini_api(prompt)

print(response)
# print(response[0].text)

Error calling Gemini API: GenerativeModel.generate_content() got an unexpected keyword argument 'config'
{'error': "GenerativeModel.generate_content() got an unexpected keyword argument 'config'", 'raw_response': None}


In [48]:
from mistralai import Mistral
import os
from dotenv import load_dotenv
import google.generativeai as genai
from pydantic import BaseModel
from typing import Optional


def get_markdown_from_pdf(file_path):
    mistral_client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    uploaded_pdf = mistral_client.files.upload(
        file={
            "file_name": file_path,
            "content": open(file_path, "rb"),
        },
        purpose="ocr",
    )
    signed_url = mistral_client.files.get_signed_url(file_id=uploaded_pdf.id)

    ocr_response = mistral_client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        },
    )

    # Create single markdown string from ocr response
    markdown_text = ""
    for page in ocr_response.pages:
        markdown_text += page.markdown

    return markdown_text


# Gemini API calls and prompts


def create_initial_answer_prompt(context, question):
    prompt = """
You are a specialized assistant focused on SAG-AFTRA agreements and contracts. 
Your task is to analyze the provided context and question, then return a structured JSON response.
Provide accurate, well-referenced answers based solely on the provided context from official SAG-AFTRA documents.
Use the following schema for your response:

{{
    "relevant_text": "The exact text from the agreement that answers the question",
    "citations": {{
        "agreement_name": "Name of the agreement",
        "section": "Section number/title",
        "paragraph": "Paragraph/Item number",
        "page": "Page number (if available)"
    }},
    "answer": "Clear and concise answer to the question"
}}

Question:
{question}

Context:
{context}
"""
    return prompt.format(context=context, question=question)


class Citation(BaseModel):
    agreement_name: str
    section: str
    paragraph: Optional[str] = None
    page: Optional[int] = None


class SAGResponse(BaseModel):
    relevant_text: str
    citations: Citation
    answer: str


class RefinedResponse(BaseModel):
    amendment_relevant_text: Optional[str] = None
    refined_answer: str


def create_refine_answer_prompt(context, question, initial_relevant_context, initial_answer):
    prompt = """
You are a specialized assistant focused on SAG-AFTRA agreements and their amendments. 
Your task is to analyze if any amendments modify or affect the initial answer, and provide a structured response.

Question:
{question}

Initial agreement context:
{initial_relevant_context}

Initial Answer:
{initial_answer}

Amendment Context:
{context}

Please analyze the amendments and provide your response in the following JSON format:
{
    "amendment_relevant_text": "The exact text from amendments that modifies the initial answer (null if no relevant amendments)",
    "refined_answer": "The updated answer incorporating any amendments, or the original answer if no changes apply",
}

Important guidelines:
1. Determine if the amendments is even relevant based on the date when the amendment became valid, and the question date (if no explicit date is provided, use the current date, i.e. April 2025)
2. If no relevant amendments are found, keep the initial answer and set is_answer_changed to false
"""

    return prompt.format(
        question=question,
        initial_relevant_context=initial_relevant_context,
        initial_answer=initial_answer,
        context=context,
    )


def call_gemini_api(prompt, response_format: BaseModel):
    # Configure the API
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": response_format,
        },
    )
    return response.parsed



In [42]:

load_dotenv()
original_version_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/2014-2018_network_television_code_v13.pdf"
amendment_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/changes/2018MOA-TV-National-Code_0.pdf"

# Preprocess the files
original_version_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/2014-2018_network_television_code_v13.pdf"
amendment_file_path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/changes/2018MOA-TV-National-Code_0.pdf"

original_version_markdown = get_markdown_from_pdf(original_version_file_path)
amendment_markdown = get_markdown_from_pdf(amendment_file_path)



Error calling Gemini API: GenerativeModel.generate_content() got an unexpected keyword argument 'config'
{'error': "GenerativeModel.generate_content() got an unexpected keyword argument 'config'", 'raw_response': None}


AttributeError: 'dict' object has no attribute 'relevant_text'

In [54]:
# Create prompt
load_dotenv(".env")
question = (
    "What was the effective rate on October 2016 for a 40-minute single program performance?"
)
prompt = create_initial_answer_prompt(original_version_markdown, question)

# Call Gemini API
response = call_gemini_api(prompt, SAGResponse)

print(response)

# # Refine answer
# prompt = create_refine_answer_prompt(
#     context=amendment_markdown,
#     question=question,
#     initial_relevant_context=response.relevant_text,
#     initial_answer=response.answer,
# )
# response = call_gemini_api(prompt, RefinedResponse)
# print(response)

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'API key not valid. Please pass a valid API key.', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'API_KEY_INVALID', 'domain': 'googleapis.com', 'metadata': {'service': 'generativelanguage.googleapis.com'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'API key not valid. Please pass a valid API key.'}]}}

In [None]:
# Let us try to split the markdown at every paragraph

path = "/Users/juankostelec/Google_drive/Projects/legal-assistant-bot/data/markdown/2014-2018_network_television_code_v13.md"

with open(path, "r") as f:
    text = f.read()

chunks = text.split("\n\n")

for chunk in chunks:
    print(chunk)
    print("\n")
    breakpoint()




# SAG$\cdot$AFTRA. 
## 2014-2018 SAG-AFTRA National Code of Fair Practice for Network Television Broadcasting# TABLE OF CONTENTS 
Paragraph Subiect Page
1. Length of Contract ..... 1
2. Principal ..... 1
Performers
2.A. Dramatic Programs ..... 1
2.B. Non-Dramatic Programs ..... 3
3. Performers Who Speak Five Lines or Less. ..... 6
4. Commercial Performers and Announcers Off-Camera ..... 8
5. Groups \& Chornes ..... 11
5.A. Group Dancers ..... 11
5.B. Chorus Singers ..... 16
6. Specialty Acts ..... 20
7. Sportscasters ..... 21
8. Background Actors ..... 23
9. Live Signature Numbers ..... 27
9.A. Standard Non-Commercial Openings and Closings and Musical Signatures ..... 27
10. Promotional Announcements ..... 28
11. Sustaining Programs ..... 32
12. Rates for Programs in Excess of Two Hours and Morning News Programs ..... 32
13. Periods of Rehearsal and Rates ..... 32
14. Rehearsal Days ..... 33
15. Overtime ..... 33
16. Rest Between Days ..... 35
17. Minimum Daily Call ..... 36
18. Minimu