In [1]:
import os
from datetime import datetime
from pathlib import Path

from openai import OpenAI
import instructor
from pydantic import BaseModel, Field
from typing import List, Optional, Dict

import math
from collections import Counter

In [2]:
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
# ---------------------------------------------------------------------
# 1.  Endpoint client
# ---------------------------------------------------------------------
# Build the OpenAI-compatible client for the hosted endpoint and wrap it with
# instructor in a single expression, so `client` only ever names the wrapped
# object (responses are automatically parsed into the requested response_model).
client = instructor.from_openai(
    OpenAI(
        base_url="https://model-7qr7px53.api.baseten.co/environments/production/sync/v1",
        # Falls back to a placeholder string when BASE10_API_KEY is not set.
        api_key=os.environ.get("BASE10_API_KEY", "YOUR_API_KEY"),
    ),
    mode=instructor.Mode.MD_JSON,
)

In [5]:
# ---------------------------------------------------------------------
# 2.  Design an appropriate system prompt for these tasks
# ---------------------------------------------------------------------

In [6]:
# TODO: Improve the system prompt for better performance on legal documents
# Baseline system prompt used by the first summarization request; SYSTEM_PROMPT
# is reassigned to a more detailed prompt later in the notebook.
SYSTEM_PROMPT = """You are an expert legal transcript analyzer"""


In [7]:
# ---------------------------------------------------------------------
# 3.  Load the transcript
# ---------------------------------------------------------------------

In [8]:
# Read the whole transcript into memory. Path.read_text() opens and closes the
# file itself, so no file handle is left dangling for the life of the kernel
# (the previous bare open(...).read() never closed it).
RAW_TRANSCRIPT = Path('transcript.txt').read_text()

In [9]:
# ---------------------------------------------------------------------
# 3.  Summarize the transcript (example)
# ---------------------------------------------------------------------

In [10]:
# Minimal user prompt for the example summary. `{raw_transcript}` is a
# str.format placeholder that is filled in when the request is built.
USER_PROMPT_TEMPLATE="""
Summarize this transcript
{raw_transcript}
"""

In [11]:
# Response schema for the example summary request: a single required free-text
# `summary` field.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic may surface class docstrings in the generated JSON schema,
# which would change what is sent to the model. Confirm before converting.
class TranscriptSummary(BaseModel):
    summary: str = Field(..., description="Transcript summary")

In [12]:
# Request a summary with the current SYSTEM_PROMPT. The call returns a pair:
# the reply parsed as a TranscriptSummary and the raw completion object.
summary_response, raw_summary_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(
                role="user",
                content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
            ),
        ],
        response_model=TranscriptSummary,
        model="qwen-3",
        temperature=0.6,
    )
)


In [13]:
print(summary_response.summary)

The transcript documents a deposition led by Terry Seligman, representing plaintiff Moonlight Plaza Associates, examining Ms. Jacob, a senior environmental project manager at Cancun Farms. Ms. Jacob confirms her role involves overseeing environmental remediation and assessments through hired consultants, though she does not directly manage day-to-day activities. She reviews consultant-provided documents and discusses remedial scope but does not dictate report preparation. Adjustments to scope are based on collaborative discussions and site conditions. Ms. Jacob reports to Cancun’s Vice President of Environmental and does not review property leases, focusing instead on regulatory compliance to achieve closure on spill sites. Two exhibits (Cancun Farms 1 and 2) are referenced during the deposition.


In [14]:
# Refined system prompt. This reassigns SYSTEM_PROMPT, so every request built
# after this cell uses this text instead of the one-line baseline.
SYSTEM_PROMPT = """
You are an expert legal transcript analyst with a strong focus on legal precision and clarity. 
Your role is to carefully analyze depositions, ensuring all legal terminology, procedural details, and testimony nuances are correctly interpreted and clearly conveyed.
Maintain a professional, detailed, and exact tone appropriate for legal professionals relying on your summaries.
"""


In [15]:
# Same summary request as before, re-issued so it picks up the refined
# SYSTEM_PROMPT; overwrites summary_response / raw_summary_completion.
summary_response, raw_summary_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(
                role="user",
                content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
            ),
        ],
        response_model=TranscriptSummary,
        model="qwen-3",
        temperature=0.6,
    )
)


In [16]:
print(summary_response.summary)

The transcript details a deposition led by Terry Seligman, counsel for plaintiff Moonlight Plaza Associates, examining Ms. Jacob, a senior environmental project manager at Cancun Farms. After confirming technical setup and procedural instructions, Seligman establishes Ms. Jacob's role in overseeing environmental remediation at Cancun Farms, clarifying her reliance on external consultants for day-to-day remediation tasks. Key topics include her review of consultant-generated documents, her involvement in scoping remediation efforts, and her reliance on site history and regulatory requirements (rather than leases) to determine appropriate remediation scope. Objections are noted during questions about direct control over consultants. The deposition concludes with the marking of Exhibits Cancun Farms 1 and 2 for record-keeping.


In [17]:
# This model also includes reasoning tokens. Do you agree with its reasoning?

In [18]:
print(raw_summary_completion.choices[0].message.reasoning_content)

Okay, let's tackle this query. The user wants a summary of the provided transcript in a specific JSON format. First, I need to understand the content of the transcript.

The transcript starts with Speaker A, Terry Seligman, introducing himself and explaining the deposition process to Ms. Jacob. He checks if she's under any influence and confirms she's the designated deponent. Then, he presents documents (Cancun Farms 1 and 2) and asks about her role as a senior environmental project manager. The discussion covers her responsibilities, interactions with consultants, and how Cancun Farms determines remediation scopes. There are objections from Speaker C, but the main points are about her role, document reviews, and company procedures.

Now, the summary needs to capture all these elements concisely. The JSON schema requires a "summary" key. I should mention the parties involved, the purpose of the deposition, her role, document discussions, scope determination, objections, and the conclus

In [19]:
# Load the deposition outline that is embedded in the structured-summary
# prompt. Path.read_text() closes the file as soon as it has been read, unlike
# the previous bare open(...).read(), which left the handle open.
outline = Path('sample.txt').read_text()

In [20]:
# Opening section of the structured deposition summary: who the deponent is,
# which attorneys are present, and what was said/observed at the start.
# `deponent_name` and `attorneys_present` are required; the rest default to None.
# Used as the type of DepositionSummary.introduction.
class Introduction(BaseModel):
    deponent_name: str = Field(..., description="Full name of the deponent")
    role: Optional[str] = Field(None, description="Role or title of the deponent in the organization or case")
    attorneys_present: List[str] = Field(..., description="Names of attorneys present during the deposition")
    instructions_given: Optional[str] = Field(None, description="Instructions provided to the deponent at the beginning")
    initial_demeanor_notes: Optional[str] = Field(None, description="Observed demeanor of the deponent at the start")

# Deponent's professional background and connection to the case.
# Only `job_title` is required; the other fields default to None.
# Used as the type of KeyTestimonyAreas.background_and_role.
class BackgroundAndRole(BaseModel):
    job_title: str = Field(..., description="Job title of the deponent")
    employer: Optional[str] = Field(None, description="Current or past employer relevant to the deposition")
    job_duties: Optional[str] = Field(None, description="Description of the deponent's job responsibilities")
    case_relationship: Optional[str] = Field(None, description="How the deponent is related to the case")

# What the deponent does and does not control: authority, consultant
# interactions, documents handled and stated limitations.
# Every field is optional free text (defaults to None).
class ScopeOfWork(BaseModel):
    decision_making_authority: Optional[str] = Field(None, description="Extent of the deponent's decision-making authority")
    interactions_with_consultants: Optional[str] = Field(None, description="Nature of the deponent's interactions with consultants")
    documents_handled: Optional[str] = Field(None, description="Types of documents the deponent handled")
    limitations: Optional[str] = Field(None, description="Limitations on the deponent's role or responsibilities")

# Legally significant items pulled from the testimony. Each field is an
# optional list of strings (None when nothing was reported for that category).
class LegalRelevance(BaseModel):
    admissions: Optional[List[str]] = Field(None, description="Statements that may be admissions")
    contradictions: Optional[List[str]] = Field(None, description="Inconsistencies in the testimony")
    regulatory_references: Optional[List[str]] = Field(None, description="Mentions of regulations or legal standards")
    contract_references: Optional[List[str]] = Field(None, description="Mentions of contracts or contractual obligations")

# Container grouping the three testimony sub-sections. `background_and_role`
# is required; `scope_of_work` and `legal_relevance` may be None.
class KeyTestimonyAreas(BaseModel):
    background_and_role: BackgroundAndRole = Field(..., description="Deponent's background and role in the matter")
    scope_of_work: Optional[ScopeOfWork] = Field(None, description="Deponent's scope of work and responsibilities")
    legal_relevance: Optional[LegalRelevance] = Field(None, description="Legally significant parts of the testimony")

# Observations bearing on how reliable the testimony appears. All fields are
# optional free text (default None).
class CredibilityNotes(BaseModel):
    clarity: Optional[str] = Field(None, description="Clarity and coherence of the deponent's answers")
    technical_issues: Optional[str] = Field(None, description="Any audio or technical disruptions")
    signs_of_evasion: Optional[str] = Field(None, description="Indicators that the deponent may be evasive")
    demeanor: Optional[str] = Field(None, description="Behavior or attitude impacting credibility")

# One objection raised during the deposition. Only `description` is required;
# who objected and the effect on the record are optional.
class Objection(BaseModel):
    description: str = Field(..., description="Description of the objection raised")
    objecting_party: Optional[str] = Field(None, description="Name of the attorney or party who raised the objection")
    impact_on_testimony: Optional[str] = Field(None, description="Effect the objection had on the witness or the record")

# A single quoted statement from the deponent, optionally tagged with the time
# reference at which it appears. The timestamp is a plain string; no format is
# enforced by this model.
class Quote(BaseModel):
    quote_text: str = Field(..., description="Exact quoted statement from the deponent")
    timestamp_reference: Optional[str] = Field(None, description="Time reference where the quote appears")

# Top-level response schema for the structured deposition summary request.
# Required: case_title, deponent, introduction, key_testimony_areas and
# summary_of_key_points. Everything else defaults to None.
# NOTE(review): `deponent` here and `introduction.deponent_name` both carry the
# deponent's name; nothing in the model forces them to agree.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic may surface class docstrings in the generated JSON schema,
# which would change what is sent to the model. Confirm before converting.
class DepositionSummary(BaseModel):
    # Case-level header fields.
    case_title: str = Field(..., description="Title of the legal case")
    date_of_deposition: Optional[str] = Field(None, description="Date when the deposition was taken")
    deponent: str = Field(..., description="Name of the deponent")
    deposed_by: Optional[str] = Field(None, description="Name of the examining attorney")
    timestamps: Optional[str] = Field(None, description="Timestamp log format or file reference")

    # Nested sections, each defined by its own model in this cell.
    introduction: Introduction = Field(..., description="Introduction section of the deposition")
    key_testimony_areas: KeyTestimonyAreas = Field(..., description="Main areas of testimony relevant to the case")
    credibility_notes: Optional[CredibilityNotes] = Field(None, description="Credibility-related observations")
    objections_noted: Optional[List[Objection]] = Field(None, description="List of objections made during deposition")
    important_quotes: Optional[List[Quote]] = Field(None, description="Key quotes from the deponent")
    summary_of_key_points: List[str] = Field(..., description="Summary list of important points from the deposition")

# Prompt for the structured deposition summary. This is an f-string: `outline`
# and `RAW_TRANSCRIPT` are interpolated right here, and the request below sends
# the resulting string as-is (it does not call .format on it).
# The stray "# " that used to sit before the closing quotes has been removed so
# the prompt no longer ends with a dangling heading marker after the transcript.
USER_PROMPT_TEMPLATE = f"""
You are an expert legal transcript analyst.  
**Task:** Use the following deposition outline to generate a structured summary of the transcript.

### Outline:
{outline}

### Transcript:
{RAW_TRANSCRIPT}
"""

In [21]:
# Structured deposition summary: the reply is parsed as a DepositionSummary.
# The user prompt is passed through unchanged because it was fully built when
# USER_PROMPT_TEMPLATE was assigned.
summary_response, raw_summary_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(role="user", content=USER_PROMPT_TEMPLATE),
        ],
        response_model=DepositionSummary,
        model="qwen-3",
        temperature=0.6,
    )
)


In [22]:
summary_response.case_title

'Moonlight Plaza Associates v. Cancun Farms'

In [23]:
summary_response.deponent

'Andrea Jacob'

In [24]:
summary_response.deposed_by

'Terry Seligman (Richmond and Levine, P.C.)'

In [25]:
summary_response.introduction.deponent_name, summary_response.introduction.role


('Andrea Jacob', 'Senior Environmental Project Manager')

In [26]:
summary_response.key_testimony_areas.background_and_role


BackgroundAndRole(job_title='Senior Environmental Project Manager', employer='Cancun Farms', job_duties='Oversee assessment and remediation work on sites with open spills; interface with environmental consultants.', case_relationship='Designated corporate representative for deposition topics related to environmental remediation.')

In [27]:
summary_response.key_testimony_areas.scope_of_work


ScopeOfWork(decision_making_authority='Reviews and discusses scope proposals from consultants but does not dictate specific remedial methods.', interactions_with_consultants='Collaborates with consultants on remediation plans; reviews and edits consultant-prepared documents.', documents_handled='Environmental reports and remediation plans (e.g., Exhibit Cancun Farms 1 and 2).', limitations='Does not review leases; defers to consultants for report preparation per regulatory standards.')

In [28]:
summary_response.key_testimony_areas.legal_relevance


LegalRelevance(admissions=['I do not normally review leases.', 'We just work with my consultant to close the spills.'], contradictions=None, regulatory_references=['State and local regulations for no further action letters.'], contract_references=['Environmental consultant agreements.'])

In [29]:
summary_response.credibility_notes


CredibilityNotes(clarity='Answers generally clear but occasionally affected by technical delays.', technical_issues='Audio delay noted; deponent repeated questions for clarity.', signs_of_evasion='No overt evasion; direct answers provided.', demeanor='Professional and cooperative despite technical challenges.')

In [30]:
summary_response.objections_noted


[Objection(description='Objection to form (unclear specific ground); overruled.', objecting_party='Speaker C', impact_on_testimony='Deponent answered after repetition of the question.')]

In [31]:
summary_response.important_quotes


[Quote(quote_text='I do not normally review leases.', timestamp_reference='00:06:45'),
 Quote(quote_text='I just work with my consultant to close the spills.', timestamp_reference='00:07:06')]

In [32]:
summary_response.summary_of_key_points


['Andrea Jacob oversees environmental remediation projects for Cancun Farms but relies on third-party consultants for day-to-day execution.',
 'She does not review leases or dictate specific remedial methods to consultants, instead focusing on scope discussions.',
 'Admitted to excluding lease reviews from remediation decisions, citing reliance on consultants.',
 'Remediation goals align with state and local regulatory standards for closure.',
 "Designated corporate representative with direct oversight of environmental consultants' work products."]

In [33]:
print(raw_summary_completion.choices[0].message.reasoning_content)

Okay, let me tackle this query step by step. The user wants me to generate a structured JSON summary of a deposition transcript based on the provided outline. First, I need to parse the transcript and map the information to the schema they specified.

Starting with the **Case Title**, the transcript mentions "Cancun Farms" and the plaintiff "Moonlight Plaza Associates," so I'll combine that into the case title. The **Deponent** is clearly Ms. Jacob, and the **Date of Deposition** isn't explicitly stated, so I'll leave it as null. **Timestamps Covered** should be from the start to the end of the transcript, which is up to 00:08:46. The **Deposed By** is Terry Seligman from Richmond and Levine, P.C., representing the plaintiff.

Moving to the **Introduction** section: The deponent's name is Andrea Jacob, and her role is Senior Environmental Project Manager. The attorneys present are Terry Seligman and possibly someone named Porter (from "Mr. Porter" when marking exhibits). The instructio

In [34]:
# ---------------------------------------------------------------------
# 4.  Classify the type of legal proceeding
# ---------------------------------------------------------------------

In [35]:
# TODO: Design a prompt to classify the type of legal proceeding

In [36]:
# Classification prompt, kept as a plain (non-f) string with a
# `{raw_transcript}` placeholder. The requests that use it call
# `.format(raw_transcript=RAW_TRANSCRIPT)`, so the transcript is substituted
# exactly once, at request time. Previously this was an f-string that already
# contained the transcript, and the later `.format` call re-parsed the
# transcript text itself -- any `{` or `}` in it would raise.
USER_PROMPT_TEMPLATE = """
You are an expert legal transcript analyst.  

**Task:** Determine the type of legal proceeding.
   Common types include:
   - Deposition  
   - Arbitration  
   - Trial (Civil or Criminal)  
   - Hearing (Administrative, Pretrial, Evidentiary, etc.)  
   - Mediation  
   - Sentencing  
   - Motion Hearing  
   - Voir Dire  
   - Grand Jury Proceeding

   
### Transcript:
{raw_transcript}
"""

In [37]:
# TODO: Add attributes to a Pydantic model for the classification response

In [38]:
# Response schema for the proceeding-type classification: one required string.
# NOTE(review): the field is an unconstrained `str`, so the model is free to
# return labels outside the list given in the prompt -- confirm that is intended.
class LegalProceeding(BaseModel):
    type_of_legal_proceeding: str = Field(..., description="Type of Legal Proceeding")

In [39]:
# Single classification request; the reply is parsed as a LegalProceeding and
# the raw completion is kept alongside it.
proceeding_response, raw_proceeding_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(
                role="user",
                content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
            ),
        ],
        response_model=LegalProceeding,
        model="qwen-3",
        temperature=0.2,
    )
)

In [40]:
# Repeat the classification five times and collect each predicted label in
# `results`. proceeding_response / raw_proceeding_completion are overwritten on
# every pass, so after the loop they hold the last run.
results = []

for trial in range(5):
    proceeding_response, raw_proceeding_completion = (
        client.chat.completions.create_with_completion(
            messages=[
                dict(role="system", content=SYSTEM_PROMPT),
                dict(
                    role="user",
                    content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
                ),
            ],
            response_model=LegalProceeding,
            model="qwen-3",
            # Decrease to make it more deterministic, legal classification
            temperature=0.2,
        )
    )

    results.append(proceeding_response.type_of_legal_proceeding)

In [41]:
def entropy(labels):
    """Return the Shannon entropy, in bits, of the label distribution.

    Args:
        labels: Sized collection of hashable labels (it must support ``len``).

    Returns:
        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. ``0.0`` when every label is identical or when
        ``labels`` is empty.
    """
    n_labels = len(labels)
    bits = 0.0

    # Accumulate term by term so the floating-point result is built in
    # exactly one fixed order.
    for frequency in Counter(labels).values():
        probability = frequency / n_labels
        bits = bits - probability * math.log2(probability)

    return bits

In [42]:
entropy(results) 

0.0

In [43]:
print(raw_proceeding_completion.choices[0].message.reasoning_content)

Okay, let's tackle this. The user wants me to determine the type of legal proceeding based on the provided transcript.

First, I'll read through the transcript to pick up clues. The speaker starts by introducing himself as Terry Seligman from a law firm, representing the plaintiff. He mentions "deposition" several times in his instructions to the witness. For example, he says, "I just have a list of general instructions I give to all deponents," which is a strong indicator of a deposition. 

The witness, Ms. Jacob, is being questioned about her role at Cancun Farms, her responsibilities, and her interactions with environmental consultants. The attorney also refers to exhibits being marked as "Cancun Farms 1" and "Cancun Farms 2," which is typical in depositions where documents are entered into evidence. 

There's an objection from another speaker (probably opposing counsel) at 00:04:05 and 00:05:04, which is common in depositions when attorneys object to questions but still allow the w

In [44]:
# ---------------------------------------------------------------------
# 5.  Identify each speaker listed
# ---------------------------------------------------------------------

In [45]:
# TODO: Design a prompt to identify each speaker listed (Speaker A, Speaker B, etc.)

In [46]:
# Speaker-identification prompt, kept as a plain (non-f) string with a
# `{raw_transcript}` placeholder. The request that uses it calls
# `.format(raw_transcript=RAW_TRANSCRIPT)`, so the transcript is substituted
# exactly once, at request time. Previously this was an f-string that already
# contained the transcript, and the later `.format` call re-parsed the
# transcript text itself -- any `{` or `}` in it would raise.
USER_PROMPT_TEMPLATE = """
You are an expert legal transcript analyst.  

**Task:** Identify and label each speaker in the transcript (e.g., Speaker A, Speaker B, etc.) with their most likely role or name, based on the context. 
    Common roles include:
    - Attorney (name if available)
    - Witness
    - Judge
    - Court Reporter
    - Clerk
    - Expert
    - Defendant
    - Plaintiff
    - Arbitrator
   
### Transcript:
{raw_transcript}
"""

In [47]:
# Response schema for speaker identification: one required dict from string
# keys to string values. Per the field description, keys are the speakers and
# values are their name or role; the model does not constrain either.
class Speakers(BaseModel):
    speakers: Dict[str, str] = Field(..., description="Mapping of speaker to their name or role")


In [48]:
# TODO: Add attributes to a Pydantic model for the speaker response

In [49]:
# Speaker-identification request; the reply is parsed as a Speakers model and
# the raw completion is kept alongside it.
speakers_response, raw_speakers_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(
                role="user",
                content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
            ),
        ],
        response_model=Speakers,
        model="qwen-3",
        temperature=0.6,
    )
)

In [50]:
speakers_response.speakers

{'Speaker A': 'Attorney (Plaintiff, Terry Seligman, Richmond and Levine, P.C.)',
 'Speaker B': 'Witness (Senior Environmental Project Manager, Cancun Farms)',
 'Speaker C': 'Attorney (Defense, objecting counsel)'}

In [51]:
# ---------------------------------------------------------------------
# 6.  Identify sections of cross-talk
# ---------------------------------------------------------------------

In [52]:
# TODO: Design a prompt to identify sections of cross-talk

In [53]:
# Cross-talk detection prompt, kept as a plain (non-f) string with a
# `{raw_transcript}` placeholder. The request that uses it calls
# `.format(raw_transcript=RAW_TRANSCRIPT)`, so the transcript is substituted
# exactly once, at request time. Previously this was an f-string that already
# contained the transcript, and the later `.format` call re-parsed the
# transcript text itself -- any `{` or `}` in it would raise.
USER_PROMPT_TEMPLATE = """
You are an expert legal transcript analyst.  

**Task:** Identify every instance of cross-talk in the transcript — where two or more speakers talk over each other, interrupt, or speak simultaneously.
Be exhaustive and do not miss any overlapping speech.

**Instructions:**
- Provide **start and end timestamps** for each crosstalk event
- List the names of all speakers involved
- Provide the overlapping lines **exactly as spoken**
- Include a brief note describing the nature of the interruption or overlap

### Transcript:
{raw_transcript}
"""

In [54]:
# TODO: Design the Pydantic model for the cross-talk response

In [55]:
# One cross-talk occurrence: when it happened, who was involved, what each
# speaker said, and an optional note. Timestamps are plain strings; no format
# is enforced by this model.
# NOTE(review): nothing ties the keys of `lines` to the entries in `speakers`;
# consumers should not assume every listed speaker has a line.
class CrossTalkAnalysis(BaseModel):
    start_time: str = Field(..., description="Start timestamp of the crosstalk")
    end_time: str = Field(..., description="End timestamp of the crosstalk")
    speakers: List[str] = Field(..., description="List of speaker names involved in the crosstalk")
    lines: Dict[str, str] = Field(..., description="Mapping of speaker name to their overlapping spoken line")
    notes: Optional[str] = Field(None, description="Brief note describing nature of crosstalk")

# Top-level response schema for cross-talk detection: a required list of
# CrossTalkAnalysis items.
# NOTE(review): despite the singular name, this model is the collection; each
# individual event is a CrossTalkAnalysis.
class CrossTalkEvent(BaseModel):
    events: List[CrossTalkAnalysis] = Field(..., description="List of all crosstalk")

In [56]:
# Cross-talk detection request; the reply is parsed as a CrossTalkEvent (a list
# of events) and the raw completion is kept alongside it.
crosstalk_response, raw_crosstalk_completion = (
    client.chat.completions.create_with_completion(
        messages=[
            dict(role="system", content=SYSTEM_PROMPT),
            dict(
                role="user",
                content=USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT),
            ),
        ],
        response_model=CrossTalkEvent,
        model="qwen-3",
        temperature=0.4,
    )
)

In [57]:
len(crosstalk_response.events)

4

In [58]:
crosstalk_response.events

[CrossTalkAnalysis(start_time='00:04:05', end_time='00:04:08', speakers=['Speaker A', 'Speaker C'], lines={'Speaker A': 'So do you revise documents provided to you by. By those environmental consultants?', 'Speaker C': 'Objection. You may answer.'}, notes='Speaker C objects while Speaker A is asking the question, creating overlapping speech.'),
 CrossTalkAnalysis(start_time='00:04:24', end_time='00:04:25', speakers=['Speaker A', 'Speaker B'], lines={'Speaker A': 'Do you discuss those documents with the.', 'Speaker B': 'And I will make edits.'}, notes='Simultaneous speaking; Speaker B begins responding before Speaker A completes the question.'),
 CrossTalkAnalysis(start_time='00:06:16', end_time='00:06:20', speakers=['Speaker B', 'Speaker C'], lines={'Speaker B': "If they're available.", 'Speaker C': 'The connection is a little rough here.'}, notes='Speaker C comments on technical issues while Speaker B is responding, resulting in overlapping speech.'),
 CrossTalkAnalysis(start_time='00

In [65]:
def pretty_print(self):
    """Print a human-readable report for one cross-talk event.

    Args:
        self: Object exposing ``start_time``, ``end_time``, ``speakers``
            (iterable of speaker-name strings), ``lines`` (mapping of speaker
            name to spoken line) and ``notes`` (string or ``None``), e.g. a
            ``CrossTalkAnalysis`` instance.

    Returns:
        None. The report is written to standard output.

    A speaker listed in ``speakers`` without an entry in ``lines`` is shown
    with an empty quote. The ``Note:`` line is omitted when ``notes`` is
    ``None``, so the report never shows a literal "None" for a missing note.
    """
    print(f"Crosstalk detected from {self.start_time} to {self.end_time}")
    print(f"Speakers involved: {', '.join(self.speakers)}")
    print("Overlapping lines:")
    for speaker in self.speakers:
        line = self.lines.get(speaker, "")
        print(f"  {speaker}: \"{line}\"")
    # `notes` is optional on the schema; skip the line rather than print "None".
    if self.notes is not None:
        print(f"Note: {self.notes}")

In [66]:
# Walk the detected events in order, numbering them from 1 for display, and
# print a separator after each report.
for position, event in enumerate(crosstalk_response.events, start=1):
    print(f"Crosstalk {position}")
    pretty_print(event)
    print("----")


Crosstalk 1
Crosstalk detected from 00:04:05 to 00:04:08
Speakers involved: Speaker A, Speaker C
Overlapping lines:
  Speaker A: "So do you revise documents provided to you by. By those environmental consultants?"
  Speaker C: "Objection. You may answer."
Note: Speaker C objects while Speaker A is asking the question, creating overlapping speech.
----
Crosstalk 2
Crosstalk detected from 00:04:24 to 00:04:25
Speakers involved: Speaker A, Speaker B
Overlapping lines:
  Speaker A: "Do you discuss those documents with the."
  Speaker B: "And I will make edits."
Note: Simultaneous speaking; Speaker B begins responding before Speaker A completes the question.
----
Crosstalk 3
Crosstalk detected from 00:06:16 to 00:06:20
Speakers involved: Speaker B, Speaker C
Overlapping lines:
  Speaker B: "If they're available."
  Speaker C: "The connection is a little rough here."
Note: Speaker C comments on technical issues while Speaker B is responding, resulting in overlapping speech.
----
Crosstalk 4


In [61]:
# results = []

# for experiment in range(0,5):
#     crosstalk_response, raw_crosstalk_completion = client.chat.completions.create_with_completion(
#     model="qwen-3",
#     response_model=CrossTalkEvent,
#     messages=[
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {"role": "user", "content": USER_PROMPT_TEMPLATE.format(raw_transcript=RAW_TRANSCRIPT)},
#     ],
#     temperature=0.6,
#     )
#     results.append(len(crosstalk_response.events))

In [None]:
# # high temperature - faster, but unstable results
# # low temperature - slower, more stable results
# results 
