In [1]:
from langchain_openai import ChatOpenAI

import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Initialize the LLM
llm = ChatOpenAI( api_key=api_key, temperature=0.1) #model_name='gpt-4o',


In [2]:

query = "What is 3 * 12?"

llm.invoke(query)

AIMessage(content='3 * 12 is equal to 36.', response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 15, 'total_tokens': 25}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-6c3c3814-a306-4f2c-bb68-8099861eea66-0', usage_metadata={'input_tokens': 15, 'output_tokens': 10, 'total_tokens': 25})

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
import datetime
import os

class Claims:
    def __init__(self, speaker:str, claim:str, timestamp:str, measurable:bool, analysis:str)->None:
        self.speaker = speaker
        self.claim = claim
        self.measurable = measurable
        self.analysis = analysis
        self.timestamp = timestamp

# Define Pydantic models
class ClaimModel(BaseModel):
    claim: str
    measurable: bool
    analysis: str

class ClaimList(BaseModel):
    claims: List[ClaimModel]

# Create the output parser
parser = PydanticOutputParser(pydantic_object=ClaimList)

# Load the document
file_path = "data/Kamala_Harris_holds_first_campaign_rally_FULL_SPEECH_2024-07-23_190126.txt"
loader = TextLoader(file_path)
document = loader.load()

# Create an LLM
# llm = ChatOpenAI(temperature=0)

# Create a prompt to extract speaker from filename
extract_speaker_template = ChatPromptTemplate.from_template(
    "Extract the speaker's name from this filename: {filename}. Only return the name, nothing else."
)

# Extract speaker using LLM
speaker_chain = extract_speaker_template | llm
speaker_result = speaker_chain.invoke({"filename": os.path.basename(file_path)})
speaker = speaker_result.content.strip()

# Get current timestamp
timestamp = datetime.datetime.now().isoformat()

# Create the prompt templates for claim extraction and analysis
extract_claims_template = ChatPromptTemplate.from_template(
    "Extract claims from the following text:\n\n{text}\n\n list each claim as a separate bullet point. Claims:"
)

analyze_claims_template = ChatPromptTemplate.from_template(
    """For each claim, determine if it is measurable or not. If measurable, explain how it could be quantifiably measured or validated using real-world data. If not measurable, explain why it's too vague or subjective to measure. Consider specific metrics, data sources, or methods that could be used for validation.

    Format your response as a list of claims with their analysis, following this structure:
    {format_instructions}

    Claims to analyze:
    {claims}
    """
)

# Create the LCEL chain for claim extraction and analysis
chain = (
    {"text": lambda x: x['text']}
    | extract_claims_template
    | llm
    | {"claims": lambda x: x.content}
    | analyze_claims_template.partial(format_instructions=parser.get_format_instructions())
    | llm
    | parser
)

# Run the chain
result = chain.invoke({"text": document[0].page_content})

# Convert the results to Claims objects
claims_objects = [
    Claims(
        speaker=speaker,
        claim=claim.claim,
        timestamp=timestamp,
        measurable=claim.measurable,
        analysis=claim.analysis
    )
    for claim in result.claims
]
print("nunber of claims: ", len(claims_objects))
# Print the results
for claim in claims_objects:
    print(f"Speaker: {claim.speaker}")
    print(f"Claim: {claim.claim}")
    print(f"Timestamp: {claim.timestamp}")
    print(f"Measurable: {'Yes' if claim.measurable else 'No'}")
    print(f"Analysis: {claim.analysis}")
    print()
# 

nunber of claims:  8
Speaker: Kamala Harris
Claim: Joe Biden's legacy of accomplishment is unmatched in modern history
Timestamp: 2024-07-28T21:58:10.266963
Measurable: No
Analysis: This claim is subjective and cannot be objectively measured as it depends on individual interpretation of what constitutes accomplishment and how it compares to other historical figures.

Speaker: Kamala Harris
Claim: Kamala Harris has earned enough delegates to secure the Democratic nomination
Timestamp: 2024-07-28T21:58:10.266963
Measurable: Yes
Analysis: This claim is measurable by tracking the number of pledged delegates Kamala Harris has secured in the Democratic primary elections. This data is publicly available and can be quantified to determine if she has reached the threshold for nomination.

Speaker: Kamala Harris
Claim: Kamala Harris will continue to unite the party to win in November
Timestamp: 2024-07-28T21:58:10.266963
Measurable: No
Analysis: This claim is subjective and speculative as it per

In [4]:
for claim in claims_objects:
    if claim.measurable:
        print(f"Speaker: {claim.speaker}")
        print(f"Claim: {claim.claim}")
        print(f"Timestamp: {claim.timestamp}")
        print(f"Analysis: {claim.analysis}")
        print()

Speaker: Kamala Harris
Claim: Kamala Harris has earned enough delegates to secure the Democratic nomination
Timestamp: 2024-07-28T21:58:10.266963
Analysis: This claim is measurable by tracking the number of pledged delegates Kamala Harris has secured in the Democratic primary elections. This data is publicly available and can be quantified to determine if she has reached the threshold for nomination.

Speaker: Kamala Harris
Claim: Kamala Harris has a record of taking on perpetrators and holding them accountable
Timestamp: 2024-07-28T21:58:10.266963
Analysis: This claim is measurable by examining Kamala Harris's past cases as a prosecutor and her actions as a senator. The number of cases prosecuted, convictions secured, and policies implemented can be quantified to assess her record in holding perpetrators accountable.

Speaker: Kamala Harris
Claim: Kamala Harris will fight for reproductive freedom and sign laws to restore reproductive freedoms as president
Timestamp: 2024-07-28T21:58:1

In [5]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

SQLALCHEMY_DATABASE_URL = "sqlite:///./test.db"

engine = create_engine(SQLALCHEMY_DATABASE_URL, echo=True)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

db = SessionLocal()
try:
    yield db
finally:
    db.close()
Base = declarative_base()
    for claim in claims:  
        # db_claim = Claim(**claim.dict(), video_id=video_id) 
        print(claim.__str__())
        db_claim = Claim(speaker=claim.speaker,
            claim=claim.claim,
            timestamp=claim.timestamp,
            measurable=claim.measurable,  
            analysis=claim.analysis, 
            quote=claim.quote,video_id=video_id)
        db.add(db_claim)
         
    db.commit() 


IndentationError: unexpected indent (258980941.py, line 16)

In [None]:
session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

In [None]:
## csv approch
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_community.document_loaders import CSVLoader
# from langchain_openai import ChatOpenAI
# from langchain.output_parsers import PydanticOutputParser
# from pydantic import BaseModel, Field
# from typing import List
# import datetime
# import os

# class Claims:
#     def __init__(self, speaker:str, claim:str, timestamp:str, measurable:bool, analysis:str, source_timestamp:str)->None:
#         self.speaker = speaker
#         self.claim = claim
#         self.measurable = measurable
#         self.analysis = analysis
#         self.timestamp = timestamp
#         self.source_timestamp = source_timestamp

# # Define Pydantic models
# class ClaimModel(BaseModel):
#     claim: str
#     measurable: bool
#     analysis: str
#     source_timestamp: str

# class ClaimList(BaseModel):
#     claims: List[ClaimModel]

# # Create the output parser
# parser = PydanticOutputParser(pydantic_object=ClaimList)

# # Load the document
# file_path = "data/NEW_Trump_Pledges_To_Shut_Down_Department_Of_Education_At_Faith__Freedom_Event__FULL_SPEECH_2024-06-22T203001Z.csv"
# loader = CSVLoader(file_path)
# document = loader.load()


# # Create a prompt to extract speaker from filename
# extract_speaker_template = ChatPromptTemplate.from_template(
#     "Extract the speaker's name from this filename: {filename}. Only return the name, nothing else."
# )

# # Extract speaker using LLM
# speaker_chain = extract_speaker_template | llm
# speaker_result = speaker_chain.invoke({"filename": os.path.basename(file_path)})
# speaker = speaker_result.content.strip()

# # Get current timestamp
# timestamp = datetime.datetime.now().isoformat()

# # Create the prompt templates for claim extraction and analysis
# extract_claims_template = ChatPromptTemplate.from_template(
#     "Extract the most impactful claims made from the following text:\n\n{text}\n\n List each claim as a separate bullet point. After each claim, include in parentheses () the timestamp from the text that was used to derive the claim. Claims:"
# )

# analyze_claims_template = ChatPromptTemplate.from_template(
#     """For each claim, determine if it is measurable or not. If measurable, explain how it could be quantifiably measured or validated using real-world data. If not measurable, explain why it's too vague or subjective to measure. Consider specific metrics, data sources, or methods that could be used for validation.

#     Format your response as a list of claims with their analysis, following this structure:
#     {format_instructions}

#     Claims to analyze:
#     {claims}
#     """
# )

# # Create the LCEL chain for claim extraction and analysis
# chain = (
#     {"text": lambda x: x['text']}
#     | extract_claims_template
#     | llm
#     | (lambda x: {"claims": [{"claim": c.split("(")[0], "source_timestamp": c.split("(")[1].split(")")[0]} for c in x.content.split("\n") if "(" in c and ")" in c]})
#     | analyze_claims_template.partial(format_instructions=parser.get_format_instructions())
#     | llm
#     | parser
# )

# # Run the chain
# result = chain.invoke({"text": document[0].page_content})

# # Convert the results to Claims objects
# claims_objects = [
#     Claims(
#         speaker=speaker,
#         claim=claim.claim,
#         timestamp=timestamp,
#         measurable=claim.measurable,
#         analysis=claim.analysis,
#         source_timestamp=claim.source_timestamp
#     )
#     for claim in result.claims
# ]

# print("number of claims: ", len(claims_objects))
# # Print the results
# for claim in claims_objects:
#     print(f"Speaker: {claim.speaker}")
#     print(f"Claim: {claim.claim}")
#     print(f"Source Timestamp: {claim.source_timestamp}")
#     print(f"Timestamp: {claim.timestamp}")
#     print(f"Measurable: {'Yes' if claim.measurable else 'No'}")
#     print(f"Analysis: {claim.analysis}")
#     print()


In [None]:
## quote approch
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_community.document_loaders import TextLoader
# from langchain_openai import ChatOpenAI
# from langchain.output_parsers import PydanticOutputParser
# from pydantic import BaseModel, Field
# from typing import List
# import datetime
# import os

# class Claims:
#     def __init__(self, speaker:str, claim:str, timestamp:str, measurable:bool, analysis:str,quote:str)->None:
#         self.speaker = speaker
#         self.claim = claim
#         self.measurable = measurable
#         self.analysis = analysis
#         self.timestamp = timestamp
#         self.quote = quote

# # Define Pydantic models
# class ClaimModel(BaseModel):
#     claim: str
#     measurable: bool
#     analysis: str
#     quote: str

# class ClaimList(BaseModel):
#     claims: List[ClaimModel]

# # Create the output parser
# parser = PydanticOutputParser(pydantic_object=ClaimList)

# # Load the document
# file_path = "data/President_Bidens_State_of_the_Union_Address_2024-03-08T034913Z.txt"
# loader = TextLoader(file_path)
# document = loader.load()


# # Create a prompt to extract speaker from filename
# extract_speaker_template = ChatPromptTemplate.from_template(
#     "Extract the speaker's name from this filename: {filename}. Only return the name, nothing else."
# )

# # Extract speaker using LLM
# speaker_chain = extract_speaker_template | llm
# speaker_result = speaker_chain.invoke({"filename": os.path.basename(file_path)})
# speaker = speaker_result.content.strip()

# # Get current timestamp
# timestamp = datetime.datetime.now().isoformat()

# # Create the prompt templates for claim extraction and analysis
# extract_claims_template = ChatPromptTemplate.from_template(
#     "Extract the most impactful claims made from the following text:\n\n{text}\n\n List each claim as a separate bullet point. After each claim, include in parentheses () the partial quote from the text that was used to derive the claim. Claims:"
# )

# analyze_claims_template = ChatPromptTemplate.from_template(
#     """For each claim, determine if it is measurable or not. If measurable, explain how it could be quantifiably measured or validated using real-world data. If not measurable, explain why it's too vague or subjective to measure. Consider specific metrics, data sources, or methods that could be used for validation.

#     Format your response as a list of claims with their analysis, following this structure:
#     {format_instructions}

#     Claims to analyze:
#     {claims}
#     """
# )

# # Create the LCEL chain for claim extraction and analysis
# chain = (
#     {"text": lambda x: x['text']}
#     | extract_claims_template
#     | llm
#     | (lambda x: {"claims": [{"claim": c.split("(")[0], "quote": c.split("(")[1].split(")")[0]} for c in x.content.split("\n") if "(" in c and ")" in c]})
#     | analyze_claims_template.partial(format_instructions=parser.get_format_instructions())
#     | llm
#     | parser
# )

# # Run the chain
# result = chain.invoke({"text": document[0].page_content})

# # Convert the results to Claims objects
# claims_objects = [
#     Claims(
#         speaker=speaker,
#         claim=claim.claim,
#         timestamp=timestamp,
#         measurable=claim.measurable,
#         analysis=claim.analysis,
#         quote=claim.quote
#     )
#     for claim in result.claims
# ]

# print("nunber of claims: ", len(claims_objects))
# # Print the results
# for claim in claims_objects:
#     print(f"Speaker: {claim.speaker}")
#     print(f"Claim: {claim.claim}")
#     print(f"Quote: {claim.quote}")
#     print(f"Timestamp: {claim.timestamp}")
#     print(f"Measurable: {'Yes' if claim.measurable else 'No'}")
#     print(f"Analysis: {claim.analysis}")
#     print()


In [None]:
## chracther index approch
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_community.document_loaders import TextLoader
# from langchain_openai import ChatOpenAI
# from langchain.output_parsers import PydanticOutputParser
# from pydantic import BaseModel, Field
# from typing import List
# import datetime
# import os

# # Import the Claims class
# # from models.claims import Claims

# class Claims:
#     def __init__(self, speaker:str, claim:str, timestamp:str, measurable:bool, analysis:str, char_indices:tuple)->None:
#         self.speaker = speaker
#         self.claim = claim
#         self.measurable = measurable
#         self.analysis = analysis
#         self.timestamp = timestamp
#         self.char_indices = char_indices

# # Define a Pydantic model that matches the Claims class
# class ClaimModel(BaseModel):
#     speaker: str
#     claim: str
#     timestamp: str
#     measurable: bool
#     analysis: str
#     char_indices: str


# class ClaimList(BaseModel):
#     claims: List[ClaimModel]

# # Create the output parser
# parser = PydanticOutputParser(pydantic_object=ClaimList)

# # Load the document
# file_path = "data/President_Bidens_State_of_the_Union_Address_2024-03-08T034913Z.txt"
# loader = TextLoader(file_path)
# document = loader.load()

# # Get current timestamp
# timestamp = datetime.datetime.now().isoformat()

# # Create a prompt to extract speaker from filename
# extract_speaker_template = ChatPromptTemplate.from_template(
#     "Extract the speaker's name from this filename: {filename}. Only return the name, nothing else."
# )

# # Extract speaker using LLM
# speaker_chain = extract_speaker_template | llm
# speaker_result = speaker_chain.invoke({"filename": os.path.basename(file_path)})
# speaker = speaker_result.content.strip()

# # Create the prompt templates
# extract_claims_template = ChatPromptTemplate.from_template(
#     "Extract claims from the following text:\n\n{text}\n\n For each claim, list it as a separate bullet point and include the starting and ending character indices in parentheses. Claims:"
# )


# analyze_claims_template = ChatPromptTemplate.from_template(
#     """For each claim, determine if it is measurable or not. If measurable, explain how it could be quantifiably measured or validated using real-world data. If not measurable, explain why it's too vague or subjective to measure. Consider specific metrics, data sources, or methods that could be used for validation.

#     Format your response as a list of claims with their analysis, following this structure:
#     {format_instructions}

#     Claims to analyze:
#     {claims}
#     """
# )


# # Create the LCEL chain
# chain = (
#     {"text": lambda x: x['text']}
#     | extract_claims_template
#     | llm
#     | (lambda x: {"claims": [{"claim": c.split("(")[0], "char_indices": c.split("(")[1].split(")")[0]} for c in x.content.split("\n") if "(" in c and ")" in c]})
#     | analyze_claims_template.partial(format_instructions=parser.get_format_instructions())
#     | llm
#     | parser
# )


# # Run the chain
# result = chain.invoke({"text": document[0].page_content})

# # Convert the results to Claims objects
# claims_objects = [
#     Claims(
#         speaker=speaker,
#         claim=claim.claim,
#         timestamp=timestamp,
#         measurable=claim.measurable,
#         analysis=claim.analysis,
#         char_indices=claim.char_indices
#     )
#     for claim in result.claims
# ]


# # Print the results
# for claim in claims_objects:
#     print(f"Speaker: {claim.speaker}")
#     print(f"Claim: {claim.claim}")
#     print(f"Timestamp: {claim.timestamp}")
#     print(f"Measurable: {'Yes' if claim.measurable else 'No'}")
#     print(f"Analysis: {claim.analysis}")
#     print(f"char_indices:{claim.char_indices}")
