In [197]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import chromadb

In [682]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image

  from tensorflow.tsl.python.lib.core import pywrap_ml_dtypes


### Data Loading

In [364]:
reliable_articles = pd.read_csv("Data/Politifact_Data/CSV/politifact_articles.csv")
reliable_articles = reliable_articles.drop(columns='Unnamed: 0')
reliable_articles.rename(columns={'Statement': 'Title'}, inplace=True)
reliable_articles = reliable_articles.dropna()

In [365]:
pf_true_statements = pd.read_csv("Data/Politifact_Data/CSV/true_df.csv")
pf_true_statements = pf_true_statements.drop(columns='Unnamed: 0')
pf_true_statements = pf_true_statements.dropna()

In [366]:
def filter_short_strings(text):
    return '' if len(text) < 7 else text

In [440]:
factcheckorg_articles = pd.read_csv("Data/FactCheckOrg/factcheckorg_webscrape_200pages.csv")
factcheckorg_articles['List_data'].fillna('', inplace=True)
factcheckorg_articles['List_data'] = factcheckorg_articles['List_data'].apply(filter_short_strings)
factcheckorg_articles = factcheckorg_articles.dropna(subset=['Text'])
factcheckorg_articles['Text'] = factcheckorg_articles['Text'].str.replace('Para leer en español, vea esta traducción de Google Translate.', '')
factcheckorg_articles['Text'] = factcheckorg_articles['Text'].str.replace(r' Editor’s Note:.*$', '', regex=True)
factcheckorg_articles = factcheckorg_articles.reset_index()
factcheckorg_articles = factcheckorg_articles.drop(columns=['index'])
factcheckorg_articles['Title_and_Date'] = factcheckorg_articles['Title'] + ' , ' + factcheckorg_articles['Date']
factcheckorg_articles = factcheckorg_articles.drop(columns=['Title', 'Date'])
factcheckorg_articles

Unnamed: 0,Text,List_data,Title_and_Date
0,Here’s how the United States has fared since ...,The economy added more than 14 million jobs. T...,"Biden’s Numbers, January 2024 Update , January..."
1,Following his projected win in New Hampshire’...,Trump misleadingly claimed that “they accept D...,FactChecking Trump’s New Hampshire Victory Spe...
2,Efforts are underway in many states to disqua...,,Posts Distort History in Comparing Lincoln Wit...
3,State Rep. Nick Wilson proposed changing Kent...,,"Kentucky Lawmaker Fixes Incest Bill, But Not U..."
4,In a victory speech after decisively winning ...,When talking about terrorism and a travel ban ...,"FactChecking Trump’s Iowa Victory Speech , Jan..."
...,...,...,...
2006,"Hillary Clinton’s campaign manager, Robby Mook...",,"Clinton Campaign’s ‘Kremlin’ Deception , Augus..."
2007,"During a town hall meeting, Libertarian presid...",,"Unpacking Pot’s Impact in Colorado , August 19..."
2008,President Barack Obama credited his administra...,,"‘Record’ College Enrollment Rates? , August 19..."
2009,This week’s rundown of repeated claims include...,,"Groundhog Friday , August 19, 2016"


### Chunking and Tokenizing

In [445]:
def tokenize_into_sentences(text):
    return sent_tokenize(text)

In [446]:
def tokenize_into_chunks(text, min_words=75):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        if len(current_chunk) + len(words) < min_words:
            current_chunk.extend(words)
        else:
            if any(sentence.endswith(p) for p in ['.', '!', '?', '¡', '¿']):
                chunks.append(' '.join(current_chunk))
                current_chunk = words
            else:
                current_chunk.extend(words)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [447]:
#FactCheckOrg Article Chunking
factcheckorg_articles['chunks_text'] = factcheckorg_articles['Text'].apply(tokenize_into_chunks)
factcheckorg_articles['chunkslistdata'] = factcheckorg_articles['List_data'].apply(tokenize_into_chunks)

# Determine the maximum number of chunks across both columns
max_chunks_text = factcheckorg_articles['chunks_text'].apply(len).max()
max_chunks_list_data = factcheckorg_articles['chunkslistdata'].apply(len).max()
max_total_chunks = max(max_chunks_text, max_chunks_list_data)

# Create columns for each chunk in both 'Text' and 'List_data'
for i in range(1, max_total_chunks + 1):
    factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
    factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)

# Drop unnecessary columns
factcheckorg_articles = factcheckorg_articles.drop(columns=['chunks_text', 'chunkslistdata'])

  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factche

In [232]:
#Politifact True Statement Text Chunking
pf_true_statements['chunks'] = pf_true_statements['Text'].apply(tokenize_into_chunks)

max_chunks = pf_true_statements['chunks'].apply(len).max()

for i in range(1, max_chunks + 1):
    pf_true_statements[f'chunk_{i}'] = pf_true_statements['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)

pf_true_statements = pf_true_statements.drop(columns=['chunks'])

In [201]:
#Politifact Articles Chunking
reliable_articles['chunks'] = reliable_articles['Text'].apply(tokenize_into_chunks)

max_chunks = reliable_articles['chunks'].apply(len).max()

for i in range(1, max_chunks + 1):
    reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)

reliable_articles = reliable_articles.drop(columns=['chunks'])

  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  reliable_articles[f'chunk_{i}'] = reliable_articles['chunks'].apply(lambda x: 

In [448]:
#This is the chunked reliable politifact news articles
factcheckorg_articles.head(2)

Unnamed: 0,Text,List_data,Title_and_Date,chunk_text_1,chunklistdata1,chunk_text_2,chunklistdata2,chunk_text_3,chunklistdata3,chunk_text_4,...,chunk_text_210,chunklistdata210,chunk_text_211,chunklistdata211,chunk_text_212,chunklistdata212,chunk_text_213,chunklistdata213,chunk_text_214,chunklistdata214
0,Here’s how the United States has fared since ...,The economy added more than 14 million jobs. T...,"Biden’s Numbers, January 2024 Update , January...",Here ’ s how the United States has fared since...,The economy added more than 14 million jobs . ...,"Here , we present those and other statistical ...",Gasoline is up 29 % . Average weekly earnings ...,"Employment — The U.S. economy added 14,263,000...",The S & P 500 has increased 28.2 % . The numbe...,"There were 28,000 fewer public school teachers...",...,,,,,,,,,,
1,Following his projected win in New Hampshire’...,Trump misleadingly claimed that “they accept D...,FactChecking Trump’s New Hampshire Victory Spe...,Following his projected win in New Hampshire ’...,Trump misleadingly claimed that “ they accept ...,Trump also repeated false claims about illegal...,He falsely claimed that former U.N . Ambassado...,"“ In this state , in the Republican primary , ...",Trump claimed Democrats “ want to raise your t...,"In order to participate , voters have to be re...",...,,,,,,,,,,


In [458]:
factcheckorg_articles['chunk_text_1'][0]

'Here ’ s how the United States has fared since President Joe Biden took office three years ago : Biden , who appears to be headed for a rematch with former President Donald Trump , is going into an election year with some favorable and unfavorable numbers . Unemployment is down , and consumer confidence is rising . But overall inflation is high , and wages aren ’ t keeping pace with inflation .'

## Text Cleaning Functions

In [975]:
#cleaning text with regex to convert to more readable format
def clean_text_pre(text):
   """
   Cleans up text with specific spacing rules.

   Args:
       text: The text to be cleaned.

   Returns:
       The cleaned text.
   """

   # Remove extra spaces around punctuation marks, except for commas.
   text = re.sub(r"\s+([^,\s\w])", r"\1", text)
   text = re.sub(r"([^\s\w])\s+", r"\1", text)

   # Add a space before quotation marks, but not after.
   text = re.sub(r"([^\s])\'", r"\1 '", text)
   text = re.sub(r'"\s', r'"', text)

   # Add a space after commas.
   text = re.sub(r",", r", ", text)

   # Remove double backticks and single quotes that aren't part of the intended quotation.
   text = re.sub(r"`([^`]+)`", r"\1", text)
   text = re.sub(r"'([^']+)'", r"\1", text)

   # Replace single backticks with standard apostrophes.
   text = text.replace("`", "'")

   return text

In [978]:
#cleaning text with regex to convert to a more readable format
def clean_text_after_generation(text):
    """
    Cleans up text with specific spacing rules and other strange non-readable formats.

    Args:
        text: The text to be cleaned.

    Returns:
        The cleaned text.
    """
    # Remove \ characters.
    text = text.replace("\\", "")

    # Add a space before quotation marks, but not after.
    text = re.sub(r"([^\s])\'", r"\1 '", text)
    text = re.sub(r'"\s', r'"', text)

    # Remove double backticks and single quotes that aren't part of intended quotation.
    text = re.sub(r"`([^`]+)`", r"\1", text)
    text = re.sub(r"'([^']+)'", r"\1", text)

    # Replace single backticks with standard apostrophes.
    text = text.replace("`", "'")


    return text

### Vector Database with Chroma

In [202]:
chroma_client = chromadb.Client()

In [203]:
pf_vector_db = chroma_client.create_collection(name="pf_vector_db")

In [449]:
#FactCheckOrg Text breakdown
chunks_list = []
titles_list = []
ids_list = []
start_id = 96621

for index, row in factcheckorg_articles.iterrows():
    title = row['Title_and_Date']
    for col in factcheckorg_articles.columns:
        if col.startswith('chunk_'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                titles_list.append({"Title_and_Date": title})
                ids_list.append(f"id{start_id}")
                start_id += 1
        elif col.startswith('chunklist'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                titles_list.append({"Title_and_Date": title})
                ids_list.append(f"id{start_id}")
                start_id += 1

In [454]:
len(ids_list)

59860

In [261]:
#Politifact Claimer and Statement break down
statements_list = []
claimers_list = []
ids_list = []
start_id = 94049

for index, row in pf_true_statements.iterrows():
    claimer = row['Claimer']
    statement = row['Statement']
    
    claimers_list.append({"Claimer": claimer})
    statements_list.append(statement)
    ids_list.append(f"id{start_id}")
    start_id += 1

In [456]:
#Adding text to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    pf_vector_db.add(
        documents=chunks_list[start_size:batch_size],
        metadatas=titles_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000


In [712]:
len(chunks_list)

59860

In [457]:
pf_vector_db.count()

156480

In [709]:
db2 = Chroma.from_documents(pf_vector_db, persist_directory="./chroma_db")

AttributeError: 'tuple' object has no attribute 'page_content'

In [717]:
pf_vector_db.query(
    query_texts=['The unemployment rate has gone down since Biden took office in 2020'],
    n_results=5,
)['documents'][0]

['Unemployment — The unemployment rate plunged during Biden ’ s first 14 months , down to 3.6 % in March from 6.4 % when he took office . “ There have been only three months in the last 50 years where the unemployment rate in America is lower than it is now , ” Biden said of the most recent report .',
 'Unemployment — The unemployment rate fell from 6.3 % at the time Biden took office to 3.5 % in March — a decline of 2.8 percentage points . The current rate is exactly where it was in the months just before the pandemic . That ’ s uncommonly low .',
 'Unemployment — The unemployment rate fell from 6.4 % at the time Biden took office to 3.5 % in September — a decline of 2.9 percentage points . The current rate is exactly where it was in the months just before the pandemic .',
 'The nation ’ s overall unemployment rate has dropped sharply on Biden ’ s watch , from 6.3 % when he took office to 3.4 % in January . Although Black and Hispanic Americans continue to have higher unemployment rat

### Vertex AI and RAG chaining

In [None]:
#install following dependencies
!pip -q install langchain_experimental langchain_core
!pip -q install google-generativeai
!pip -q install google-ai-generativelanguage
!pip -q install langchain-google-genai

In [979]:
PROJECT_ID = "gen-lang-client-0321728687"
REGION = "us-central1"
vertexai.init(project=PROJECT_ID, location=REGION)

In [980]:
import os
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema.runnable import RunnableMap
from vertexai.preview import generative_models
import multiprocessing

In [992]:
from langchain.globals import set_llm_cache

In [981]:
#initializing model, applying configurations, and initializing chat
vertexai.init(project=PROJECT_ID, location=REGION)
model = generative_models.GenerativeModel("gemini-pro")
config = {"max_output_tokens": 2048, "temperature": 0.0}

safety_config = {
generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE
}

chat = model.start_chat()

In [982]:
#example full news article and chunking the article
news_article = """As former President Donald Trump petitions the courts to be held legally immune for his actions as president, he’s begun comparing presidents and police officers. In a Jan.19 Truth Social post, Trump argued that "a president of the United States must have full immunity, without which it would be impossible for him/her to properly function." He added that immunity is needed even for "events that ‘cross the line,’" though he didn’t specify what he meant. "You can’t stop police from doing the job of strong & effective crime prevention because you want to guard against the occasional ‘rogue cop’ or ‘bad apple,’" he wrote in all caps. "Sometimes you just have to live with ‘great but slightly imperfect.’" Police officers are protected against lawsuits related to their official actions, called "qualified immunity." Without "complete & total presidential immunity," Trump wrote, "the authority & decisiveness of a president of the United States will be stripped & gone forever." Trump repeated the argument at a Jan. 20 Manchester, New Hampshire, rally and the following night in Rochester, New Hampshire, shortly before the state’s first-in-the-nation Republican primary. A three-judge federal appeals panel is considering Trump’s immunity request. Regardless of that panel’s decision, the case could go before the U.S. Supreme Court. Legal experts told PolitiFact that whatever the judicial ruling, Trump’s suggestion that he’s seeking what police officers already have is flawed. "What Trump seeks goes far beyond" the protections police officers have, said Ilya Somin, a George Mason University law professor. Trump’s campaign did not answer an inquiry for this article. What is qualified immunity? The legal protection that police officers and other government officials are afforded is known as "qualified immunity." It is intended to protect officers conducting official duties not only from being held financially liable for their actions but also from being forced to face trial over those actions. But as the "qualified" denotes, this type of immunity is not all-encompassing for key reasons: It applies to civil cases, not criminal charges. "It has nothing to do with criminal liability," said Joanna C. Schwartz, a UCLA law professor. If officers are charged with a crime, as happened with the officers in the 2020 death of Minneapolis resident George Floyd, they can stand trial. In civil cases, accused officers have to invoke qualified immunity as a defense, and the judge may or may not grant them protection. The accused officer can still be pursued in a civil lawsuit if the judge decides that that officer acted incompetently or knowingly violated the law. "If an action is deemed in direct violation of constitutional rights or illegal as known and understood by a reasonable person, qualified immunity would generally not apply," said Jillian E. Snider, a retired New York City police officer and a lecturer at John Jay College of Criminal Justice. Schwartz said the qualified immunity defense "is very strong, but it is not insurmountable." What kind of immunity is Trump seeking? Trump’s lawyers have said in court that they are seeking much broader immunity than what police officers receive. Trump "seeks full immunity, not just ‘qualified’ immunity," Somin said. "And he is seeking immunity for criminal conduct, not just civil violations." During oral arguments Jan. 9 before the three-judge federal panel, one judge asked Trump’s attorney, D. John Sauer, whether the president should, hypothetically, be immune from prosecution for ordering U.S. Navy commandos to assassinate a political rival. Sauer said that unless the president had been impeached first, such a prosecution would be invalid. In his rallies and Truth Social post, Trump didn’t mention the caveat about impeachment, saying instead that presidents should have "complete & total presidential immunity.’ Even Karen M. Blum, an emeritus professor at Suffolk University Law School and self-described longtime critic of qualified immunity, said that qualified immunity has firmer legal support than what Trump seeks. "Trump’s argument that no matter what, the president should be immune from any and all liability, civil or criminal, is not supported by any constitutional jurisprudence known to me," Blum said."""
chunked_article_list = tokenize_into_chunks(news_article, 30)

In [983]:
#getting models output from input and context provided
all_response_text = []
for statement in chunked_article_list:
    input = statement
    context = pf_vector_db.query(
        query_texts=[input],
        n_results=10,
    )

#generating response with prompt template

    responses = model.generate_content(f"""Answer the question below marked inside <<<>>> in a full sentence based on the
    knowledge you already have and provide a short explanation of why you chose that score.
    If you don't have enough information use the following context to answer the question: {context}.
    <<<
    Question: How true is the following statement on a scale of 1-100? + {input}.
    >>>
   """,
        generation_config=config,
        stream=True,
        safety_settings=safety_config,
    )

#printing responses
    response_text = ""
    for response in responses:
        response_text += response.text
    response_text = response_text.replace("\n\n", ". ")
    all_response_text.append(response_text)
entire_text_string = ""
for text in all_response_text:
    entire_text_string += text
cleaned_text = clean_text(entire_text_string)

In [984]:
unratable_sentences = []
rated_sentences = []

for response in all_response_text:
    if "article does not" in response.lower() or "context does not" in response.lower():
        unratable_sentences.append(response)
    else:
        rated_sentences.append(response)

In [985]:
not_enough_context = len(unratable_sentences)
enough_context = len(rated_sentences)
all_statements_count = len(all_response_text)

In [986]:
#final scoring model and output generation
scoring_model = generative_models.GenerativeModel("gemini-pro")
config = {"max_output_tokens": 2048, "temperature": 0.0}

safety_config = {
generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE
}

chat = model.start_chat()

In [None]:
final_responses = model.generate_content(f"""Each entry in the list of statements provided below inside <<<>>> begins with a number
that explains how truthful a statement is and is followed by a text explanation to why that score was chosen. I need you to average up
all the numbers that are at the start of each list entry and then return that average number, followed by a short summary of that score
that is created from specific examples in the text following each number entry. 
Also, start your response by saying that "{not_enough_context} out of {all_statements_count} statements in the text could not be rated. 
The following score and explanation is based on the {enough_context} statements that could be rated
<<<
{rated_sentences}
>>>""",
    generation_config=config,
    stream=True,
    safety_settings=safety_config,
)

final_response_text = ""
for response in final_responses:
    final_response_text += response.text
final_response_text = final_response_text.replace("\n\n", ". ")
final_response_text