In [111]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import chromadb
from googleapiclient import discovery
import json
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image
import re

## Necessary Functions

In [2]:
def filter_short_strings(text):
    return '' if len(text) < 7 else text

In [3]:
def tokenize_into_sentences(text):
    return sent_tokenize(text)

In [4]:
def tokenize_into_chunks(text, min_words=75):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        if len(current_chunk) + len(words) < min_words:
            current_chunk.extend(words)
        else:
            if any(sentence.endswith(p) for p in ['.', '!', '?', '¡', '¿']):
                chunks.append(' '.join(current_chunk))
                current_chunk = words
            else:
                current_chunk.extend(words)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [95]:
#cleaning text with regex to convert to more readable format
def clean_text(text):
   """
   Cleans up text with specific spacing rules.

   Args:
       text: The text to be cleaned.

   Returns:
       The cleaned text.
   """

   # Remove extra spaces around punctuation marks, except for commas.
   text = re.sub(r"\s+([^,\s\w])", r"\1", text)
   text = re.sub(r"([^\s\w])\s+", r"\1", text)

   # Add a space before quotation marks, but not after.
   text = re.sub(r"([^\s])\'", r"\1 '", text)
   text = re.sub(r'"\s', r'"', text)

   # Add a space after commas.
   text = re.sub(r",", r", ", text)

   # Remove double backticks and single quotes that aren't part of the intended quotation.
   text = re.sub(r"`([^`]+)`", r"\1", text)
   text = re.sub(r"'([^']+)'", r"\1", text)

   # Replace single backticks with standard apostrophes.
   text = text.replace("`", "'")

   return text

## Datasets

In [27]:
#Politifact articles
pf_articles = pd.read_csv("Data/Politifact_Data/CSV/politifact_articles.csv")
pf_articles = pf_articles.drop(columns='Unnamed: 0')
pf_articles.rename(columns={'Statement': 'Title'}, inplace=True)
pf_articles = pf_articles.dropna()

In [41]:
#Politifact truth datasets
pf_statements = pd.read_csv("Data/Politifact_Data/CSV/politifact_truthometer_df.csv")
pf_statements = pf_statements.drop(columns='Unnamed: 0')
pf_statements = pf_statements.drop(columns='Unnamed: 0.1')
pf_statements = pf_statements.dropna()
pf_statements_full = pf_statements
pf_statements = pf_statements.sample(frac=0.4, random_state=42)

In [29]:
factcheckorg_articles = pd.read_csv("Data/FactCheckOrg/factcheckorg_webscrape_200pages.csv")
factcheckorg_articles['List_data'].fillna('', inplace=True)
factcheckorg_articles['List_data'] = factcheckorg_articles['List_data'].apply(filter_short_strings)
factcheckorg_articles = factcheckorg_articles.dropna(subset=['Text'])
factcheckorg_articles['Text'] = factcheckorg_articles['Text'].str.replace('Para leer en español, vea esta traducción de Google Translate.', '')
factcheckorg_articles['Text'] = factcheckorg_articles['Text'].str.replace(r' Editor’s Note:.*$', '', regex=True)
factcheckorg_articles = factcheckorg_articles.reset_index()
factcheckorg_articles = factcheckorg_articles.drop(columns=['index'])
factcheckorg_articles['Title_and_Date'] = factcheckorg_articles['Title'] + ' , ' + factcheckorg_articles['Date']
factcheckorg_articles = factcheckorg_articles.drop(columns=['Title', 'Date'])

In [55]:
sciencefeedbackorg_articles = pd.read_csv("Data/ScienceFeedbackOrg/ScienceFeedbackOrg.csv")
sciencefeedbackorg_articles = sciencefeedbackorg_articles.drop(columns='Unnamed: 0')

In [68]:
scicheckorg_articles = pd.read_csv("Data/FactCheckOrg/scicheck_data.csv")
scicheckorg_articles['Title_and_Date'] = scicheckorg_articles['Title'] + ' , ' + scicheckorg_articles['Date']
scicheckorg_articles = scicheckorg_articles.drop(columns=['Title', 'Date'])
scicheckorg_articles.dropna(inplace=True)

## Chunking

In [30]:
#FactCheckOrg Article Chunking
factcheckorg_articles['chunks_text'] = factcheckorg_articles['Text'].apply(tokenize_into_chunks)
factcheckorg_articles['chunkslistdata'] = factcheckorg_articles['List_data'].apply(tokenize_into_chunks)

# Determine the maximum number of chunks across both columns
max_chunks_text = factcheckorg_articles['chunks_text'].apply(len).max()
max_chunks_list_data = factcheckorg_articles['chunkslistdata'].apply(len).max()
max_total_chunks = max(max_chunks_text, max_chunks_list_data)

# Create columns for each chunk in both 'Text' and 'List_data'
for i in range(1, max_total_chunks + 1):
    factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
    factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)

# Drop unnecessary columns
factcheckorg_articles = factcheckorg_articles.drop(columns=['chunks_text', 'chunkslistdata', 'Text', 'List_data'])

  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factcheckorg_articles['chunkslistdata'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunk_text_{i}'] = factcheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  factcheckorg_articles[f'chunklistdata{i}'] = factche

In [31]:
#Politifact Statement Text Chunking
pf_statements['chunks'] = pf_statements['Text'].apply(tokenize_into_chunks)

max_chunks = pf_statements['chunks'].apply(len).max()

for i in range(1, max_chunks + 1):
    pf_statements[f'chunk_{i}'] = pf_statements['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)

pf_statements = pf_statements.drop(columns=['chunks', 'Tldr_text_statements', 'Text'])

In [32]:
#Politifact Articles Chunking
pf_articles['chunks'] = pf_articles['Text'].apply(tokenize_into_chunks)

max_chunks = pf_articles['chunks'].apply(len).max()

for i in range(1, max_chunks + 1):
    pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)

pf_articles = pf_articles.drop(columns=['chunks', 'Tldr_text_statements', 'Text'])

  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  pf_articles[f'chunk_{i}'] = pf_articles['chunks'].apply(lambda x: x[i -

In [71]:
#SciCheckOrg Articles Chunking
scicheckorg_articles['chunks_text'] = scicheckorg_articles['Text'].apply(tokenize_into_chunks)

# Determine the maximum number of chunks across both columns
max_chunks_text = scicheckorg_articles['chunks_text'].apply(len).max()

# Create columns for each chunk in both 'Text' and 'List_data'
for i in range(1, max_chunks_text + 1):
    scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)

# Drop unnecessary columns
scicheckorg_articles = scicheckorg_articles.drop(columns=['chunks_text', 'Text'])

  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].apply(lambda x: x[i - 1] if len(x) >= i else None)
  scicheckorg_articles[f'chunk_text_{i}'] = scicheckorg_articles['chunks_text'].app

## Vector Database

In [11]:
chroma_client = chromadb.Client()

In [12]:
RAG_CONTEXT_VDB = chroma_client.create_collection(name="RAG_CONTEXT_VDB")

In [42]:
RAG_STATEMENTS_VDB = chroma_client.create_collection(name="RAG_STATEMENTS_VDB")

In [33]:
#Adding pf statement justifications to Context VDB
ids_list = []
metadata_list = []
chunks_list = []
start_id = RAG_CONTEXT_VDB.count() + 1

for index, row in pf_statements.iterrows():
    statement = row['Statement']
    claimer = row['Claimer']
    for col in pf_statements.columns:
        if col.startswith('chunk_'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                metadata_list.append({"Statement": statement, "Context": "Yes", "Claimer": claimer})
                ids_list.append(f"id{start_id}")
                start_id += 1

In [34]:
#Adding pf truth-o-meter justifications to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    RAG_CONTEXT_VDB.add(
        documents=chunks_list[start_size:batch_size],
        metadatas=metadata_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000


In [43]:
#Adding politifact truth-o-meter statements to Statements VDB
statements_list = []
ids_list = []
metadata_list = []
start_id = RAG_STATEMENTS_VDB.count() + 1

for index, row in pf_statements_full.iterrows():
    truth_value = row['Truth_value']
    claimer = row['Claimer']
    statement = row['Statement']

    metadata_list.append({"Statements truthfulness":truth_value,"Claimer": claimer})
    statements_list.append(statement)
    
    ids_list.append(f"id{start_id}")
    start_id += 1

In [47]:
#Adding pf truth-o-meter statements to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    RAG_STATEMENTS_VDB.add(
        documents=statements_list[start_size:batch_size],
        metadatas=metadata_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000
10000
15000
20000
25000


ValueError: Expected IDs to be a non-empty list, got []

In [48]:
#Adding factcheck.org data to Context VDB
chunks_list = []
titles_list = []
ids_list = []
start_id = RAG_CONTEXT_VDB.count() + 1

for index, row in factcheckorg_articles.iterrows():
    title = row['Title_and_Date']
    for col in factcheckorg_articles.columns:
        if col.startswith('chunk_'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                titles_list.append({"Title_and_Date": title, "Context": "Yes"})
                ids_list.append(f"id{start_id}")
                start_id += 1
        elif col.startswith('chunklist'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                titles_list.append({"Title_and_Date": title, "Context": "Yes"})
                ids_list.append(f"id{start_id}")
                start_id += 1

In [49]:
#Adding factcheckorg text to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    RAG_CONTEXT_VDB.add(
        documents=chunks_list[start_size:batch_size],
        metadatas=titles_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000


In [74]:
#adding SciCheckOrg articles to Context VDB
chunks_list = []
titles_list = []
ids_list = []
start_id = RAG_CONTEXT_VDB.count() + 1

for index, row in scicheckorg_articles.iterrows():
    title = row['Title_and_Date']
    for col in scicheckorg_articles.columns:
        if col.startswith('chunk_'):
            chunk = row[col]
            if chunk is not None:
                chunks_list.append(chunk)
                titles_list.append({"Title_and_Date": title, "Context": "Yes"})
                ids_list.append(f"id{start_id}")
                start_id += 1

In [79]:
#Adding scicheckorg text to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    RAG_CONTEXT_VDB.add(
        documents=chunks_list[start_size:batch_size],
        metadatas=titles_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000
10000
15000
20000
25000


In [92]:
#adding ScienceFeedbackOrg statements to Statements VDB
statements_list = []
ids_list = []
metadata_list = []
start_id = RAG_STATEMENTS_VDB.count() + 1

for index, row in sciencefeedbackorg_articles.iterrows():
    truth_value = row['label']
    statement = row['claim']

    metadata_list.append({"Statements truthfulness":truth_value})
    statements_list.append(statement)
    
    ids_list.append(f"id{start_id}")
    start_id += 1

In [93]:
#Adding pf sciencefeedback statements to vector database in batches of 5000 (max batch size is just over 5000)
start_size = 0
batch_size_increment = 5000
batch_size = 5000
for i in range(((len(chunks_list)//batch_size)+1)):
    RAG_STATEMENTS_VDB.add(
        documents=statements_list[start_size:batch_size],
        metadatas=metadata_list[start_size:batch_size],
        ids=ids_list[start_size:batch_size])
    start_size = start_size + batch_size_increment
    batch_size = batch_size + batch_size_increment
    print(start_size)

5000


ValueError: Expected IDs to be a non-empty list, got []

## FULL GEN AI MODEL

In [164]:
import os
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema.runnable import RunnableMap
from vertexai.preview import generative_models
import multiprocessing

#### Perspective API Setup

In [165]:
PERSPECTIVE_API_KEY = 'AIzaSyCElMgVeT2_ng6hSnJMNHXt4t78fOv8J9U'

In [166]:
#thresholds of output
attributeThresholds = {
    'INSULT': 0.8,
    'TOXICITY': 0.8,
    'THREAT': 0.5,
    'SEXUALLY_EXPLICIT': 0.5,
    'PROFANITY': 0.8
}
requestedAttributes = {}
for key in attributeThresholds:
    requestedAttributes[key] = {}

# Liar Liar Dataset Testing

In [372]:
liar_liar_plus = pd.read_csv("Data/Liar_plus/train.tsv", delimiter='\t', header=None)
liar_liar_plus = liar_liar_plus[[3, 2]]
liar_liar_plus.dropna(inplace=True)
llp_statements = liar_liar_plus[3]
llp_labels = liar_liar_plus[2]

In [375]:
llp_statements

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
10237    There are a larger number of shark attacks in ...
10238    Democrats have now become the party of the [At...
10239    Says an alternative to Social Security that op...
10240    On lifting the U.S. Cuban embargo and allowing...
10241    The Department of Veterans Affairs has a manua...
Name: 3, Length: 10240, dtype: object

In [373]:
label_prediction = []
for i in llp_statements:
    try:
        label_prediction.append(GenAI_article_truth_processing(i,[])[0][0][1])
    except (ValueError, IndexError) as e:
        label_prediction.append(None)
        continue

HttpError: <HttpError 400 when requesting https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key=AIzaSyCElMgVeT2_ng6hSnJMNHXt4t78fOv8J9U&alt=json returned "Attribute SEXUALLY_EXPLICIT does not support request languages: de". Details: "[{'@type': 'type.googleapis.com/google.commentanalyzer.v1alpha1.Error', 'errorType': 'LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE', 'languageNotSupportedByAttributeError': {'detectedLanguages': ['de'], 'attribute': 'SEXUALLY_EXPLICIT'}}]">

In [388]:
for i in range(len(label_prediction)):
    # Check if the current value matches the target value
    if label_prediction[i] == 'False':
        label_prediction[i] = 'false'
    elif label_prediction[i] == 'Half-true':
        label_prediction[i] = 'half-true'
    elif label_prediction[i] == 'Mostly-false':
        label_prediction[i] = 'barely-true'
    elif label_prediction[i] == 'Mostly-true':
        label_prediction[i] = 'mostly-true'
    elif label_prediction[i] == 'True':
        label_prediction[i] = 'true'
    elif label_prediction[i] == 'mostly-false':
        label_prediction[i] = 'barely-true'
    elif label_prediction[i] == 'mostly-true, mostly-true':
        label_prediction[i] = 'mostly-true'
    elif label_prediction[i] == 'Pants-on-fire':
        label_prediction[i] = 'pants-fire'    

In [389]:
unique_llp_labels = set(llp_labels)
unique_llp_labels

{'barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true'}

In [390]:
unique_label_predictions = set(label_prediction)
unique_label_predictions

{'Barely true',
 'Barely-true',
 'Half True',
 'Half-True',
 'Mostly False',
 'Mostly True',
 'Mostly-False',
 'Mostly-True',
 None,
 'None provided',
 "['half-true', 'false']",
 "['half-true', 'mostly-true']",
 "['mostly-true', 'false']",
 "['true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'pants-on-fire', 'pants-on-fire', 'mostly-true', 'true']",
 "['true', 'mostly-true', 'mostly-true']",
 "['true', 'mostly-true']",
 'barely-true',
 'false',
 'half-true',
 'half-true\nmostly-true',
 'half-true, false',
 'half-true, mostly-true',
 'half-true, pants-on-fire',
 'half-true, true',
 'mostly-false, mostly-true',
 'mostly-true',
 'mostly-true, half-true',
 'mostly-true, mostly-false',
 'mostly-true, true',
 'pants-fire',
 'pants-on-fire',
 'pants-on-fire, half-true',
 'pants-on-fire, true',
 'true',
 'true, mostly-true'}

In [392]:
num_matches = 0
for value1, value2 in zip(label_prediction, llp_labels):
    if value1 == value2:
        num_matches += 1

accuracy = num_matches / len(label_prediction) * 100

In [393]:
accuracy

36.44951140065146

## Cohere API implementation

In [524]:
pip install -U FlagEmbedding

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting FlagEmbedding
  Downloading FlagEmbedding-1.2.5.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting accelerate>=0.20.1
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Building wheels for collected packages: FlagEmbedding
  Building wheel for FlagEmbedding (setup.py) ... [?25ldone
[?25h  Created wheel for FlagEmbedding: filename=FlagEmbedding-1.2.5-py3-none-any.whl size=43013 sha256=e77a829aed3df41b6f92ab0ddbf4f8d0e659969843a3873d09d006fbfe73b3f1
  Stored in directory: /Users/nicholasshor/Library/Caches/pip/wheels/fa/88/19/e943a5c1531d0db10db20cfd3c124a881f2d7e06d7cf4aaac1
Successfully built FlagEmbedding
Installing collected packages: accelerate, FlagEmbedding
Successfully installed FlagEmbedding-1.2.5 accelerate-0.27.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0

In [525]:
from FlagEmbedding import FlagReranker
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [521]:
chunked_article_list = tokenize_into_chunks(news_article, 50)

In [571]:
all_response_text = []
context_list = []
for i in range(len(chunked_article_list)):
    input = chunked_article_list[i]
    context = RAG_CONTEXT_VDB.query(
        query_texts=[input],
        n_results=8,
    )
    context_list.append(context)
        
fact_checks_list=[]
for i in range(len(chunked_article_list)):
    input = chunked_article_list[i]
    fact_checks = RAG_STATEMENTS_VDB.query(
        query_texts=[input],
        n_results=8,
    )
    fact_checks_list.append(fact_checks)

In [572]:
for i in range(len(context_list)):
    input=chunked_article_list[i]
    fact_checks = fact_checks_list[i]
    context = context_list[i]
    prev_chunk = chunked_article_list[i - 1] if i > 0 else None
    next_chunk = chunked_article_list[i + 1] if i + 1 < len(chunked_article_list) else None
    
    history = [prev_chunk, input, next_chunk]
    

    statement_rerank_list = []
    #Iterating through the fact_check data and context data and creating a dictionary for each individual statement for Gen AI processing
    for j in range(len(fact_checks['ids'][0])):
        reranking_statementSearch = [input, fact_checks['documents'][0][j]]
        statement_rerank_list.append(reranking_statementSearch)

    
    scores = reranker.compute_score(statement_rerank_list)
    combined_statement_scores = list(zip(scores, statement_rerank_list, fact_checks['metadatas'][0]))
    sorted_combined_data = sorted(combined_statement_scores, key=lambda x: x[0], reverse=True)
    sorted_statement_scores, sorted_statement_rerank_list, sorted_factCheck_metadata = zip(*sorted_combined_data)


    context_rerank_list = []
    for k in range(len(context['ids'][0])):
        reranking_contextSearch = [input, context['documents'][0][k]]
        context_rerank_list.append(reranking_contextSearch)
        
    scores = reranker.compute_score(context_rerank_list)
    combined_context_scores = list(zip(scores, context_rerank_list, context['metadatas'][0]))
    sorted_combined_data = sorted(combined_context_scores, key=lambda x: x[0], reverse=True)
    sorted_context_scores, sorted_context_rerank_list, sorted_context_metadata = zip(*sorted_combined_data)

context_window = 4
prepared_context = []
prepared_fact_checks = []
for i in range(context_window):
    prepared_context.append([sorted_context_metadata[i], sorted_context_rerank_list[i][1]])
    prepared_fact_checks.append([sorted_factCheck_metadata[i], sorted_statement_rerank_list[i][1]])

#### GEN AI Model Function

In [676]:
def GenAI_article_truth_processing(news_article, history):
    #getting input history for context and starting chat
    news_article = f"""{news_article}"""
    history = history or []
    #instantiating gemini pro
    PROJECT_ID = "gen-lang-client-0321728687"
    REGION = "us-central1"
    vertexai.init(project=PROJECT_ID, location=REGION)
    model = generative_models.GenerativeModel("gemini-pro")
    config = {"max_output_tokens": 2048, "temperature": 0.0}
    
    safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH
    }
    chat = model.start_chat()
    history = list(sum(history, ()))

    #PerspectiveAPI output check
    client = discovery.build(
      "commentanalyzer",
      "v1alpha1",
      developerKey=PERSPECTIVE_API_KEY,
      discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
      static_discovery=False,
    )
    
    
    #chunking the news article for improved processing
    chunked_article_list = tokenize_into_chunks(news_article, 50)
    
    #getting context and fact checks from vector database based on the provided input
    all_response_text = []
    context_list = []
    for i in range(len(chunked_article_list)):
        input = chunked_article_list[i]
        context = RAG_CONTEXT_VDB.query(
            query_texts=[input],
            n_results=7,
        )
        context_list.append(context)
        
    fact_checks_list=[]
    for i in range(len(chunked_article_list)):
        input = chunked_article_list[i]
        fact_checks = RAG_STATEMENTS_VDB.query(
            query_texts=[input],
            n_results=7,
        )
        fact_checks_list.append(fact_checks)


    for i in range(len(context_list)):
        input=chunked_article_list[i]
        fact_checks = fact_checks_list[i]
        context = context_list[i]
        prev_chunk = chunked_article_list[i - 1] if i > 0 else None
        next_chunk = chunked_article_list[i + 1] if i + 1 < len(chunked_article_list) else None
        
        history = [prev_chunk, input, next_chunk]
        
    
        statement_rerank_list = []
        #Iterating through the fact_check data and context data and creating a dictionary for each individual statement for Gen AI processing
        for j in range(len(fact_checks['ids'][0])):
            reranking_statementSearch = [input, fact_checks['documents'][0][j]]
            statement_rerank_list.append(reranking_statementSearch)
    
        
        scores = reranker.compute_score(statement_rerank_list)
        combined_statement_scores = list(zip(scores, statement_rerank_list, fact_checks['metadatas'][0]))
        sorted_combined_data = sorted(combined_statement_scores, key=lambda x: x[0], reverse=True)
        sorted_statement_scores, sorted_statement_rerank_list, sorted_factCheck_metadata = zip(*sorted_combined_data)
    
    
        context_rerank_list = []
        for k in range(len(context['ids'][0])):
            reranking_contextSearch = [input, context['documents'][0][k]]
            context_rerank_list.append(reranking_contextSearch)
            
        scores = reranker.compute_score(context_rerank_list)
        combined_context_scores = list(zip(scores, context_rerank_list, context['metadatas'][0]))
        sorted_combined_data = sorted(combined_context_scores, key=lambda x: x[0], reverse=True)
        sorted_context_scores, sorted_context_rerank_list, sorted_context_metadata = zip(*sorted_combined_data)
    
        context_window = 3
        prepared_context = []
        prepared_fact_checks = []
        for i in range(context_window):
            prepared_context.append([sorted_context_metadata[i], sorted_context_rerank_list[i][1]])
            prepared_fact_checks.append([sorted_factCheck_metadata[i], sorted_statement_rerank_list[i][1]])

        #Changing chunks from list of strings to one combined string for Gen AI processing
        chunk_history_string = ''
        for chunk in history:
            if chunk != None:
                chunk_history_string += chunk + " "


        #generating initial response with prompt template
        responses = model.generate_content(f"""Answer the question below marked inside <<<>>> in a full sentence based on the
        knowledge you already have access to answer the question.
    
        If you are not very sure of your answer to the question, then use the additional information I've provided below within the 
        ((())) symbols to help you.
        (((
        Refer to these fact checked statements as well to determine your answer and be sure to pay close attention to the 
        metadata that is provided: {prepared_fact_checks}.
        Use the following context to help answer the question: {prepared_context}.
        You may also use the chat history provided to help you understand the context better if available: {chunk_history_string}.
        Make sure you provide a short explanation of why you chose that score.
        )))
        <<<
        Question: How true is the following statement on a scale of 1-100? + {input}. You must provide the score in this format Score:XX., 
        followed by the input statement, and then followed by your short explanation.
        >>>
       """,
            generation_config=config,
            stream=True,
            safety_settings=safety_config,                          
        )

        
    #obtaining individual responses
        response_text = ""
        for response in responses:
            response_text += response.text
        response_text = response_text.replace("\n\n", ". ")
        all_response_text.append(response_text)
        

    #combining all responses    
    entire_text_string = ""
    for text in all_response_text:
        entire_text_string += text
    cleaned_text = clean_text(entire_text_string)
    
    #this section is finding and removing the statements that can't be rated by the chatbot
    unratable_sentences = []
    rated_sentences = []
    
    for response in all_response_text:
        if "article does not" in response.lower() or "context does not" in response.lower() or "statement is not" in response.lower():
            unratable_sentences.append(response)
        else:
            rated_sentences.append(response)
    
    not_enough_context = len(unratable_sentences)
    enough_context = len(rated_sentences)
    all_statements_count = len(all_response_text)

    # Regular expression pattern
    number_pattern = r'\b(\d+(?:\.\d+)?)(?:\.|%)'
    total_score = 0
    
    # Extracting the number value
    for statement in rated_sentences:
        match = re.search(number_pattern, statement)
        if match:
            number = match.group(0)[:-1]  # Removing the period
            number = int(number)
            total_score += number
            
    overall_score = total_score / len(rated_sentences)
    overall_score = round(float(overall_score), 1)
    
    #model generation for output to user
    final_responses = model.generate_content(f"""Each entry in the list of statements provided below inside <<<>>> begins with a number
    that explains how truthful a statement is and is followed by a text explanation to why that score was chosen. I need you to select the 
    three statements with the lowest scores and return them back with their explanations to why they received low scores. 
    I would like you to format your response like this, and continue to follow it for each statement you choose to include.
    
    "{enough_context} out of {all_statements_count} statements in the text could be rated. 
    The following score and explanation is based on these {enough_context} statements. The average truthfulness score from these {all_statements_count} statements is {overall_score}. Some of the lowest rated statements are provided below"
    
    \nScore: XX
    Statement: "Statement here"
    Explanation: "Explanation here"

    <<<
    {rated_sentences}
    >>>""",
        generation_config=config,
        stream=True,
        safety_settings=safety_config,
    )

    # final_responses = model.generate_content(f"""Each entry in the list of statements provided below inside <<<>>> begins with a number
    # that explains how truthful a statement is and is followed by a text explanation to why that score was chosen. I need you to convert that
    # number to a label based on the following categories. Scores from 0-16 should be labeled pants-on-fire, scores from 17-33 should be
    # labeled false, scores from 34-50 should be labeled mostly-false, scores from 50-66 should be labeled half-true, scores from 67-83 should be
    # labeled mostly-true, and scores from 84-100 should be labeled true. All I want you to return to me is the label.
    # <<<
    # {rated_sentences}
    # >>>""",
    #     generation_config=config,
    #     stream=True,
    #     safety_settings=safety_config,
    # )
    
    
    final_response_text = ""
    for response in final_responses:
        final_response_text += response.text
    output = final_response_text.replace("\n\n", ". ")

    #Perspective API output safety check
    analyze_request = {
      'comment': { 'text': output},
      'requestedAttributes': requestedAttributes
    }
    response = client.comments().analyze(body=analyze_request).execute()
    
    attributes_surpassed = []
    for key in response['attributeScores']:
        if response['attributeScores'][key]['summaryScore']['value'] > attributeThresholds[key]:
            attributes_surpassed.append((key, response['attributeScores'][key]['summaryScore']['value']))
    
    #crafting output warning message if necessary or regular output message  
    history_output = []
    if len(attributes_surpassed) == 1:
        attributes_violated = ""
        for i in attributes_surpassed:
            attributes_violated += i[0] + " "
        warning_message = f"""We're sorry, the output message surpasses our threshold for the {attributes_violated}category so we cannot safely provide a response. Please try again with a different input."""
        history_output.append([news_article, warning_message])
        
    elif len(attributes_surpassed) > 1:
        attributes_violated = ""
        counter = 1
        attributes_count = len(attributes_surpassed)
        for i in attributes_surpassed:
            attributes_violated += i[0] + " "
            if counter < attributes_count:
                attributes_violated += "and "
            counter += 1
        warning_message = f"""We're sorry, the output message surpasses our threshold for the {attributes_violated}categories so we cannot safely provide a response. Please try again with a different input."""
        history_output.append([news_article, warning_message])

    else:
        history_output.append([news_article, output])
    return history_output, history_output

In [657]:
news = """Months after leaving the White House, former President Donald Trump began plotting his return to Wall Street. That return, delayed by years of regulatory and legal hurdles, is now on the verge of becoming a reality — and it could make Trump a fortune.

US regulators have finally given the green light to a controversial merger between Truth Social owner Trump Media & Technology Group and a blank-check company. The blessing from the Securities and Exchange Commission removes the last major obstacle holding back the deal.

The merger, if approved by shareholders, would pave the way for Trump Media to become a publicly-traded company — one where Trump will own a dominant stake that could be worth billions.

Digital World Acquisition Corp., the blank-check firm, announced that on Wednesday the SEC signed off on the merger proxy for the deal. A date for a shareholder vote will be set by Friday.

“It does look like this deal is going to reach the finish line now — after more than two years of delays,” said Jay Ritter, a finance professor at the University of Florida.

Trump stake could be worth $4 billion
Shares of Digital World, a special purpose acquisition company, or SPAC, spiked 15% on the major milestone. The stock has nearly tripled this year, fueled by Trump’s political success in the Republican presidential primary, and now the merger progress.

Ritter estimates the merger could pave the way for about $270 million of cash coming into Trump Media, funds the company could fuel Truth Social’s growth.

Trump is set to hold a dominant position in the newly-combined company, owning roughly 79 million shares, according to new SEC filings.

The former president’s stake would be valued at $4 billion based on Digital World’s current trading price of about $50.

Of course, as Ritter notes, it would be very difficult for Trump to translate that paper wealth into actual cash.

Not only would Trump be subject to a lock-up period that would prevent he and other insiders from selling until six months after the merger, but the new company’s fortunes would be closely associated with the former president. That could make it difficult for Trump to sell even after the lock-up period expires.

‘This is a meme stock’
Moreover, there are major questions about the sky-high valuation being placed on this media company.

“This is a meme stock. The valuation is totally divorced from the fundamental value of the company,” said Ritter.

Digital World’s share price values the company at up to about $8 billion on a fully diluted basis, which includes all shares and options that could be converted to common stock, according to Ritter.

He described that valuation as “crazy” because Trump Media is generating little revenue and burning through cash.

New SEC filings indicate Trump Media’s revenue amounted to just $1.1 million during the third quarter. The company posted a loss of $26 million.

Since the merger was first proposed in October 2021, legal, regulatory and financial questions have swirled about the transaction.

In November, accountants warned that Trump Media was burning cash so rapidly that it might not survive unless the long-delayed merger with Digital World is completed soon.

Shareholder vote looms
Now, Trump execs are cheering the green light from the SEC.

“Truth Social was created to serve as a safe harbor for free expression and to give people their voices back,” Trump Media CEO Devin Nunes, a former Republican congressman, said in a statement. “Moving forward, we aim to accelerate our work to build a free speech highway outside the stifling stranglehold of Big Tech.”

Eric Swider, Digital World’s CEO, described the SEC approval as a “significant milestone” and said executives are “immensely proud of the strides we’ve taken towards advancing” the merger.

One of the final remaining hurdles is for Digital World shareholders to approve the merger in an upcoming vote.

The shareholders have enormous incentive to approve the deal because if the merger fails, the blank-check firm would be forced to liquidate. That would leave shareholders with just $10 a share, compared with $50 in the market today.

“Anyone who holds shares and votes against the merger is crazy,” said Ritter, the professor.

“Then again, I might argue that everyone holding DWAC shares is crazy,” he added, referring to the company’s thin revenue and hefty valuation.

Matthew Tuttle, CEO of Tuttle Capital Management, said he’s not surprised by the ups and downs surrounding this merger.

“The thing about Trump and anything related to Trump is, love him or hate him, there is going to be drama,” said Tuttle, who purchased options to buy Digital World shares in his personal account. “Really, I would not have expected anything less.”

Going forward, Tuttle said Trump Media’s share price will live and die by how everything plays out for Trump personally — from his legal troubles to his potential return to the White House.

“Anything bullish for Trump is going to be bullish for the stock,” said Tuttle.

Trump is no stranger to Wall Street, where he has a history, one marked by bankruptcies.

Although Trump has never filed for personal bankruptcy, he has filed four business bankruptcies — all of them linked to casinos he used to own in Atlantic City."""

In [677]:
final_response = GenAI_article_truth_processing(news, [])

In [678]:
final_response[0][0][1]

'26 out of 28 statements in the text could be rated. \n    The following score and explanation is based on these 26 statements. The average truthfulness score from these 28 statements is 85.8. Some of the lowest rated statements are provided below". Score: 50\n    Statement: “ Truth Social was created to serve as a safe harbor for free expression and to give people their voices back , ” Trump Media CEO Devin Nunes , a former Republican congressman , said in a statement ..\n    Explanation: The statement is made by Devin Nunes, who has a history of pushing social media companies to restrict speech that he objected to. He has also filed several defamation lawsuits against media companies and critics. This suggests that he may not be fully committed to free speech.. Score: 70\n    Statement: “ Then again , I might argue that everyone holding DWAC shares is crazy , ” he added , referring to the company ’ s thin revenue and hefty valuation.\\nExplanation: The statement is somewhat true beca

## Fine Tuning

In [599]:
model.generate_content()

<vertexai.preview.generative_models._PreviewGenerativeModel at 0x37b540310>

## Gradio (Website) Implementation

In [672]:
import gradio as gr

In [673]:
block = gr.Blocks()
prompt_placeholder = "Insert your news article here!"

In [674]:
with block:
    gr.Markdown("""<h1><center>Generative AI News Article Truthfulness Evaluator</center></h1>
    """)
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder=prompt_placeholder)
    state = gr.State()
    submit = gr.Button("SEND")
    submit.click(GenAI_article_truth_processing, inputs=[message, state], outputs=[chatbot, state])

In [675]:
block.launch(debug = True, share = True)

Running on local URL:  http://127.0.0.1:7860


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://c9cf96d887b56476f1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c9cf96d887b56476f1.gradio.live


