# Logical fallacy extraction from live news articles

### What is this? 
This script returns any logical fallacies found in news articles, so you can feel enlightened and empowered and not manipulated.

### How does this work?
1. You specify a topic to search the news on
2. Script uses the Google News API to return a list of articles on a topic from the last 7 days
3. Extracts the news text using Beautiful Soup
4. Creates a Sequential Chain using LangChain and OpenAI to analyze and return any logical fallacies found

### How are logical fallacies defined?
This paper on arXiv describes 19 categories found since Aristotle's original 13 - incorporated for RAG using OpenAI's embeddings model and a FAISS vector db for quick retrieval and analysis
<br> https://arxiv.org/pdf/2212.07425.pdf

### How do I deploy this on my machine?
Either run this notebook, or out of this same directory run ```streamlit run newsvalidation.py``` to open GUI courtesy of streamlit

### Requirements
Use your venv and requirements.txt 


In [56]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys 
import openai 
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Populate the process environment from the .env file located by
# find_dotenv(); the return value is bound to `_` and otherwise unused.
_ = load_dotenv(dotenv_path=find_dotenv())

# Both credentials are mandatory: os.environ[...] raises KeyError right here
# if either variable is missing.
openai.api_key = os.environ['OPENAI_API_KEY']
serper_api_key = os.environ['SERPER_API_KEY']

In [57]:
# import packages

from bs4 import BeautifulSoup
import faiss
import json
from langchain import OpenAI, VectorDBQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, JSONLoader, UnstructuredFileLoader, WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import LLMChain, SequentialChain, RetrievalQA
from langchain.memory import VectorStoreRetrieverMemory
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma
from langchain.schema import Document
from pathlib import Path
from pprint import pprint
import requests
import streamlit as st, tiktoken 
from unstructured.cleaners.core import clean_extra_whitespace


In [58]:
from datetime import datetime, timedelta
from langchain.docstore import InMemoryDocstore
from langchain.retrievers import TimeWeightedVectorStoreRetriever



## Load logical fallacies to memory
arxiv > dict > faiss

In [67]:
# The 19 logical fallacies and their one-line definitions.
# Source of the extract: https://arxiv.org/pdf/2212.07425.pdf
# Used to evaluate results when this dict, rather than the PDF, is the
# data source.

# Each row pairs a fallacy name with its description so the two can never
# drift out of alignment; the column-oriented dict is derived below.
_FALLACY_ROWS = [
    ("Adhominem",
     "attacks on the character or personal traits of the person making an argument rather than addressing the actual argument and evidence"),
    ("Adpopulum",
     "the fallacy that something must be true or correct simply because many people believe it or do it, without actual facts or evidence to support"),
    ("Appeal to Emotion",
     "an attempt to win support for an argument by exploiting or manipulating people's emotions rather than using facts and reason"),
    ("Fallacy of Extension",
     "making broad, sweeping generalizations and extending the implications of an argument far beyond what the initial premises support"),
    ("Intentional Fallacy",
     "falsely supporting a conclusion by claiming to understand an author or creator's subconscious intentions without clear evidence"),
    ("False Causality",
     "jumping to conclusions about causation between events or circumstances without adequate evidence to infer a causal relationship"),
    ("False Dilemma",
     "presenting only two possible options or sides to a situation when there are clearly other alternatives that have not been considered or addressed"),
    ("Hasty Generalization",
     "making a broad inference or generalization to situations, people, or circumstances that are not sufficiently similar based on a specific example or limited evidence"),
    ("Illogical Arrangement",
     "constructing an argument in a flawed, illogical way, so the premises do not connect to or lead to the conclusion properly"),
    ("Fallacy of Credibility",
     "dismissing or attacking the credibility of the person making an argument rather than directly addressing the argument itself"),
    ("Circular Reasoning",
     "supporting a premise by simply repeating the premise as the conclusion without giving actual proof or evidence"),
    ("Begging the Question",
     "restating the conclusion of an argument as a premise without providing actual support for the conclusion in the first place"),
    ("Trick Question",
     "asking a question that contains or assumes information that has not been proven or substantiated"),
    ("Overapplying",
     "applying a general rule or generalization to a specific case it was not meant to apply to"),
    ("Equivocation",
     "using the same word or phrase in two different senses or contexts within an argument"),
    ("Amphiboly",
     "constructing sentences such that the grammar or structure is ambiguous, leading to multiple interpretations"),
    ("Word Emphasis",
     "shifting the emphasis of a word or phrase to give it a different meaning than intended"),
    ("Composition",
     "erroneously inferring that something is true of the whole based on the fact that it is true of some part or parts"),
    ("Division",
     "erroneously inferring that something is true of the parts based on the fact that it is true of the whole"),
]

fallacies = {
    "Logical Fallacy Name": [name for name, _ in _FALLACY_ROWS],
    "Description": [text for _, text in _FALLACY_ROWS],
}

# Serialise once and persist the exact same text, so the file on disk and
# json_str always agree.
json_str = json.dumps(fallacies, indent=4)
with open("fallacies.json", "w") as json_file:
    json_file.write(json_str)


In [68]:
# Read the fallacy dict back from the JSON file on disk: no obvious way was
# found to hand the in-memory dict to a loader in the format loader.load()
# produces.  jq_schema='.' selects the whole JSON document.
loader = JSONLoader(text_content=False, jq_schema='.', file_path='./fallacies.json')
data = loader.load()

In [69]:
# Split the loaded fallacy document with the same methodology used for the PDF.

# Chat model shared by the retrieval QA chain and the analysis chains below.
# The key is passed through the `openai_api_key` keyword; the previous
# `openai.api_key=open_api_key` was not valid syntax (a keyword argument name
# cannot be a dotted expression) and referenced an undefined variable.
llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo', openai_api_key=openai.api_key)

chunk_size = 50
chunk_overlap = 10
text_splitter = CharacterTextSplitter(
    separator="\n",        # split points are newlines only
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,   # chunk size is measured in characters
)
docs2 = text_splitter.split_documents(data)

# Trailing expression so the notebook displays how many chunks were produced.
len(docs2)

1

In [77]:
# Set up the FAISS retriever over the fallacy chunks and save the index for
# later reference.

# `embeddings` was used here without ever being defined; create it
# explicitly so this cell does not depend on leftover notebook state.
embeddings = OpenAIEmbeddings()

db2 = FAISS.from_documents(docs2, embeddings)
retriever2 = db2.as_retriever(search_kwargs={"k": 2})  # return the 2 closest chunks
model2 = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever2)

# Persist the index to a local directory.
save_directory2 = "FallacyDict"
db2.save_local(save_directory2)

In [81]:
# Smoke-test the retrieval QA chain against the fallacy index.

query = 'list the primary categories and subcategories of logical fallacies and define each one'

# Invoke the chain once.  The previous version called it a second time via
# model2.run(query) and discarded this first response, doubling the API cost
# for the same answer.
response = model2({"query": query}, return_only_outputs=True)
f_results = response["result"]  # RetrievalQA stores its answer under "result"
print(f_results)

The primary categories of logical fallacies are:

1. Ad Hominem: This fallacy involves attacks on the character or personal traits of the person making an argument rather than addressing the actual argument and evidence.

2. Ad Populum: This is the fallacy that something must be true or correct simply because many people believe it or do it, without actual facts or evidence to support.

3. Appeal to Emotion: This fallacy is an attempt to win support for an argument by exploiting or manipulating people's emotions rather than using facts and reason.

4. Fallacy of Extension: This fallacy involves making broad, sweeping generalizations and extending the implications of an argument far beyond what the initial premises support.

5. Intentional Fallacy: This fallacy involves falsely supporting a conclusion by claiming to understand an author or creator's subconscious intentions without clear evidence.

6. False Causality: This fallacy involves jumping to conclusions about causation between e

## Call API for News URLs by way of search terms or API
Using SerperAPI, search term results ranked by relevancy to key words in title

In [72]:
# Search recent news for a topic and load the text of the top articles.
# In the Streamlit app the query comes from a text field; it is hardcoded
# here for the notebook demo.

search_query = 'covid'
num_results = 3

search = GoogleSerperAPIWrapper(type="news", tbs="qdr:w1", serper_api_key=serper_api_key)

# Documents from every successfully loaded article.  Initialised up front so
# later cells never hit a NameError, and extended (not overwritten) so the
# earlier articles are not silently thrown away.
datanews = []

try:
    news_items = search.results(search_query)['news']
except Exception as e:
    print(f"Error fetching search results, exception: {e}")
else:
    for item in news_items[:num_results]:
        url = item.get('link')
        if not url:
            # Search hit without a usable link: nothing to load.
            continue
        loader = WebBaseLoader(url)  # using bs4
        try:
            datanews.extend(loader.load())
        except Exception as e:
            # Best effort: one unreachable article must not abort the rest.
            print(f"Error fetching {url}, exception: {e}")

In [104]:
# Flatten the fallacy table into one text blob: a "name" line and a
# "description" line per fallacy, each pair followed by a blank line.
result_text = "".join(
    f"Logical Fallacy Name: {fallacy_name}\nDescription: {description}\n\n"
    for fallacy_name, description in zip(
        fallacies['Logical Fallacy Name'], fallacies['Description']
    )
)

In [99]:
# Chunk the loaded news documents for embedding, if/where needed.
chunk_size, chunk_overlap = 50, 10

# Separators are tried in the order given: space, comma, then newline.
text_splitter_news = RecursiveCharacterTextSplitter(
    separators=[" ", ",", "\n"],
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

news = text_splitter_news.split_documents(datanews)


In [82]:
# Empty FAISS store wrapped in a time-weighted retriever, exposed as chain
# memory.

# Width of the L2 index; presumably the output dimension of the OpenAI
# embedding model in use -- TODO confirm.
embedding_size = 1536
embeddings_model3 = OpenAIEmbeddings()
index = faiss.IndexFlatL2(embedding_size)

# Starts with no documents: empty docstore and empty index-to-id mapping.
vectorstore3 = FAISS(embeddings_model3.embed_query, index, InMemoryDocstore({}), {})
retriever3 = TimeWeightedVectorStoreRetriever(
    k=2,
    decay_rate=0.999999,
    vectorstore=vectorstore3,
)
memory = VectorStoreRetrieverMemory(retriever=retriever3)  # memory might not be needed


In [115]:
 # Chain 1: Summarize any possible logical fallacies in the text of the article

# Prompt for the first stage.  It asks for a one-sentence summary of the
# article followed by up to two logical fallacies, one sentence each.
# The trailing backslashes join the source lines, so the prompt is a single
# paragraph up to "Communications expert:"; only the final "Summary:" label
# sits on its own line.  {datanews} is the sole placeholder.
template1 = """You are a communications expert who speaks nothing but truth and logic and is \
extremely clear for a wide audience.  Given a full news article, your job is to summarize \
it accurately and with brevity in one sentence, then find any logical fallacies that \
may exist and return examples in no more than 1 sentence per logical fallacy found.  \
If more than one logical fallacies are found, return the top 2, in order of logical strength, \
unless no logical fallacies are found, in which then state no strong logical fallacies are clearly evident. \
Article: {datanews} \
Communications expert: 
Summary:"""
prompt_template1 = PromptTemplate(input_variables=["datanews"], template=template1)
# The model's reply is published under the 'summary' key.
chain1 = LLMChain(llm=llm, prompt=prompt_template1, output_key='summary')

In [116]:
 # Chain 2: Analyze the implications of any logical fallacies in the article in relation to the article summary

# Prompt for the second stage.  It receives the first stage's output as
# {summary} and asks for an analysis of the strongest fallacy plus one
# theoretical counterfactual.
#
# The instruction paragraph is joined into one line by the trailing
# backslashes.  The closing scaffold (Professor / Summary / Analysis /
# Theoretical Counterfactual) is deliberately written one label per line:
# two of those lines previously ended in backslash + space, which is not a
# line continuation but an invalid escape that leaked a literal backslash
# into the prompt, and "Summary: {summary}" ran straight into "Analysis:".
template2 = """You are an engaging professor who only speaks with truth and sound logic \
while clearly conveying a point in as few words as possible.  Given the text of a news article as defined, if the title reads 'Access Denied' then state no access \
to the article is available. If the title does not read 'Access Denied', then create two outputs: Analysis and Counterfactual. \
For the Analysis output, it will be three parts.  The first part is labeled 'Summary' and returns {summary}.  The second part is labeled 'Analysis' and is two sentences.  First, you need \
to return the top ranked logical fallacy in the article, among any logical fallacies that may exist, ranked by order of logical strength, \
described with brevity in one sentence and confirming this logical fallacy is correct by extracting factual evidence \
from the article text, then finally referencing the extracted fact in the description. Be sure to state the strongest logical fallacy might not be strong, \
so is only to consider. Create the second sentence of the Analysis output by stating why this fallacy might be dangerous to the public or \
especially misleading in the context of the news article, with respect to how other readers could react. \
The third part is labeled 'Theoretical Counterfactual', explain any counterfactuals to the summary of the article ({summary}) that could hypothetically be true, \
based on logic and the limited facts presented in the article.  If more than one counterfactuals exist, only return  \
the top ranked counterfactual, ranked in order of logical strength and feasibility, described with brevity in 1 sentence.
Professor:
Summary: {summary}
Analysis:
Theoretical Counterfactual: """
prompt_template2 = PromptTemplate(input_variables=["summary"], template=template2)
# The model's reply is published under the 'analysis' key.
chain2 = LLMChain(llm=llm, prompt=prompt_template2, output_key='analysis')

In [119]:
# Search recent news for a topic and print a fallacy analysis per article.
# In the Streamlit app the query comes from a text field; it is hardcoded
# here for the notebook demo.

search_query = 'covid'
num_results = 4

search = GoogleSerperAPIWrapper(type="news", tbs="qdr:w1", serper_api_key=serper_api_key)

# The chain definition does not depend on the article, so build it once
# instead of once per search hit.
overall_chain1 = SequentialChain(
    chains=[chain1, chain2],
    input_variables=["datanews"],
    output_variables=["analysis"],
    verbose=True,
)

try:
    news_items = search.results(search_query)['news']
except Exception as e:
    print(f"Error fetching search results, exception: {e}")
else:
    for item in news_items[:num_results]:
        url = item.get('link')
        if not url:
            # Search hit without a usable link: nothing to load.
            continue

        loader = WebBaseLoader(url)  # bs4
        try:
            datanews = loader.load()
        except Exception as e:
            # Best effort: one unreachable article must not abort the rest.
            print(f"Error fetching {url}, exception: {e}")
            continue

        # Hand the prompt the article text itself.  Interpolating the list of
        # Document objects would insert their repr (metadata, escaped
        # newlines) into the prompt instead of readable prose.
        article_text = "\n\n".join(doc.page_content for doc in datanews)

        try:
            first = overall_chain1({"datanews": article_text})
        except Exception as e:
            # Reported separately so a model/API failure is not mislabelled
            # as a download failure.
            print(f"Error analyzing {url}, exception: {e}")
            continue

        analysis = first['analysis']
        print(analysis)



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
If the COVID-19 virus mutates significantly, the current vaccines may not provide the same level of protection as they do now.


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
If the vaccines were not updated to better protect against the currently circulating variants, it is possible that the efficacy of the vaccines could decrease, leading to a potential increase in COVID-19 cases.


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Analysis: 
The strongest logical fallacy in the article is the appeal to authority, as it relies on the CDC and FDA's authority to validate the vaccine's effectiveness without providing specific data or studies. This fallacy could be misleading to the public as it may lead them to believe that the vaccine's effectiveness is universally accepted and uncontested, potentially discouraging further research or questioning.

Theoreti

In [126]:
# Replicating above functionality but using Streamlit for input


def _run_fallacy_analysis(query, serper_key, max_results):
    """Search news for *query* and render one fallacy critique per article.

    Uses the same Serper news search settings as the notebook cells above,
    loads each hit with WebBaseLoader, runs chain1 -> chain2 on the article
    text, and writes the result to the page.

    Args:
        query: Search terms (or a URL) typed by the user.
        serper_key: Serper API key used for the news search.
        max_results: Maximum number of search hits to analyse.

    Raises:
        Exception: Anything raised by the search call itself propagates to
            the caller; per-article failures are reported on the page and
            skipped.
    """
    search = GoogleSerperAPIWrapper(type="news", tbs="qdr:w1", serper_api_key=serper_key)
    result_dict = search.results(query)

    articles = result_dict.get('news')
    if not articles:
        st.error(f"No search results for: {query}.")
        return

    # Build the chain once rather than once per article.
    overall_chain1 = SequentialChain(
        chains=[chain1, chain2],
        input_variables=["datanews"],
        output_variables=["analysis"],
        verbose=True,
    )

    for item in articles[:int(max_results)]:
        url = item.get('link')
        if not url:
            # Search hit without a usable link: nothing to load.
            continue
        try:
            datanews = WebBaseLoader(url).load()  # bs4
            # Pass the article text, not the repr of the Document list.
            article_text = "\n\n".join(doc.page_content for doc in datanews)
            first = overall_chain1({"datanews": article_text})
        except Exception as e:
            # Shown on the page: console output is invisible to app users.
            st.error(f"Error fetching {url}, exception: {e}")
            continue
        # The critique lives in the chain output (`first`), not in the
        # search hit (`item`), which has no 'analysis' key.
        st.success(f"Logical Fallacy Critique: {first['analysis']}\n\nLink: {url}")


def _handle_click(missing_input_message):
    """Validate the sidebar/search inputs, then run the analysis.

    Args:
        missing_input_message: Error text shown when a key or the query is
            blank.
    """
    if not (api_key.strip() and serper_api_key.strip() and search_query.strip()):
        st.error(missing_input_message)
        return
    try:
        with st.spinner("Analyzing articles..."):
            # Top relevant news articles, per the search settings above.
            _run_fallacy_analysis(search_query, serper_api_key, num_results)
    except Exception as e:
        # st.exception expects the exception object, not a formatted string.
        st.exception(e)


st.subheader('Enter search terms:')

with st.sidebar:
    # Bound to a local name as well: the validation below reads `api_key`,
    # which was previously never defined.
    api_key = st.text_input("OpenAI API Key", value="", type="password")
    openai.api_key = api_key
    # NOTE(review): chain1/chain2 use an `llm` built elsewhere in this file;
    # whether that client picks up the key entered here is not visible from
    # this block -- confirm.
    serper_api_key = st.text_input("Serper API Key", value="", type="password")
    num_results = st.number_input("Number of Search Results", min_value=3, max_value=5)
    st.caption("*Search: Uses Serper & OpenAI APIs, summarizes each search result.*")
    st.caption("*URL Lookup: Analyzes a specific URL*")
search_query = st.text_input("Search Query", label_visibility="collapsed")
col1, col2 = st.columns(2)

# If the 'Search' button is clicked
if col1.button("Search"):
    _handle_click("Please provide the API keys or the missing search terms.")

# If the 'URL Lookup' button is clicked.  The URL typed in the search box is
# looked up through the same news search (lookup since URLs change).
if col2.button("URL Lookup"):
    _handle_click("Please provide the API keys or missing URL in the search term window.")

