In [1]:
import langchain
import requests
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.document_loaders import UnstructuredURLLoader
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.output_parsers import RegexParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from termcolor import colored
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# How many similar articles to use as reference
SIMILAR_COUNT = 3
# Original article url
ARTICLE_URL = "https://www.reuters.com/world/middle-east/turkey-votes-pivotal-elections-that-could-end-erdogans-20-year-rule-2023-05-13/"

In [3]:
response = requests.get(ARTICLE_URL)
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title').get_text().split('|')[0]

In [4]:
serp_tool = GoogleSerperAPIWrapper(tbs="qdr:m")

In [5]:
similar_articles_serp = serp_tool.results(f"{title} news articles")

In [6]:
similar_articles_final = []
for article_meta in similar_articles_serp["organic"]:
    link = article_meta["link"]
    if not link.startswith('https://www.youtube.com') and not link.startswith('https://youtube.com'):
        similar_articles_final.append(link)
        if len(similar_articles_final) == SIMILAR_COUNT:
            break
print(similar_articles_final)

['https://www.cnbc.com/2023/05/14/turkey-election-rivals-both-claim-early-lead-but-runoff-likely.html', 'https://www.reuters.com/world/middle-east/turkey-votes-pivotal-elections-that-could-end-erdogans-20-year-rule-2023-05-13/', 'https://www.wsj.com/articles/having-established-turkey-on-the-world-stage-erdogan-faces-risky-vote-at-home-fd0793d9']


In [7]:
total_articles = [ARTICLE_URL]
total_articles.extend(similar_articles_final)

In [8]:
loader = UnstructuredURLLoader(urls=total_articles)
articles_data = loader.load()

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 250,
    chunk_overlap  = 30,
    length_function = len
)

In [10]:
article_documents = []
for article_content in articles_data[1:]:
    article_documents.extend(text_splitter.create_documents([article_content.page_content]))

In [11]:
primary_texts = text_splitter.split_text([articles_data[0].page_content])

In [12]:
embeddings = OpenAIEmbeddings()
article_db = FAISS.from_documents(article_documents, embeddings)

In [13]:
article_retriever = article_db.as_retriever()

In [14]:
article_retriever.get_relevant_documents("who is the current the current president of Turkey?")

[Document(page_content="Turkey headed for a runoff vote after President Tayyip Erdogan led over his opposition rival Kemal Kilicdaroglu in Sunday's election but fell short of an outright majority to extend his 20-year rule of the NATO-member country.", metadata={}),
 Document(page_content='The presidential vote will decide not only who leads Turkey but also whether it reverts to a more secular, democratic path, how it will handle its severe cost of living crisis, and manage key relations with Russia, the Middle East and the West.', metadata={}),
 Document(page_content='The presidential vote will decide not only who leads Turkey but also whether it reverts to a more secular, democratic path, how it will handle its severe cost of living crisis, and manage key relations with Russia, the Middle East and the West.', metadata={}),
 Document(page_content="Kilicdaroglu, who said he would prevail in the runoff, urged his supporters to be patient and accused Erdogan's party of interfering with t

In [15]:
# TODO: LLM/Agent reads through article, whenever encountering document with potential misinformation/skewed narrative,
# use get_relevant_documents tool to compare against other articles and determine if a bias is present.

In [16]:
primary_embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(primary_texts, primary_embeddings, metadatas=[{"source": str(i)} for i in range(len(primary_texts))])
docs = docsearch.similarity_search(title)

In [18]:
output_parser = RegexParser(
    regex=r"(.*?)\nScore: (.*)",
    output_keys=["answer", "score"],
)

prompt_template = """Use the following excerpt from a news article titled "{question}" to determine whether any signals of political bias exist within the news article.

If you detect any signals of political bias, concisely explain what the signals are. If you can't find any signals, just say that you can't find any, don't try to make up signals.

In addition to giving an explanation of the political bias signals, also return a score of how confident you are that political bias exists in the excerpt. This should be in the following format:

Political Bias Signals: [explination of political bias signals here]
Score: [confidence score between 0 and 100]

Begin!

excerpt:
---------
{context}
---------
Political Bias Signals:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
    output_parser=output_parser,
)
chain = load_qa_with_sources_chain(ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.0), chain_type="map_rerank", metadata_keys=['source'], return_intermediate_steps=True, prompt=PROMPT)
result = chain({"input_documents": docs, "question": title}, return_only_outputs=False)

In [None]:
# TODO: Modify map_rerank prompt
# Score: How politically biased section piece of context
# Answer: Why llm thinks context is politically biased

In [19]:
result

{'input_documents': [Document(page_content="ISTANBUL, May 14 (Reuters) - Turkey headed for a runoff vote after President Tayyip Erdogan led over his opposition rival Kemal Kilicdaroglu in Sunday's election but fell short of an outright majority to extend his 20-year rule of the NATO-member", metadata={'source': '2'}),
  Document(page_content='Middle East\n\n5 minute read\n\nMay 15, 2023\n\n5:46 AM UTC\n\nLast Updated  ago\n\nTurkey faces runoff election with Erdogan leading\n\nBy \n\nOrhan Coskun\n\n, \n\nEce Toksabay\n\n and \n\nAli Kucukgocmen\n\nSummary', metadata={'source': '0'}),
  Document(page_content="ERDOGAN HAS EDGE\n\nThe results reflected deep polarization in a country at a political crossroads. The vote was set to hand Erdogan's ruling alliance a majority in parliament, giving him a potential edge heading into the runoff.", metadata={'source': '12'}),
  Document(page_content="With almost 97% of ballot boxes counted, Erdogan led with 49.39% of votes and Kilicdaroglu had 44.

In [None]:
# Use zero shot agent in with tool of article_retriever.get_relevant_documents() to determine whether original article contains political bias in comparison to relavent data from other articles