In [1]:
import langchain
import requests
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.document_loaders import UnstructuredURLLoader
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from termcolor import colored
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

In [2]:
# How many similar articles to use as reference
SIMILAR_COUNT = 3
# Original article url
ARTICLE_URL = "https://www.reuters.com/world/middle-east/turkey-votes-pivotal-elections-that-could-end-erdogans-20-year-rule-2023-05-13/"

In [3]:
response = requests.get(ARTICLE_URL)
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title').get_text().split('|')[0]

In [4]:
serp_tool = GoogleSerperAPIWrapper(tbs="qdr:m")

In [5]:
similar_articles_serp = serp_tool.results(f"{title} news articles")

In [19]:
similar_articles_final = []
for article_meta in similar_articles_serp["organic"]:
    link = article_meta["link"]
    if not link.startswith('https://www.youtube.com') and not link.startswith('https://youtube.com'):
        similar_articles_final.append(link)
        if len(similar_articles_final) == SIMILAR_COUNT:
            break
print(similar_articles_final)

['https://www.reuters.com/world/middle-east/turkey-votes-pivotal-elections-that-could-end-erdogans-20-year-rule-2023-05-13/', 'https://nationalpost.com/pmn/news-pmn/turkey-faces-runoff-election-as-erdogan-survives-first-test-but-remains-at-risk', 'https://gazette.com/news/us-world/turkey-faces-runoff-election-as-erdogan-survives-first-test/article_1cce87e4-06f1-5378-9084-6330decef2b2.html']


In [7]:
total_articles = [ARTICLE_URL]
total_articles.extend(similar_articles_final)

In [8]:
loader = UnstructuredURLLoader(urls=total_articles)
articles_data = loader.load()

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 250,
    chunk_overlap  = 30,
    length_function = len
)

In [10]:
article_documents = []
for article_content in articles_data[1:]:
    article_documents.extend(text_splitter.create_documents([article_content.page_content]))

In [11]:
primary_documents = text_splitter.create_documents([articles_data[0].page_content])

In [12]:
embeddings = OpenAIEmbeddings()
article_db = FAISS.from_documents(article_documents, embeddings)

In [13]:
article_retriever = article_db.as_retriever()

In [18]:
article_retriever.get_relevant_documents("who is the current the current president of Turkey?")

[Document(page_content='CAGLA GURDOGAN\n\nTurkish President Tayyip Erdogan holds a present for supporters ahead of the May 14 presidential and parliamentary elections, in Istanbul, Turkey May 13, 2023. REUTERS/Dylan Martinez\n\nDYLAN MARTINEZ', metadata={}),
 Document(page_content="Kemal Kilicdaroglu, presidential candidate of Turkey's main opposition alliance, waves next to his wife Selvi Kilicdaroglu as he leaves the polling station after voting during the presidential and parliamentary elections, in Ankara, Turkey May 14,", metadata={}),
 Document(page_content='CAGLA GURDOGAN\n\nA person casts a ballot during the presidential and parliamentary elections in Istanbul, Turkey May 14, 2023. REUTERS/Dilara Senkaya\n\nDILARA SENKAYA', metadata={}),
 Document(page_content='CAGLA GURDOGAN\n\nFILE PHOTO: Turkish President Tayyip Erdogan addresses his supporters during a rally ahead of the May 14 presidential and parliamentary elections, in Ankara, Turkey April 30, 2023. REUTERS/Cagla Gurdoga

In [15]:
# TODO: LLM/Agent reads through article, whenever encountering document with potential misinformation/skewed narrative,
# use get_relevant_documents tool to compare against other articles and determine if a bias is present.

In [16]:
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.0)