In [15]:
from bs4 import BeautifulSoup
from typing import Any, List
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import tool, Tool
from langchain_core.output_parsers.json import JsonOutputParser
from langchain.agents.react.output_parser import ReActOutputParser
from langchain.tools.render import render_text_description
from pprint import pprint

import pandas as pd
import requests 
import os
from dotenv import load_dotenv
import time
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY_PERSONAL"]

In [2]:
@tool
def scrape_website_table(url: str) -> str:
    """
        This function scrapes website and return only the html table section
    """
    # NOTE: the docstring above is kept verbatim on purpose: it is rendered
    # into the tool description that is handed to the LLM.
    #
    # Behaviour:
    #   * Fetches `url` with a browser-like User-Agent and a 10 s timeout.
    #   * Retries the fetch up to `max_attempts` times on any request error
    #     (connection problems, timeouts, non-2xx status codes).
    #   * Returns the text of every <table> element joined by single spaces.
    #   * Never raises: returns " " when all attempts fail or parsing fails,
    #     which is the sentinel the original implementation used.
    max_attempts = 3
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=10, headers=headers)
            # Turns 4xx/5xx responses into exceptions so they are retried too.
            response.raise_for_status()
            break
        except requests.RequestException as e:
            response = None
            print("Inside scrape_website, Error fetching the content", e)
            if attempt < max_attempts:
                print("Inside scrape_website, Retrying to scrape")

    if response is None:
        # Every attempt failed.
        return " "

    try:
        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")
        full_text = " ".join([table.get_text() for table in tables])
    except Exception as e:
        # Parsing problems are not worth retrying; keep the "never raise" contract.
        print("Inside scrape_website, Error fetching the content", e)
        return " "

    print("\n\nInside scrape_website, Returning scraped source_code\n\n")
    return full_text

In [3]:
def find_function(tool_list, tool_name):
    """Return the first tool in ``tool_list`` whose ``name`` equals ``tool_name``.

    Args:
        tool_list: Iterable of objects exposing a ``name`` attribute.
        tool_name: Name to look up.

    Returns:
        The first matching object from ``tool_list``.

    Raises:
        ValueError: If no object in ``tool_list`` has that name.
    """
    not_found = object()  # unique sentinel so any real element can be returned
    match = next(
        (candidate for candidate in tool_list if tool_name == candidate.name),
        not_found,
    )
    if match is not_found:
        raise ValueError("No Such tool found")
    return match

In [4]:
# ReAct-style prompt template. `{tools}` and `{tool_names}` are pre-filled
# below through `partial_variables`; `{input}` and `{agent_scratchpad}` are
# left as placeholders to be supplied when the prompt is formatted.
# The template additionally instructs the model to answer in JSON.
react_prompt = """
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Return the response and `Final Answer` only in json format.
If there is a table, return the table in json format

Begin!

Question: {input}
Thought:{agent_scratchpad}
"""
# Tools offered to the model; their rendered descriptions and names are
# injected into the prompt below.
tools = [scrape_website_table]
# Chat model used for every agent step (temperature 0, model "gpt-4o").
llm = ChatOpenAI(temperature=0, model="gpt-4o")
# Bind the tool description text and the comma-separated tool names once,
# so only `input` and `agent_scratchpad` remain to be filled per call.
prompt = PromptTemplate.from_template(react_prompt, 
                                      partial_variables = {"tools" : render_text_description(tools), 
                                                          "tool_names" : ", ".join([t.name for t in tools])}
                                      )

In [5]:
# Agent pipeline: pick the two prompt inputs out of the invocation payload,
# format the prompt, call the chat model, then parse its reply as JSON.
agent = (
    dict(
        input=lambda payload: payload["input"],
        agent_scratchpad=lambda payload: payload["agent_scratchpad"],
    )
    | prompt
    | llm
    | JsonOutputParser()
)

In [6]:
def setlogs(sample_agent, step):
    """Print one agent step framed by a starred header and footer.

    Args:
        sample_agent: The object to print (printed with its default ``str``).
        step: Step number shown in the header as ``Step<step>``.

    Returns:
        None. The function only writes to standard output.
    """
    stars = "*" * 8
    print(f"{stars} Step{step} {stars}")
    print(sample_agent)
    # Trailing space plus two newlines reproduce the blank lines after the footer.
    print(f"{stars} End {stars} \n\n")

In [30]:
def write_to_disk(sample_data, key="table"):
    """Convert ``sample_data`` to a DataFrame and save it as a CSV file.

    The file is written to ``output_data/<key>_<unix-seconds>.csv`` relative
    to the current working directory. The ``output_data`` directory is created
    when it does not exist yet.

    Args:
        sample_data: Anything accepted by ``pd.DataFrame(...)``.
        key: Prefix for the file name. Path separators in it are replaced by
            ``_`` so the file always lands directly inside ``output_data``.

    Returns:
        The DataFrame that was written.

    Raises:
        Whatever ``pd.DataFrame`` or ``DataFrame.to_csv`` raise for invalid
        data or an unwritable location.
    """
    data = pd.DataFrame(sample_data)

    output_dir = "output_data"
    # Without this, to_csv fails on a fresh checkout where the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Keys can come from model output; keep them from escaping the output folder.
    safe_key = str(key).replace("/", "_").replace("\\", "_")
    time_identifier = str(int(time.time()))
    path = f"{output_dir}/{safe_key}_{time_identifier}.csv"

    data.to_csv(path, index=False)
    print(f"\n\nOutput file at {path}\n\n")
    return data

In [31]:
def get_dataframe(dict_sample):
    """Persist the table(s) found under ``dict_sample['Final Answer']``.

    * If the final answer is a dict, every value is written to its own CSV via
      ``write_to_disk(value, key)``. A failure on one entry is reported and the
      remaining entries are still processed.
    * If the final answer is a list, it is written as a single table.
    * Any other type is ignored.

    Args:
        dict_sample: Mapping that contains a ``'Final Answer'`` entry.

    Returns:
        The DataFrame returned by the last successful ``write_to_disk`` call,
        or ``""`` when nothing was written.

    Raises:
        KeyError: If ``dict_sample`` has no ``'Final Answer'`` entry.
    """
    response, data = dict_sample['Final Answer'], ""

    if isinstance(response, dict):
        for key, dict_df in response.items():
            # Handle each table separately so one malformed table does not
            # prevent the others from being saved.
            try:
                data = write_to_disk(dict_df, key)
            except Exception as e:
                print("This exception")
                print(f"Error Occured in parsing the results for {key}", e)
                continue
            # Pause between writes, as the original loop did after each write.
            time.sleep(1)

    elif isinstance(response, list):
        try:
            data = write_to_disk(response)
        except Exception as e:
            print("Error Occured in parsing the results", e)

    return data

In [19]:
tool_name

'scrape_website_table'

In [20]:
# Drive the agent until its parsed reply contains a "Final Answer".
# Each iteration sends the question plus the accumulated scratchpad to the
# agent, logs the reply, runs the requested tool (if any) and records the
# step together with the tool's observation.
MAX_AGENT_STEPS = 10  # hard stop so a reply without "Final Answer" cannot loop forever
question = "Extract me the table from URL: https://tradingeconomics.com/country-list/inflation-rate"

agent_step, intermediate_step, step = "", [], 0
while ("Final Answer" not in agent_step):
    if step >= MAX_AGENT_STEPS:
        raise RuntimeError(
            f"Agent produced no 'Final Answer' within {MAX_AGENT_STEPS} steps"
        )

    observation = None
    agent_step = agent.invoke(
        {
            "input": question,
            "agent_scratchpad": intermediate_step
        }
    )
    step += 1
    setlogs(agent_step, step)
    if ("Action" in agent_step) and ("Action Input" in agent_step):
        tool_name = agent_step['Action']
        tool_input = agent_step['Action Input']
        # Named `selected_tool` (not `tool`) so the imported `tool` decorator
        # is not overwritten in the notebook namespace.
        try:
            selected_tool = find_function(tools, tool_name)
        except ValueError as err:
            # The model asked for a tool that does not exist: report that back
            # as the observation instead of aborting the whole run.
            available = ", ".join([t.name for t in tools])
            observation = f"Error: {err}: {tool_name!r}. Available tools: {available}"
        else:
            observation = selected_tool.func(tool_input)
        print(observation)

    if observation:
        intermediate_step.append([agent_step, observation])
    else:
        intermediate_step.append([agent_step])


#get_dataframe(agent_step)

******** Step1 ********
{'Thought': 'I need to extract the table from the given URL.', 'Action': 'scrape_website_table', 'Action Input': 'https://tradingeconomics.com/country-list/inflation-rate'}
******** End ******** 




Inside scrape_website, Returning scraped source_code





Country
Last 
Previous 
Reference
 Unit





                                        Argentina
                                    
47.3
55.9
Apr/25
%




                                        Australia
                                    
2.4
2.4
Mar/25
%




                                        Brazil
                                    
5.53
5.48
Apr/25
%




                                        Canada
                                    
1.7
2.3
Apr/25
%




                                        China
                                    
-0.1
-0.1
Apr/25
%




                                        Euro Area
                                    
2.2
2.2
Apr/25
%




                             

In [32]:
get_dataframe(agent_step)



Output file at output_data/table_1748720418.csv




Unnamed: 0,Country,Last,Previous,Reference,Unit
0,Argentina,47.3,55.9,Apr/25,%
1,Australia,2.4,2.4,Mar/25,%
2,Brazil,5.53,5.48,Apr/25,%
3,Canada,1.7,2.3,Apr/25,%
4,China,-0.1,-0.1,Apr/25,%
5,Euro Area,2.2,2.2,Apr/25,%
6,France,0.7,0.8,May/25,%
7,Germany,2.1,2.1,May/25,%
8,India,3.16,3.34,Apr/25,%
9,Indonesia,1.95,1.03,Apr/25,%


In [14]:
pd.DataFrame(agent_step["Final Answer"])

Unnamed: 0,Country,Last,Previous,Reference,Unit
0,Argentina,47.3,55.9,Apr/25,%
1,Australia,2.4,2.4,Mar/25,%
2,Brazil,5.53,5.48,Apr/25,%
3,Canada,1.7,2.3,Apr/25,%
4,China,-0.1,-0.1,Apr/25,%
5,Euro Area,2.2,2.2,Apr/25,%
6,France,0.7,0.8,May/25,%
7,Germany,2.1,2.1,May/25,%
8,India,3.16,3.34,Apr/25,%
9,Indonesia,1.95,1.03,Apr/25,%


In [18]:
agent_step['Final Answer']

{'Calendar': [{'Date': '2025-04-15',
   'Time': '10:30 AM',
   'Event': 'Inflation Rate YoY',
   'Reference': 'Mar',
   'Actual': '3.34%',
   'Previous': '3.61%',
   'Consensus': '3.60%',
   'TEForecast': '3.9%'},
  {'Date': '2025-05-13',
   'Time': '10:30 AM',
   'Event': 'Inflation Rate YoY',
   'Reference': 'Apr',
   'Actual': '3.16%',
   'Previous': '3.34%',
   'Consensus': '3.27%',
   'TEForecast': '3.1%'},
  {'Date': '2025-06-12',
   'Time': '10:30 AM',
   'Event': 'Inflation Rate YoY',
   'Reference': 'May',
   'Actual': '3.16%'}],
 'Related': [{'Indicator': 'Consumer Price Index CPI',
   'Last': '192.60',
   'Previous': '192.00',
   'Unit': 'points',
   'Reference': 'Apr 2025'},
  {'Indicator': 'CPI Housing Utilities',
   'Last': '185.30',
   'Previous': '183.60',
   'Unit': 'points',
   'Reference': 'Apr 2025'},
  {'Indicator': 'CPI Transportation',
   'Last': '172.50',
   'Previous': '172.30',
   'Unit': 'points',
   'Reference': 'Apr 2025'},
  {'Indicator': 'Food Inflation',

In [20]:
pd.DataFrame(agent_step)

{'Final Answer': {'Schedule of Releases for the Producer Price Index': [{'Reference Month': 'October 2024',
    'Release Date': 'Nov. 14, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'November 2024',
    'Release Date': 'Dec. 12, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'December 2024',
    'Release Date': 'Jan. 14, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'January 2025',
    'Release Date': 'Feb. 13, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'February 2025',
    'Release Date': 'Mar. 13, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'March 2025',
    'Release Date': 'Apr. 11, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'April 2025',
    'Release Date': 'May 15, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'May 2025',
    'Release Date': 'Jun. 12, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'June 2025',
    'Release Date': 'Jul. 16,

In [10]:
pd.DataFrame(agent_step["Final Answer"]["table"])

Unnamed: 0,Reference Month,Release Date,Release Time
0,October 2024,"Nov. 14, 2024",08:30 AM
1,November 2024,"Dec. 12, 2024",08:30 AM
2,December 2024,"Jan. 14, 2025",08:30 AM
3,January 2025,"Feb. 13, 2025",08:30 AM
4,February 2025,"Mar. 13, 2025",08:30 AM
5,March 2025,"Apr. 11, 2025",08:30 AM
6,April 2025,"May 15, 2025",08:30 AM
7,May 2025,"Jun. 12, 2025",08:30 AM
8,June 2025,"Jul. 16, 2025",08:30 AM
9,July 2025,"Aug. 14, 2025",08:30 AM


In [35]:
intermediate_step

[[{'Thought': 'I need to scrape the website to extract the table from the given URL.',
   'Action': 'scrape_website',
   'Action Input': 'https://www.bls.gov/schedule/news_release/cpi.htm#'},
  "\n\n\n\n\n\n\n\nBY MONTH\n\n\nJANUARY 2025\nFEBRUARY 2025\nMARCH 2025\nAPRIL 2025\nMAY 2025\nJUNE 2025\nJULY 2025\nAUGUST 2025\nSEPTEMBER 2025\nOCTOBER 2025\nNOVEMBER 2025\nDECEMBER 2025\nENTIRE YEAR, 2025\nPRIOR YEARS\n\n\n\n\n\n\nBY NEWS RELEASE\nBUSINESS EMPLOYMENT DYNAMICS \nCONSUMER PRICE INDEX \nCOUNTY EMPLOYMENT AND WAGES\nEMPLOYMENT COST INDEX \nEMPLOYMENT SITUATION \nJOB OPENINGS AND LABOR TURNOVER \nPRODUCER PRICE INDEX \nPRODUCTIVITY AND COSTS \nREAL EARNINGS \nU.S. EXPORT AND IMPORT PRICE INDEXES \nMETROPOLITAN AREA EMPLOYMENT AND UNEMPLOYMENT \nSTATE EMPLOYMENT AND UNEMPLOYMENT \n\n\n\n\n\n\n\n\n\n\n\nSchedule of Releases for the Consumer Price Index\n\n\n\n\nReference Month\nRelease Date\nRelease Time\n\n\n\n\nOctober 2024\nNov. 13, 2024\n08:30 AM\n\n\nNovember 2024\nDec. 11, 2024

In [18]:
#if not isinstance(agent_step, AgentAction):
# Manually run the tool requested by the current `agent_step`.
# The looked-up tool is bound to `selected_tool` rather than `tool`, so the
# `tool` decorator imported at the top of the notebook stays usable.
tool_name = agent_step['Action']
tool_input = agent_step['Action Input']
selected_tool = find_function(tools, tool_name)
observation = selected_tool.func(tool_input)

In [22]:
agent_step

{'Thought': 'I need to scrape the website to extract the table from the given URL.',
 'Action': 'scrape_website',
 'Action Input': 'https://www.bls.gov/schedule/news_release/cpi.htm#'}

In [20]:
intermediate_step.append([agent_step, observation])

In [24]:
agent_step = agent.invoke(
    {
        "input": "Extract me the table from URL: https://www.bls.gov/schedule/news_release/cpi.htm#",
        "agent_scratchpad": intermediate_step
    }
)

In [25]:
intermediate_step.append([agent_step, observation])

In [27]:
agent_step = agent.invoke(
    {
        "input": "Extract me the table from URL: https://www.bls.gov/schedule/news_release/cpi.htm#",
        "agent_scratchpad": intermediate_step
    }
)

In [28]:
agent_step

{'Thought': 'I need to scrape the website to extract the table from the given URL.',
 'Action': 'scrape_website',
 'Action Input': 'https://www.bls.gov/schedule/news_release/cpi.htm#',
 'Observation': 'The table has been successfully extracted.',
 'Final Answer': {'table': [{'Reference Month': 'October 2024',
    'Release Date': 'Nov. 13, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'November 2024',
    'Release Date': 'Dec. 11, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'December 2024',
    'Release Date': 'Jan. 15, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'January 2025',
    'Release Date': 'Feb. 12, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'February 2025',
    'Release Date': 'Mar. 12, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'March 2025',
    'Release Date': 'Apr. 10, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'April 2025',
    'Release Date': 'May 13, 2025',


In [26]:
agent_step

{'Thought': 'I have successfully scraped the website and extracted the table.',
 'Final Answer': {'table': [{'Reference Month': 'October 2024',
    'Release Date': 'Nov. 13, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'November 2024',
    'Release Date': 'Dec. 11, 2024',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'December 2024',
    'Release Date': 'Jan. 15, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'January 2025',
    'Release Date': 'Feb. 12, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'February 2025',
    'Release Date': 'Mar. 12, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'March 2025',
    'Release Date': 'Apr. 10, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'April 2025',
    'Release Date': 'May 13, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'May 2025',
    'Release Date': 'Jun. 11, 2025',
    'Release Time': '08:30 AM'},
   {'Reference Month': 'June 

In [85]:
intermediate_step

[[{'Thought': 'I need to scrape the website to extract the table from the given URL.',
   'Action': 'scrape_website',
   'Action Input': 'https://www.bls.gov/schedule/news_release/cpi.htm#'},
  "\n\n\n\n\n\n\n\nBY MONTH\n\n\nJANUARY 2025\nFEBRUARY 2025\nMARCH 2025\nAPRIL 2025\nMAY 2025\nJUNE 2025\nJULY 2025\nAUGUST 2025\nSEPTEMBER 2025\nOCTOBER 2025\nNOVEMBER 2025\nDECEMBER 2025\nENTIRE YEAR, 2025\nPRIOR YEARS\n\n\n\n\n\n\nBY NEWS RELEASE\nBUSINESS EMPLOYMENT DYNAMICS \nCONSUMER PRICE INDEX \nCOUNTY EMPLOYMENT AND WAGES\nEMPLOYMENT COST INDEX \nEMPLOYMENT SITUATION \nJOB OPENINGS AND LABOR TURNOVER \nPRODUCER PRICE INDEX \nPRODUCTIVITY AND COSTS \nREAL EARNINGS \nU.S. EXPORT AND IMPORT PRICE INDEXES \nMETROPOLITAN AREA EMPLOYMENT AND UNEMPLOYMENT \nSTATE EMPLOYMENT AND UNEMPLOYMENT \n\n\n\n\n\n\n\n\n\n\n\nSchedule of Releases for the Consumer Price Index\n\n\n\n\nReference Month\nRelease Date\nRelease Time\n\n\n\n\nOctober 2024\nNov. 13, 2024\n08:30 AM\n\n\nNovember 2024\nDec. 11, 2024

In [77]:
import pandas as pd

In [78]:
pd.DataFrame(agent_step['Final Answer']['table'])

Unnamed: 0,Reference Month,Release Date,Release Time
0,October 2024,"Nov. 13, 2024",08:30 AM
1,November 2024,"Dec. 11, 2024",08:30 AM
2,December 2024,"Jan. 15, 2025",08:30 AM
3,January 2025,"Feb. 12, 2025",08:30 AM
4,February 2025,"Mar. 12, 2025",08:30 AM
5,March 2025,"Apr. 10, 2025",08:30 AM
6,April 2025,"May 13, 2025",08:30 AM
7,May 2025,"Jun. 11, 2025",08:30 AM
8,June 2025,"Jul. 15, 2025",08:30 AM
9,July 2025,"Aug. 12, 2025",08:30 AM


In [19]:
agent_step

AgentAction(tool='scrape_website', tool_input='https://www.bls.gov/schedule/news_release/cpi.htm#', log='Sure, I will extract the table from the given URL using the scrape_website tool.\n\nAction: scrape_website\nAction Input: "https://www.bls.gov/schedule/news_release/cpi.htm#"')

In [42]:
tools = [scrape_website]
render_text_description(tools)

'scrape_website(url: str) -> Any - This function scrapes website and return only the html table section'

In [44]:
", ".join([t.name for t in tools])

'scrape_website'

In [8]:
print(react_prompt)

Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


In [22]:
source_code = await scrape_website(url="https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm")

In [25]:
source_code

﻿<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1" name="viewport"/>
<meta content="Board of Governors of the Federal Reserve System, Federal Reserve Board of Governors, Federal Reserve Board, Federal Reserve" name="keywords"/>
<meta content="The Federal Reserve Board of Governors in Washington DC." name="description"/>
<meta content="article" property="og:type"/>
<meta content="Meeting calendars and information" property="og:title"/>
<meta content="https://www.federalreserve.gov/images/social-media/social-default-image-opengraph.jpg" property="og:image"/>
<meta content="Board of Governors of the Federal Reserve System" property="og:image:alt"/>
<meta content="The Federal Reserve Board of Governors in Washington DC." property="og:description"/>
<meta content="https://www.federalreserve.gov/monetary

In [17]:
await scrape_website(url="https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm")

''

In [18]:
import requests

# Fetch the page with a browser-like User-Agent, save the body to
# sample_file.txt and echo the status code and body.
url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Explicit UTF-8 so the write does not depend on the platform default encoding.
with open("sample_file.txt", "w", encoding="utf-8") as fo:
    print(response.text, file=fo)

print(response.status_code)
print(response.text)

200
ï»¿<!doctype html>
<html lang="en" class="no-js">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
        <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1"/>
        <meta name="keywords" content="Board of Governors of the Federal Reserve System, Federal Reserve Board of Governors, Federal Reserve Board, Federal Reserve" />
        <meta name="description" content="The Federal Reserve Board of Governors in Washington DC." />
        <meta property="og:type" content="article" /> 
        <meta property="og:title" content="Meeting calendars and information"/>    
        <meta property="og:image" content="https://www.federalreserve.gov/images/social-media/social-default-image-opengraph.jpg" />
        <meta property="og:image:alt" content="Board of Governors of the Federal Reserve System" />
        <meta property="og:description" content="The Federal Res

In [6]:
requests.get("https://www.bls.gov/schedule/news_release/cpi.htm#")

<Response [403]>