## **Installation Of Required Library**

In [2]:
!pip install comet_llm
!pip install nest_asyncio langchain-openai langchain_community
!pip install nest_asyncio langchain-openai langchain_community
!pip install playwright
!playwright install

Collecting comet_llm
  Downloading comet_llm-2.2.4-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting comet-ml>=3.40.0 (from comet_llm)
  Downloading comet_ml-3.42.0-py3-none-any.whl (663 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m663.1/663.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flatten-dict (from comet_llm)
  Downloading flatten_dict-0.4.2-py2.py3-none-any.whl (9.7 kB)
Collecting types-requests (from comet_llm)
  Downloading types_requests-2.31.0.20240406-py3-none-any.whl (15 kB)
Collecting everett[ini]<3.2.0,>=1.0.1 (from comet-ml>=3.40.0->comet_llm)
  Downloading everett-3.1.0-py2.py3-none-any.whl (35 kB)
Collecting python-box<7.0.0 (from comet-ml>=3.40.0->comet_llm)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

## **Import Required Library**

In [8]:
from google.colab import userdata
import nest_asyncio
nest_asyncio.apply()

# Import required modules from langchain
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_extraction_chain
import time
import pandas as pd
import comet_llm

# Initialize a Comet project
comet_llm.init(project="langchain-web-scraping",
               api_key=MY_COMET_KEY,
               )

[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /content/drive/MyDrive/.comet.config (set COMET_CONFIG to change where it is saved).


To get the OpenAI API key : Go to https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key

To get the Comet API Key: Go to https://www.comet.com/docs/v2/api-and-sdk/rest-api/overview/

In [9]:
#api key acces
MY_OPENAI_KEY=userdata.get('MY_OPENAI_KEY')
MY_COMET_KEY=userdata.get('MY_COMET_KEY')

## **Web scrapping Using OpenAI GPT-3.5 Turbo LLM**








In [10]:
import nest_asyncio
import time
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_extraction_chain
from google.colab import userdata
import comet_llm

nest_asyncio.apply()

MY_OPENAI_KEY = "YOUR_OPENAI_KEY"
MY_COMET_KEY = "YOUR_COMET_KEY"

comet_llm.init(project="langchain-web-scraping", api_key=MY_COMET_KEY)

html_tags = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p",
    "span",
    "div",
    "ul", "ol", "li",
    "table", "tr", "th", "td",
    "a",
    "b", "strong",
    "i", "em",
    "blockquote", "q", "cite",
    "code", "pre",
    "form", "input", "textarea", "label",
    "dl", "dt", "dd",
    "article",
    "section",
    "nav",
    "aside",
    "header",
    "footer",
    "main",
    "figure", "figcaption",
    "details", "summary",
    "mark",
    "time"
]

def extract_url(url):
    print(url)
    url_loader = AsyncChromiumLoader([url])
    url_docs = url_loader.load()
    bs_transformer = BeautifulSoupTransformer()
    url_transfornm = bs_transformer.transform_documents(
        url_docs, tags_to_extract=["a"]
    )
    llm = ChatOpenAI(openai_api_key=MY_OPENAI_KEY)
    url_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=15000, chunk_overlap=0)
    url_splits = url_splitter.split_documents(url_transfornm)

    url_schema = {
        "properties": {
            "url": {"type": "string"},
        },
        "required": ["url"],
    }
    url_list = []

    if len(url_splits) > 0:
        start_time = time.time()
        extracted_content = create_extraction_chain(schema=url_schema, llm=llm).run(url_splits[0].page_content)
        end_time = time.time()
        comet_llm.log_prompt(
            prompt=str(url_splits[0].page_content),
            metadata={"schema": url_schema},
            output=extracted_content,
            duration=end_time - start_time,
        )
        url_list = [d['url'] for d in extracted_content]

    return url_list

def extract_content(urls):
    contents = []
    for url in urls:
        url_loader = AsyncChromiumLoader([url])
        content_docs = url_loader.load()
        bs_transformer = BeautifulSoupTransformer()
        content_transfornm = bs_transformer.transform_documents(content_docs, tags_to_extract=html_tags)
        content = content_transfornm[0].page_content
        contents.append(content)
    return contents

class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f'Document(page_content="{self.page_content},metadata={self.metadata} ")'

def generate_output(resulted_content):
    llm = ChatOpenAI(openai_api_key=MY_OPENAI_KEY)
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=15000, chunk_overlap=0)
    splits = splitter.split_documents(resulted_content)
    schema = {
        "properties": {
            "original_id": {"type": "string", "description": "Unique from source"},
            "aug_id": {"type": "string", "description": "Augmented identifier from the context"},
            "country_name": {"type": "string", "description": "Name of the Country"},
            "country_code": {"type": "string", "description": "ISO 3-letter Country Code"},
            "map_coordinates": {
                "type": "object",
                "description": "Geo Point of the region formatted as {'type': 'Point', 'coordinates': [longitude, latitude]}",
                "properties": {"type": {"type": "string"}, "coordinates": {"type": "array", "items": {"type": "number"}}},
            },
            "url": {"type": "string", "description": "Url of the website of the source", "format": "uri"},
            "region_name": {"type": "string", "description": "Region Name for a Country according to World Bank Standards"},
            "region_code": {"type": "string", "description": "Region code for a Region according to World Bank Standards"},
            "Project_title": {"type": "string", "description": "A title for this tender/project used as a headline"},
            "Project_description": {"type": "string", "description": "A summary description of the tender/project"},
            "status": {"type": "string", "description": "The current status of the tender/project from the closed tenderStatus codelist"},
            "stages": {"type": "string", "description": "Stages of the tender/project"},
            "date": {"type": "string", "description": "The date on which the information was first recorded or published", "format": "date"},
            "procurementMethod": {"type": "string", "description": "The procedure used to purchase the relevant works, goods or services"},
            "budget": {"type": "number", "description": "The total upper estimated value of the procurement"},
            "currency": {"type": "string", "description": "The currency for each amount specified using the uppercase 3-letter code from ISO4217"},
            "buyer": {"type": "string", "description": "Entity whose budget will be used to pay for related goods, works or services"},
            "sector": {"type": "string", "description": "A high-level categorization of the main sector this procurement process relates to"},
            "subsector": {"type": "string", "description": "A further subdivision of the sector the procurement process belongs to"},
        },
        "required": [
            "original_id", "aug_id", "country_name", "country_code", "map_coordinates", "url",
            "region_name", "region_code", "title", "description", "status", "stages", "date",
            "procurementMethod", "budget", "currency", "buyer", "sector", "subsector"
        ]
    }

    if len(splits) > 0:
        start_time = time.time()
        extracted_content = create_extraction_chain(schema=schema, llm=llm).run(splits[0].page_content)
        end_time = time.time()

        comet_llm.log_prompt(
            prompt=str(splits[0].page_content),
            metadata={"schema": schema},
            output=extracted_content,
            duration=end_time - start_time,
        )

    return extracted_content






[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /content/drive/MyDrive/.comet.config (set COMET_CONFIG to change where it is saved).


**Scraping  the data from the given URL and reseachered and found URL for construction and infrastructure projects and tenders in the state of California.**

In [11]:
Given_12url_list=['https://www.ci.richmond.ca.us/1404/Major-Projects', 'https://www.bakersfieldcity.us/518/Projects-Programs', 'https://www.cityofwasco.org/311/Current-Projects', 'https://www.eurekaca.gov/744/Upcoming-Projects', 'https://www.cityofarcata.org/413/Current-City-Construction-Projects', 'https://www.mckinleyvillecsd.com/news-and-project-updates', 'https://www.cityofsanrafael.org/major-planning-projects-2/', 'https://www.novato.org/government/community-development/planning-division/planning-projects?locale=en', 'https://www.cityofmillvalley.org/258/Projects', 'https://riversideca.gov/utilities/projects', 'https://www.moval.org/cdd/documents/about-projects.html', 'https://www.coronaca.gov/government/departments-divisions/department-of-water-and-power/construction', 'http://www.cityofsacramento.org/public-works/engineering-services/projects']
own_8url_list=['https://data.ca.gov/dataset/?q=construction+and+infrastructure+projects&sort=score+desc%2C+metadata_modified+desc', 'https://dot.ca.gov/programs/procurement-and-contracts/bid-opportunities', 'https://www.cityofarcata.org/413/Current-City-Construction-Projects', 'https://www.cityofsanrafael.org/major-planning-projects-2/', 'https://www.elkgrovecity.org/southeast-policy-area/development-projects', 'https://www.fluor.com/market-reach/industries/infrastructure', 'https://www.fluor.com/projects', 'https://www.toaks.org/departments/public-works/construction']
Final_list=Given_12url_list + own_8url_list

In [12]:
print(Final_list)
print(len(Final_list))

['https://www.ci.richmond.ca.us/1404/Major-Projects', 'https://www.bakersfieldcity.us/518/Projects-Programs', 'https://www.cityofwasco.org/311/Current-Projects', 'https://www.eurekaca.gov/744/Upcoming-Projects', 'https://www.cityofarcata.org/413/Current-City-Construction-Projects', 'https://www.mckinleyvillecsd.com/news-and-project-updates', 'https://www.cityofsanrafael.org/major-planning-projects-2/', 'https://www.novato.org/government/community-development/planning-division/planning-projects?locale=en', 'https://www.cityofmillvalley.org/258/Projects', 'https://riversideca.gov/utilities/projects', 'https://www.moval.org/cdd/documents/about-projects.html', 'https://www.coronaca.gov/government/departments-divisions/department-of-water-and-power/construction', 'http://www.cityofsacramento.org/public-works/engineering-services/projects', 'https://data.ca.gov/dataset/?q=construction+and+infrastructure+projects&sort=score+desc%2C+metadata_modified+desc', 'https://dot.ca.gov/programs/procure

In [13]:
finalDict=[]
for url in Final_list:
  url_list=[]
  url_list=extract_url(url)
  url_list.append(url)

  output_content=extract_content(url_list)
  concatenated_content = ' '.join(output_content)
  merged_document = Document(page_content=concatenated_content,metadata="")
  resulted_content = [merged_document]


  final_output=generate_output(resulted_content)
  if(len(final_output)>0):
    finalDict.append(final_output[0])



https://www.ci.richmond.ca.us/1404/Major-Projects
https://www.bakersfieldcity.us/518/Projects-Programs


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.cityofwasco.org/311/Current-Projects


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.eurekaca.gov/744/Upcoming-Projects


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.cityofarcata.org/413/Current-City-Construction-Projects
https://www.mckinleyvillecsd.com/news-and-project-updates
https://www.cityofsanrafael.org/major-planning-projects-2/
https://www.novato.org/government/community-development/planning-division/planning-projects?locale=en
https://www.cityofmillvalley.org/258/Projects
https://riversideca.gov/utilities/projects


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.moval.org/cdd/documents/about-projects.html


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.coronaca.gov/government/departments-divisions/department-of-water-and-power/construction
http://www.cityofsacramento.org/public-works/engineering-services/projects
https://data.ca.gov/dataset/?q=construction+and+infrastructure+projects&sort=score+desc%2C+metadata_modified+desc


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://dot.ca.gov/programs/procurement-and-contracts/bid-opportunities
https://www.cityofarcata.org/413/Current-City-Construction-Projects


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.cityofsanrafael.org/major-planning-projects-2/
https://www.elkgrovecity.org/southeast-policy-area/development-projects
https://www.fluor.com/market-reach/industries/infrastructure


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.fluor.com/projects


  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")
  soup = BeautifulSoup(html_content, "html.parser")


https://www.toaks.org/departments/public-works/construction


In [14]:
finalDict

[{'original_id': '#ccf5602c1c-2e20-4773-89f3-08f921ac655f',
  'aug_id': 'Mathieu Court Alley Play Street',
  'country_name': 'United States',
  'country_code': 'USA',
  'map_coordinates': {'type': 'Point',
   'coordinates': [-122.344475, 37.933013]},
  'url': 'https://www.ci.richmond.ca.us/4486/Travel-Safe-Richmond',
  'region_name': 'North America',
  'region_code': 'NA',
  'Project_title': 'Mathieu Court Alley Play Street',
  'Project_description': 'The City of Richmond proposes to construct the Richmond-Greenway Gap Closure Project, a multi-use trail connection in Richmond, California. The Richmond-Ohlone Greenway Gap Closure Project represents the third phase of the Richmond Greenway Project, which would provide a continuous bicycle and pedestrian pathway in Richmond from Garrard Boulevard and the Richmond Parkway to San Pablo Avenue. The Richmond Greenway would connect the San Francisco Bay Trail at the west with the Ohlone Greenway in the City of El Cerrito at the east. The propo

# **ouput Dictionary to Dataframe**

In [15]:
df=pd.DataFrame(finalDict)

In [16]:
df

Unnamed: 0,original_id,aug_id,country_name,country_code,map_coordinates,url,region_name,region_code,Project_title,Project_description,status,stages,date,procurementMethod,budget,currency,buyer,sector,subsector
0,#ccf5602c1c-2e20-4773-89f3-08f921ac655f,Mathieu Court Alley Play Street,United States,USA,"{'type': 'Point', 'coordinates': [-122.344475,...",https://www.ci.richmond.ca.us/4486/Travel-Safe...,North America,,Mathieu Court Alley Play Street,The City of Richmond proposes to construct the...,Ongoing,Construction,2023-02-07,Public Tender,,USD,City of Richmond,Infrastructure,Urban Development
1,1,About Google Translate,Google,GOO,"{'type': 'Point', 'coordinates': [37.422, -122...",https://translate.google.com/about/?hl=en-US,Tech,TEC,Google Translate,"Translate text, images, documents, and websites",active,Ongoing,2022-03-15,Online,100000.0,USD,Google,Technology,Translation
2,98,GIS-Specialist-55,United States,USA,"{'type': 'Point', 'coordinates': [-119.3528, 3...",https://www.cityofwasco.org/Jobs.aspx?UniqueId...,North America,,GIS Specialist,Install and maintain a variety of computer-bas...,Open,"Posted January 6, 2023 8:00 AM | Open Until Fi...",2023-01-06,Unknown,,Unknown,City of Wasco,Technology,Information Technology
3,1,1,United States,USA,"{'type': 'Point', 'coordinates': [-124.16367, ...",https://www.eurekaca.gov,,,Capital Improvement Program Report 2024,The City of Eureka’s Five-Year Capital Improve...,ongoing,planning,2024-01-01,planning,250000.0,USD,City of Eureka,Infrastructure,Public Works
4,1,cc8bd95fde-da5f-45d7-9e90-d6e2eb24437f,United States,USA,"{'type': 'Point', 'coordinates': [-124.0828, 4...",https://www.cityofarcata.org/856/Wastewater-Tr...,North America,,Arcata Wastewater Treatment Facility Improveme...,The City of Arcata proposes to construct upgra...,Ongoing,Phase 1,2021-07-01,Construction,54000000.0,USD,City of Arcata,Water and Sanitation,Wastewater Treatment
5,1,1,United States,USA,"{'type': 'Point', 'coordinates': [-124.1073, 4...",https://mckinleyvillecsd.ca.gov/,North America,,McKinleyville BMX Track and Park Construction,Request for Bids for the Construction of the M...,Open,Bids and RFPs,2024-05-23,Bids,,USD,McKinleyville Community Services District,Construction,BMX Track and Park
6,1,San Rafael,United States,USA,"{'type': 'Point', 'coordinates': [-122.5268, 3...",https://www.cityofsanrafael.org/,North America,,Planning Division,The Planning Division provides information on ...,active,ongoing,2024-05-09,,0.0,,City of San Rafael,Urban Planning,Land Use and Zoning
7,18.52532217.1716126980.5773ca1c,18.52532217.1716126980.5773ca1c,,,,,,,,,,,,,,,,,
8,cc9b263b9d-a7e2-4e23-b747-b6d686e76be0,27,Mill Valley,MVL,"{'type': 'Point', 'coordinates': [37.9062, -12...",https://www.cityofmillvalley.org/,,,,,,,,,,,,,
9,1,RPU,United States,USA,"{'type': 'Point', 'coordinates': [-117.396156,...",https://www.riversideca.gov/media-press-releas...,North America,,Riverside Public Utilities Recognized for Exce...,Riverside Public Utilities has been recognized...,closed,completed,2022-11-20,open,0.0,USD,Riverside Public Utilities,Utilities,Communications


# **Dataframe to Json**

In [17]:
df_json = df.to_json(orient='records', indent=4)

In [18]:
df_json

'[\n    {\n        "original_id":"#ccf5602c1c-2e20-4773-89f3-08f921ac655f",\n        "aug_id":"Mathieu Court Alley Play Street",\n        "country_name":"United States",\n        "country_code":"USA",\n        "map_coordinates":{\n            "type":"Point",\n            "coordinates":[\n                -122.344475,\n                37.933013\n            ]\n        },\n        "url":"https:\\/\\/www.ci.richmond.ca.us\\/4486\\/Travel-Safe-Richmond",\n        "region_name":"North America",\n        "region_code":"NA",\n        "Project_title":"Mathieu Court Alley Play Street",\n        "Project_description":"The City of Richmond proposes to construct the Richmond-Greenway Gap Closure Project, a multi-use trail connection in Richmond, California. The Richmond-Ohlone Greenway Gap Closure Project represents the third phase of the Richmond Greenway Project, which would provide a continuous bicycle and pedestrian pathway in Richmond from Garrard Boulevard and the Richmond Parkway to San Pabl

# **Save the JSON**

In [19]:
with open('/content/drive/MyDrive/Colab Notebooks/Scraped.json', 'w') as file:
    file.write(df_json)

# **Save the Dataframe as a CSV File**

In [20]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/Scraped_URL_Data.csv', index=False)

# **Run Streamlit web app in Collab notebook**

**Below is the code saved in Web_scrapping_streamlit.py.py. It is a streamlit web app for user interaction**

> Add blockquote



In [None]:
import nest_asyncio
import time
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_extraction_chain
from google.colab import userdata
import comet_llm
import streamlit as st

nest_asyncio.apply()

MY_OPENAI_KEY = ""
MY_COMET_KEY = ""

comet_llm.init(project="langchain-web-scraping", api_key=MY_COMET_KEY)

html_tags = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p",
    "span",
    "div",
    "ul", "ol", "li",
    "table", "tr", "th", "td",
    "a",
    "b", "strong",
    "i", "em",
    "blockquote", "q", "cite",
    "code", "pre",
    "form", "input", "textarea", "label",
    "dl", "dt", "dd",
    "article",
    "section",
    "nav",
    "aside",
    "header",
    "footer",
    "main",
    "figure", "figcaption",
    "details", "summary",
    "mark",
    "time"
]

def extract_url(url):
    print(url)
    url_loader = AsyncChromiumLoader([url])
    url_docs = url_loader.load()
    bs_transformer = BeautifulSoupTransformer()
    url_transfornm = bs_transformer.transform_documents(
        url_docs, tags_to_extract=["a"]
    )
    llm = ChatOpenAI(openai_api_key=MY_OPENAI_KEY)
    url_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=15000, chunk_overlap=0)
    url_splits = url_splitter.split_documents(url_transfornm)

    url_schema = {
        "properties": {
            "url": {"type": "string"},
        },
        "required": ["url"],
    }
    url_list = []

    if len(url_splits) > 0:
        start_time = time.time()
        extracted_content = create_extraction_chain(schema=url_schema, llm=llm).run(url_splits[0].page_content)
        end_time = time.time()
        comet_llm.log_prompt(
            prompt=str(url_splits[0].page_content),
            metadata={"schema": url_schema},
            output=extracted_content,
            duration=end_time - start_time,
        )
        url_list = [d['url'] for d in extracted_content]

    return url_list

def extract_content(urls):
    contents = []
    for url in urls:
        url_loader = AsyncChromiumLoader([url])
        content_docs = url_loader.load()
        bs_transformer = BeautifulSoupTransformer()
        content_transfornm = bs_transformer.transform_documents(content_docs, tags_to_extract=html_tags)
        content = content_transfornm[0].page_content
        contents.append(content)
    return contents

class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f'Document(page_content="{self.page_content},metadata={self.metadata} ")'

def generate_output(resulted_content):
    llm = ChatOpenAI(openai_api_key=MY_OPENAI_KEY)
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=15000, chunk_overlap=0)
    splits = splitter.split_documents(resulted_content)
    schema = {
        "properties": {
            "original_id": {"type": "string", "description": "Unique from source"},
            "aug_id": {"type": "string", "description": "Augmented identifier from the context"},
            "country_name": {"type": "string", "description": "Name of the Country"},
            "country_code": {"type": "string", "description": "ISO 3-letter Country Code"},
            "map_coordinates": {
                "type": "object",
                "description": "Geo Point of the region formatted as {'type': 'Point', 'coordinates': [longitude, latitude]}",
                "properties": {"type": {"type": "string"}, "coordinates": {"type": "array", "items": {"type": "number"}}},
            },
            "url": {"type": "string", "description": "Url of the website of the source", "format": "uri"},
            "region_name": {"type": "string", "description": "Region Name for a Country according to World Bank Standards"},
            "region_code": {"type": "string", "description": "Region code for a Region according to World Bank Standards"},
            "Project_title": {"type": "string", "description": "A title for this tender/project used as a headline"},
            "Project_description": {"type": "string", "description": "A summary description of the tender/project"},
            "status": {"type": "string", "description": "The current status of the tender/project from the closed tenderStatus codelist"},
            "stages": {"type": "string", "description": "Stages of the tender/project"},
            "date": {"type": "string", "description": "The date on which the information was first recorded or published", "format": "date"},
            "procurementMethod": {"type": "string", "description": "The procedure used to purchase the relevant works, goods or services"},
            "budget": {"type": "number", "description": "The total upper estimated value of the procurement"},
            "currency": {"type": "string", "description": "The currency for each amount specified using the uppercase 3-letter code from ISO4217"},
            "buyer": {"type": "string", "description": "Entity whose budget will be used to pay for related goods, works or services"},
            "sector": {"type": "string", "description": "A high-level categorization of the main sector this procurement process relates to"},
            "subsector": {"type": "string", "description": "A further subdivision of the sector the procurement process belongs to"},
        },
        "required": [
            "original_id", "aug_id", "country_name", "country_code", "map_coordinates", "url",
            "region_name", "region_code", "title", "description", "status", "stages", "date",
            "procurementMethod", "budget", "currency", "buyer", "sector", "subsector"
        ]
    }

    if len(splits) > 0:
        start_time = time.time()
        extracted_content = create_extraction_chain(schema=schema, llm=llm).run(splits[0].page_content)
        end_time = time.time()

        comet_llm.log_prompt(
            prompt=str(splits[0].page_content),
            metadata={"schema": schema},
            output=extracted_content,
            duration=end_time - start_time,
        )

    return extracted_content

def main():
    st.title("web scrapper")
    st.write("Enter a URL")
    urls = st.text_area("List Of urls", height=200)
    st.write("Enter your URL List  separated by commas (Ex: url1, url3, url3)")
    urls = [item.strip() for item in urls.split(",")]

    # Submit button
    if st.button("Submit"):
      finalDict=[]
      for url in urls:
        url_list=extract_url(url)
        url_list.append(url)
        output_content=extract_content(url_list)
        concatenated_content = ' '.join(output_content)
        merged_document = Document(page_content=concatenated_content,metadata="")
        resulted_content = [merged_document]
        final_output=generate_output(resulted_content)

        if(len(final_output)>0):
          finalDict.append(final_output[0])

      df=pd.DataFrame(finalDict)
      st.dataframe(df)


if __name__ == "__main__":
    main()


In [23]:
!pip install streamlit
!pip install pytube
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
updated 1 package and audited 36 packages in 0.659s

3 packages are looking for funding
  run `npm fund` for details

found 2 [93mmoderate[0m severity vulnerabilities
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

In [10]:
!streamlit run /content/drive/MyDrive/ColabNotebooks/Web_scrapping_streamlit.py &>/content/logs.txt &

In [11]:
!wget -q -O - https://loca.lt/mytunnelpassword

35.185.1.147

In [12]:
!npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 3.073s
your url is: https://mean-trees-relate.loca.lt
^C
