# Web-based loaders (`WebBaseLoader`)

In [29]:
from langchain_community.document_loaders import WebBaseLoader
import textwrap

In [2]:
# Pages to scrape: two Financial Times sections and two Livemint sections.
urls = [
    'https://www.ft.com/crypto',
    'https://www.ft.com/ft-trading-room',
    'https://www.livemint.com/money',
    'https://www.livemint.com/market',
]

In [3]:
# Wrap the whole URL list in one WebBaseLoader; the documents themselves are
# obtained later by iterating `loader.alazy_load()`.
loader = WebBaseLoader(urls)

In [4]:
# Echo the loader's repr as a quick check that it was constructed.
loader

<langchain_community.document_loaders.web_base.WebBaseLoader at 0x1deec519870>

In [6]:
# Collect every document the loader yields through its async iterator.
# NOTE: a bare top-level `async for` is only valid because notebook cells run
# with top-level-await support; in a plain .py script this loop has to live
# inside an `async def` that is driven by `asyncio.run(...)`.
docs = []
async for doc in loader.alazy_load():
    docs.append(doc)

In [7]:
# Sanity check: number of documents loaded (the saved run shows 4, the same
# as the number of URLs).
len(docs)

4

In [10]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in input order, separated by a blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

In [17]:
# Merge all loaded pages into a single string and report its length in
# characters.
context = format_docs(docs)
len(context)

54826

In [18]:
import re

def text_clean(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces are all treated alike, so the result
    is a single-line string. Leading and trailing whitespace is reduced to
    one space rather than stripped.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each maximal whitespace run replaced by ``' '``.
    """
    # A single `\s+` pass is sufficient: separate newline/tab passes run
    # before it can only shorten whitespace runs, never remove or split them,
    # so they cannot change the final result.
    return re.sub(r'\s+', ' ', text)

In [19]:
# Normalise whitespace in the merged text and report the new length; the
# name `context` is rebound to the cleaned string from here on.
context = text_clean(context)
len(context)

52080

In [31]:
# Preview only the first wrapped lines: the cleaned text is tens of thousands
# of characters long, and echoing all of it would bury the rest of the notebook.
textwrap.wrap(context)[:15]

['CryptofinanceAccessibility helpSkip to navigationSkip to contentSkip',
 'to footerSign InSubscribeOpen side navigation menuOpen search',
 'barFinancial TimesSubscribeSign InSearch the FTSearchClose search',
 'barCloseHomeWorldSectionsWorld HomeMiddle East warGlobal',
 'EconomyUKUSChinaAfricaAsia PacificEmerging MarketsEuropeWar in',
 'UkraineAmericasMiddle East & North AfricaMost ReadTrump tells Vance to',
 'shore up support for Gaetz as misconduct allegations swirlChinese',
 'vessel spotted where Baltic Sea cables were severedUkraine strikes',
 'Russia with US-made long-range missiles for first timeTrump picks',
 'Howard Lutnick to run commerce departmentTrump’s demolition of the US',
 'stateUSSectionsUS HomeUS EconomyInvesting in AmericaUS CompaniesUS',
 'Politics & PolicyUS Presidential Election 2024Most ReadTrump tells',
 'Vance to shore up support for Gaetz as misconduct allegations',
 'swirlTrump picks Howard Lutnick to run commerce departmentTrump’s',
 'demolition of the US st

In [None]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Connection settings for the local Ollama server and the model it serves.
base_url = "http://localhost:11434"
model = "llama3.2:3b"

# `temperature` and `num_predict` are passed straight to ChatOllama; see its
# API reference for their exact semantics (sampling randomness and the cap on
# generated tokens, respectively).
llm = ChatOllama(base_url=base_url,
                 model=model,
                 temperature=0.1,
                 num_predict=256
                 )

# System message: fixes the assistant's role for every request.
system = SystemMessagePromptTemplate.from_template("""
You are a helpful AI assistant who answers the user's question based on the provided context.
""")

# Human message: `{context}` and `{question}` are filled in at invoke time.
# The raw text gets its own name so `prompt` only ever refers to the
# message template object.
human_template = """Answer the user's question based on the provided context ONLY! If you do not know the answer, just say "I don't know".
### Context:
{context}

### Question:
{question}

### Answer:
"""

prompt = HumanMessagePromptTemplate.from_template(human_template)

messages = [system, prompt]
template = ChatPromptTemplate.from_messages(messages)

# Prompt -> model -> plain string: invoking the chain returns the answer text.
qna_chain = template | llm | StrOutputParser()

In [None]:
def ask_llm(context, question):
    """Answer ``question`` from ``context`` using the notebook's ``qna_chain``.

    Args:
        context: Text substituted into the prompt's ``{context}`` slot.
        question: Text substituted into the prompt's ``{question}`` slot.

    Returns:
        Whatever ``qna_chain.invoke`` returns for these two inputs.
    """
    chain_inputs = {'context': context, 'question': question}
    return qna_chain.invoke(chain_inputs)

In [38]:
# Try the chain on the first 10,000 characters of the cleaned text only
# (presumably to keep the prompt small enough for the local model — confirm),
# then wrap the answer for readable display.
answer = ask_llm(context[:10_000], 'extract crypto market news from the given text')
textwrap.wrap(answer)

['Here are the crypto-related news extracted from the text:  1. Binance',
 'boss hails crypto ‘golden age’ as Trump win fires up industry 2.',
 'Bitcoin hits record above $75,000 as crypto industry celebrates Trump',
 'win 3. Trump-aligned fund manager Strive jumps into bitcoin 4.',
 '‘Satoshi Nakamoto’ says he also designed the Twitter logo 5. Crypto',
 'exchanges turn to derivatives to lure cautious investors 6. Nigeria',
 'drops money laundering charges against detained Binance executive 7.',
 'Japan maintains cautious stance on crypto ETFs 8. House & HomeCrypto',
 'has designs on real estate Buying blockchain-listed ‘slices’ of',
 'properties is a nascent, niche and troubled market. 9. Donald Trump',
 'has a new crypto venture. The industry is not impressed 10. EU markets',
 'watchdog pushes for extra cyber defences in new crypto rules 11. FBI',
 'creates its own crypto token to nab suspects in alleged fraud scheme',
 '12. Crypto.com sues SEC after receiving legal threat from US re

In [39]:
def chunk_text(text, chunk_size, overlap=100):
    """Split ``text`` into fixed-size chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order. Empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size gives a zero or negative stride (no progress or a
    # silently empty result); a negative overlap would skip text between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Once a chunk reaches the end of the text, any further chunk would
        # lie entirely inside this one's overlap region, so stop here.
        if start + chunk_size >= len(text):
            break
    return chunks

In [40]:
# Split the cleaned text into 10,000-character chunks (with chunk_text's
# default 100-character overlap) so each piece can be sent to the model
# separately.
chunks = chunk_text(context, 10_000)

In [45]:
# Ask the same question of every chunk and keep the answers in chunk order.
# Each iteration is a separate model call, so this cell's run time grows with
# the number of chunks.
question = 'extract bitcoin news, personal finance, loans and trusts from the given text'
chunk_summary = []
for chunk in chunks:
    response = ask_llm(chunk, question)
    chunk_summary.append(response)

In [46]:
# Show the raw per-chunk answers (one string per chunk, in chunk order).
chunk_summary

['Here are the extracted news:\n\n**Bitcoin News:**\n\n1. Bitcoin hits record above $75,000 as crypto industry celebrates Trump win\n2. Binance boss hails crypto ‘golden age’ as Trump win fires up industry\n3. Bitcoin’s shift towards respectability should concern us all\n4. Examining MicroStrategy’s record-shattering $21bn A\n\n**Personal Finance:**\n\n1. Trump-aligned fund manager Strive jumps into bitcoin\n2. Wealth group doubles down on themes foremost in the ex-president’s financial community as election day nears\n\n**Loans and Trusts:**\n\n1. Grayscale Ethereum Trust (ETH)Can a crypto ETF die of apathy?\n2. Caroline Ellison gets 2-year prison sentence for FTX fraud',
 'Here are the extracted topics:\n\n* Bitcoin News:\n\t+ "Trump victory tipped to break logjam of exotic US crypto ETF filings"\n\t+ "Binance boss hails crypto ‘golden age’ as Trump win fires up industry"\n\t+ "Endgame for Endgame?Trump vs Basel III"\n* Personal Finance:\n\t+ "No work phone? Companies tell staff to b