Getting all data from this web page: https://www.angelone.in/support

In [7]:
# --> Scraped Data Without Cleaning


import requests
from bs4 import BeautifulSoup

url = "https://www.angelone.in/support"
headers = {"User-Agent": "Mozilla/5.0"}
# Seconds to wait on any single HTTP request. requests has no default timeout,
# so without this a stalled server would hang the cell indefinitely.
REQUEST_TIMEOUT = 10

# Fetch the support landing page and fail fast on an HTTP error rather than
# silently parsing an error page into zero quick links.
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, "html.parser")
quick_ten_links = soup.find_all("div", class_="carousel-cell")

# One dict per successfully scraped quick link: {"Text", "Link", "Details"}.
data = []

for enum, div in enumerate(quick_ten_links, 1):
    # Require an anchor that actually carries an href; an <a> without one would
    # otherwise yield None and crash on .startswith() outside the try block.
    anchor = div.find("a", href=True)
    if anchor is None:
        continue

    text = anchor.text.strip()
    link = anchor["href"]

    # Make full URL if relative
    if link.startswith("/"):
        link = "https://www.angelone.in" + link

    try:
        response1 = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        response1.raise_for_status()
        soup1 = BeautifulSoup(response1.text, "html.parser")
        # Deliberately raw: every text node of the page, stripped and
        # concatenated with no separator (this is the "without cleaning" pass).
        detail_text = soup1.get_text(strip=True)

        # Add to list as dictionary
        data.append({
            "Text": text,
            "Link": link,
            "Details": detail_text
        })

        print(enum, "✓ Scraped:", text)
    except Exception as e:
        # Best effort: report the failure and keep going with the other links.
        print(enum, "✗ Failed:", text, "| Error:", e)

# Optional: Save to JSON or print
import json

# Persist the scraped records as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
# NOTE: the "With Cleaning" cell further down writes to this same path, so
# running it afterwards replaces this file.
with open("angelone_quick_10_links_support_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data, indent=2, ensure_ascii=False))


1 ✓ Scraped: Track Application Status
2 ✓ Scraped: Segment Activation Process
3 ✓ Scraped: Fund Payout Status
4 ✓ Scraped: Check Order Status
5 ✓ Scraped: Brokerage and Charges
6 ✓ Scraped: Fund Addition
7 ✓ Scraped: Bank Account Addition
8 ✓ Scraped: Profile Modification
9 ✓ Scraped: DDPI (POA) process
10 ✓ Scraped: Refer and Earn


In [8]:
# --> Scraped Data With Cleaning


import requests
from bs4 import BeautifulSoup
import json
import re

url = "https://www.angelone.in/support"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the support landing page. requests has no default timeout, so one is
# set explicitly (10 s) to keep a stalled server from hanging the cell, and the
# status is checked so an error page is not silently parsed into zero links.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, "html.parser")
# Each quick link on the page lives in a <div class="carousel-cell">.
quick_ten_links = soup.find_all("div", class_="carousel-cell")

# One dict per successfully scraped quick link: {"Text", "Link", "Details"}.
data = []

def clean_text(text):
    """Collapse each whitespace run to one space and trim both ends.

    Non-breaking spaces (U+00A0) are also turned into plain spaces.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.replace("\xa0", " ").strip()

for enum, div in enumerate(quick_ten_links, 1):
    # Require an anchor that actually carries an href; an <a> without one would
    # otherwise yield None and crash on .startswith() outside the try block.
    anchor = div.find("a", href=True)
    if anchor is None:
        continue

    text = anchor.text.strip()
    link = anchor["href"]

    # Turn a site-relative path into an absolute URL.
    if link.startswith("/"):
        link = "https://www.angelone.in" + link

    try:
        # requests has no default timeout; 10 s keeps one stalled page from
        # hanging the whole run.
        response1 = requests.get(link, headers=headers, timeout=10)
        response1.raise_for_status()
        soup1 = BeautifulSoup(response1.text, "html.parser")

        # Focus on the main content instead of all text
        main_content = soup1.find("main") or soup1.body
        if main_content:
            detail_text = clean_text(main_content.get_text(separator=" ", strip=True))
        else:
            detail_text = "Content not found"

        data.append({
            "Text": text,
            "Link": link,
            "Details": detail_text
        })

        print(enum, "✓ Scraped:", text)
    except Exception as e:
        # Best effort: report the failure and keep going with the other links.
        print(enum, "✗ Failed:", text, "| Error:", e)

# Persist the cleaned records as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open("angelone_quick_10_links_support_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data, indent=2, ensure_ascii=False))


1 ✓ Scraped: Track Application Status
2 ✓ Scraped: Segment Activation Process
3 ✓ Scraped: Fund Payout Status
4 ✓ Scraped: Check Order Status
5 ✓ Scraped: Brokerage and Charges
6 ✓ Scraped: Fund Addition
7 ✓ Scraped: Bank Account Addition
8 ✓ Scraped: Profile Modification
9 ✓ Scraped: DDPI (POA) process
10 ✓ Scraped: Refer and Earn


Data Gathering from AngelOne Website

In [10]:
import requests
from bs4 import BeautifulSoup
import json
import re

def clean_text(text):
    """Normalise whitespace: squeeze runs to a single space, then trim.

    Non-breaking spaces (U+00A0) are also turned into plain spaces.
    """
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.replace('\xa0', ' ').strip()

url = "https://www.angelone.in/support"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the support landing page. requests has no default timeout, so one is
# set explicitly (10 s) to keep a stalled server from hanging the cell, and the
# status is checked so an error page is not silently parsed into zero topics.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Candidate topic cards: every <div> carrying the "grid" class.
grid_divs = soup.find_all("div", class_="grid")
# One dict per card: {"Text", "Description", "Link", "Details"}.
data = []

for div in grid_divs:
    # Skip cards that do not link anywhere.
    link_tag = div.find("a", href=True)
    if not link_tag:
        continue

    # Turn a site-relative path into an absolute URL.
    link = link_tag["href"]
    if link.startswith("/"):
        link = "https://www.angelone.in" + link

    heading_tag = div.find("h2")
    description_tag = div.find("p")

    # Missing heading/description are recorded as empty strings.
    heading = clean_text(heading_tag.get_text()) if heading_tag else ""
    description = clean_text(description_tag.get_text()) if description_tag else ""

    try:
        # requests has no default timeout; 10 s keeps one stalled page from
        # hanging the whole run.
        page_response = requests.get(link, headers=headers, timeout=10)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.text, "html.parser")

        # Use main content if available
        # NOTE(review): the recorded output shows every page's excerpt starting
        # with the same "We are here to help you Quick Links (10)" text, so this
        # selector may be capturing shared boilerplate — confirm it isolates the
        # page-specific content.
        main_content = page_soup.find("main") or page_soup.body
        page_text = clean_text(main_content.get_text(separator=" ", strip=True)) if main_content else "Content not found"
    except Exception as e:
        # Best effort: the card is still recorded, with the error message
        # stored in place of the page text.
        page_text = f"Error fetching page: {e}"

    data.append({
        "Text": heading,
        "Description": description,
        "Link": link,
        "Details": page_text
    })

# Print summary preview
# Show a short preview of each record: title, description, link, and the first
# 300 characters of the page text, followed by a blank line.
for item in data:
    print(
        f"Title: {item['Text']}",
        f"Description: {item['Description']}",
        f"Link: {item['Link']}",
        f"Details (excerpt): {item['Details'][:300]}...\n",
        sep="\n",
    )

# Persist all records as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open("angelone_support_full_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data, indent=2, ensure_ascii=False))


Title: Add and Withdraw Funds
Description: Available margin to trade, funds balance, withdrawable balance
Link: https://www.angelone.in/support/add-and-withdraw-funds
Details (excerpt): We are here to help you Quick Links (10) Track Application Status Know how to track your account opening application status on Angel One application Learn More Segment Activation Process Learn how to activate F&O, Commodity and Currency segments on Angel One Learn More Fund Payout Status Know all ab...

Title: Angel One Recommendations
Description: Research recommendations, charges, frequency of recommendation
Link: https://www.angelone.in/support/angel-one-recommendations
Details (excerpt): We are here to help you Quick Links (10) Track Application Status Know how to track your account opening application status on Angel One application Learn More Segment Activation Process Learn how to activate F&O, Commodity and Currency segments on Angel One Learn More Fund Payout Status Know all ab...

Title: Charg

Scraping Data from PDFs

In [13]:
import os
import json
import re
from langchain.document_loaders import PyPDFLoader

# === Settings ===
# Directory whose entries are listed (non-recursively) and filtered to names
# ending in ".pdf".
folder_path = "Insurance PDFs"
# NOTE: the grouped export that reads this flag is commented out further down,
# so it currently has no effect.
save_per_pdf = True   # Save pages grouped by PDF file
save_flat_list = True  # Save all pages in a flat list
# Output path for the grouped-by-file export (unused while that export is disabled).
output_per_pdf = "insurance_pdfs_by_file.json"
# Output path for the flat, one-record-per-page export.
output_flat_list = "insurance_pdfs_flat.json"

# === Helper ===
def clean_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()

# === PDF Extraction ===
# filename -> list of {"page_number", "content"} dicts for that PDF
all_documents = {}
# Flat list of {"Text", "page_number", "Details"} dicts, one per page across all PDFs
all_pages = []

# os.listdir() returns entries in arbitrary order; sort them so the order of
# records in the output JSON is the same on every run and every machine.
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(folder_path, filename)
    print(f"🔄 Loading: {filename}")

    # The loader yields one document per page, each with page_content and metadata.
    documents = PyPDFLoader(pdf_path).load()

    pages = []
    for doc in documents:
        page_content = clean_text(doc.page_content)
        # Page index as reported by the loader's metadata; None when absent.
        # NOTE(review): presumably 0-based — confirm before showing to users.
        page_number = doc.metadata.get("page", None)

        # Grouped form: content is prefixed with the source filename.
        pages.append({
            "page_number": page_number,
            "content": f"[{filename}] {page_content}"
        })

        # Flat form: same keys ("Text", "Details") as the web-scrape records.
        all_pages.append({
            "Text": filename,
            "page_number": page_number,
            "Details": page_content
        })

    all_documents[filename] = pages

# === Save Outputs ===
# NOTE: the grouped-by-file export is disabled (kept below for reference), so
# `save_per_pdf` and `output_per_pdf` currently have no effect.
# if save_per_pdf:
#     with open(output_per_pdf, "w", encoding="utf-8") as f:
#         json.dump(all_documents, f, indent=2, ensure_ascii=False)
#     print(f"✅ Saved grouped content to '{output_per_pdf}'")

# Write the flat per-page records as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
if save_flat_list:
    with open(output_flat_list, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(all_pages, indent=2, ensure_ascii=False))
    print(f"✅ Saved flat content to '{output_flat_list}'")


🔄 Loading: America's_Choice_2500_Gold_SOB (1) (1).pdf
🔄 Loading: America's_Choice_5000_Bronze_SOB (2).pdf
🔄 Loading: America's_Choice_5000_HSA_SOB (2).pdf
🔄 Loading: America's_Choice_7350_Copper_SOB (1) (1).pdf
🔄 Loading: America's_Choice_Medical_Questions_-_Modified_(3) (1).pdf
✅ Saved flat content to 'insurance_pdfs_flat.json'
