In [5]:
from bs4 import BeautifulSoup
import json

# Read the saved page markup from disk (UTF-8) and parse it once; every
# extractor below queries this shared ``soup`` object.
with open("Pasted_Text_1750885383620.txt", encoding="utf-8") as html_file:
    html_content = html_file.read()

soup = BeautifulSoup(html_content, "html.parser")


# =============================
# Extract Title
# =============================
# The page title is spread over the bold <h3> elements of the first
# <section id="for_table">; their texts are joined with single spaces.
title_section = soup.find('section', id="for_table")
if not title_section:
    # No such section in the document: use the fixed default title.
    title = "باقات eHome DSL"
else:
    title_parts = title_section.find_all('h3', class_='ff-suissintl-bold')
    title = " ".join(part.get_text(strip=True) for part in title_parts)


# =============================
# Extract DSL Packages
# =============================
def extract_dsl_packages():
    """Collect the eHome DSL plans listed under each speed tab.

    Reads the module-level ``soup``. For each known tab id (``one_tab`` ..
    ``four_tab``) every table cell that contains both an ``h5.plan-name``
    and an ``h5.plan-price`` becomes one package. The benefit text is taken
    from the cell in the same column of the following table row.

    Returns:
        list[dict]: one dict per plan with the keys ``speed``, ``data_gb``,
        ``price_egp``, ``benefit`` and ``validity``. Tabs missing from the
        page contribute nothing.
    """
    # Units appended to the extracted quota and price values.
    data_unit = "جيجابايت"
    currency_unit = "جنيه"
    validity = "شهر"

    # Tab element id -> speed label reported for every plan in that tab.
    speed_tabs = {
        "one_tab": "30 Mbps",
        "two_tab": "70 Mbps",
        "three_tab": "100 Mbps",
        "four_tab": "200 Mbps"
    }

    dsl_data = []

    for tab_id, speed in speed_tabs.items():
        tab = soup.find('div', {'id': tab_id})
        if not tab:
            continue

        rows = tab.find_all('tr')
        for i, row in enumerate(rows):
            # Cells of the next row (if any) hold the benefit text, aligned
            # column by column with the plan cells of this row.
            benefit_cells = rows[i + 1].find_all('td') if i + 1 < len(rows) else []

            for j, cell in enumerate(row.find_all('td')):
                name_elem = cell.find('h5', class_='plan-name')
                price_elem = cell.find('h5', class_='plan-price')
                if not name_elem or not price_elem:
                    continue

                # Join text nodes with a space: without a separator a nested
                # unit tag would be glued to the number ("140" + unit) and
                # end up inside the first token, duplicating the unit below.
                data_tokens = name_elem.get_text(" ", strip=True).split()
                price_tokens = price_elem.get_text(" ", strip=True).split()
                if not data_tokens or not price_tokens:
                    # Empty heading: nothing to report for this cell.
                    continue

                benefit = ""
                if j < len(benefit_cells):
                    benefit = benefit_cells[j].get_text(strip=True)

                dsl_data.append({
                    "speed": speed,
                    "data_gb": f"{data_tokens[0]} {data_unit}",
                    "price_egp": f"{price_tokens[0]} {currency_unit}",
                    "benefit": benefit,
                    "validity": validity
                })

    return dsl_data




# =============================
# Extract Terms and Conditions
# =============================
def extract_terms_and_conditions():
    """Return the numbered terms of the first matching terms section.

    Scans every ``<section id="for_features_and_terms">`` of the
    module-level ``soup`` and stops at the first one whose space-joined
    ``<h3>`` title contains the package-terms heading.

    Returns:
        list[dict]: ``{"number", "text"}`` entries in document order, or an
        empty list when no section matches.
    """
    wanted_heading = "الشروط و الاحكام للباقة"

    for section in soup.find_all('section', id="for_features_and_terms"):
        heading_box = section.find('div', class_='for__sectionTitles')
        if not heading_box:
            continue

        heading = ' '.join(h3.get_text(strip=True) for h3 in heading_box.find_all('h3'))
        if wanted_heading not in heading:
            continue

        # Only the first matching section is used.
        collected = []
        for entry in section.select(".col-sm-12.col-md-6.col-lg-4.my-3.d-flex"):
            badge = entry.find('span')
            body = entry.find('p', class_='fs-16')
            if not badge or not body:
                continue
            collected.append({
                "number": badge.get_text(strip=True),
                "text": body.get_text(strip=True),
            })
        return collected

    return []


# =============================
# Extract Favorite Packages (Streaming, Gaming, Off-Peak)
# =============================
def extract_favorite_packages():
    """Collect the add-on "favorite" packages (streaming, gaming, off-peak).

    Reads the module-level ``soup``. Each tab (``streaming-tab``,
    ``social-tab``, ``off-peak-tab``) is scanned twice: once for tables,
    whose first row holds plan name/price cells and whose second row holds
    the matching data amounts, and once for ``div.card`` elements.

    Returns:
        dict: ``{"section_title": ..., "packages": {...}}``. Each category
        found on the page maps to ``{"title", "description", "items"}``; a
        category whose tab is missing keeps its initial empty list and a
        message is printed.
    """
    def clean_numeric_value(value):
        """Return only the digit characters of *value*."""
        return ''.join(filter(str.isdigit, value))

    # Units appended to the extracted quota and price values.
    data_unit = "جيجابايت"
    currency_unit = "جنيه"

    result = {
        "section_title": "الباقات المفضلة",
        "packages": {
            "streaming": [],
            "gaming": [],
            "off_peak": []
        }
    }

    # Tab element id -> category key used in the result.
    tabs = {
        "streaming-tab": "streaming",
        "social-tab": "gaming",
        "off-peak-tab": "off_peak"
    }

    tab_titles = {
        "streaming": "المشاهدة",
        "gaming": "الألعاب",
        "off_peak": "خارج أوقات الذروة"
    }

    # The renewal hint is attached to every item. get_text(strip=True) glues
    # adjacent text nodes together, so the two known joins are re-split.
    note_element = soup.find('p', class_='plan-hint mediumGrey-color')
    if note_element:
        dynamic_note = (
            note_element.get_text(strip=True)
            .replace("يمكنكاضافه", "يمكنك اضافه")
            .replace("تلقائياكل", "تلقائيا كل")
        )
    else:
        dynamic_note = "يمكنك اضافه الباقه مره واحده او تجدد تلقائيا كل 30 يوم"

    def build_item(name_elem, price_elem, data_text):
        """Build one package entry from its name/price tags and data text."""
        price = clean_numeric_value(price_elem.get_text(strip=True))
        return {
            "name": name_elem.get_text(strip=True).replace('\n', ' ').strip(),
            "data_gb": f"{clean_numeric_value(data_text)} {data_unit}",
            "price_egp": f"{price} {currency_unit}",
            "note": dynamic_note
        }

    for tab_id, category in tabs.items():
        tab_section = soup.find('div', id=tab_id)
        if not tab_section:
            print(f"Tab '{tab_id}' not found.")
            continue

        description_p = tab_section.find('p', class_='mt-30 ff-suissintl-light fs-16')
        description = description_p.get_text(strip=True) if description_p else ""

        package_group = {
            "title": tab_titles[category],
            "description": description,
            "items": []
        }

        # From tables: row 0 = name/price cells, row 1 = data amounts.
        for table in tab_section.find_all('table'):
            rows = table.find_all('tr')
            if len(rows) < 2:
                continue
            name_cells = rows[0].find_all('td')
            data_cells = rows[1].find_all('td')
            for name_cell, data_cell in zip(name_cells, data_cells):
                name_elem = name_cell.find('h5', class_='plan-name')
                price_elem = name_cell.find('h5', class_='plan-price')
                if name_elem and price_elem:
                    package_group["items"].append(
                        build_item(name_elem, price_elem, data_cell.get_text(strip=True))
                    )

        # From cards. A CSS selector is used for the data paragraph because
        # ``class_`` takes a class name (or the exact attribute string), not
        # dotted selector syntax; the dotted form can never match.
        # NOTE(review): if a card wraps one of the tables handled above, its
        # first plan would be listed twice - confirm against the page markup.
        for card in tab_section.find_all('div', class_='card'):
            name_elem = card.find('h5', class_='plan-name')
            price_elem = card.find('h5', class_='plan-price')
            data_elem = card.select_one('p.ff-suissintl-bold.fs-16')
            if name_elem and price_elem and data_elem:
                package_group["items"].append(
                    build_item(name_elem, price_elem, data_elem.get_text(strip=True))
                )

        result["packages"][category] = package_group

    return result


# =============================
# Extract Extra Bundles
# =============================
def extract_extra_bundles():
    """Collect the extra (top-up) bundles from ``section#for_textContainer``.

    Reads the module-level ``soup`` and relies on the module-level
    ``clean_numeric_value`` helper to keep only digits of the amounts.

    Returns:
        dict: ``{"section_title", "description", "bundles"}``, or an empty
        dict when the section is not present. A card that fails to parse is
        reported with ``print`` and skipped.
    """
    section = soup.find('section', id='for_textContainer')
    if not section:
        return {}

    # Title: space-joined <h3> texts, with a fixed default when empty.
    heading_box = section.find('div', class_='for__sectionTitles')
    heading_parts = heading_box.find_all('h3') if heading_box else []
    section_title = ' '.join([part.get_text(strip=True) for part in heading_parts]) or "الباقات الإضافية"

    intro = section.find('p', class_='fs-16')
    description = intro.get_text(strip=True) if intro else ""

    bundles = []
    for card in section.select(".col-sm-12.col-md-6.col-lg-3.my-2.my-lg-0 .text-container"):
        name_elem = card.find('h5', class_='extra-name')
        data_elem = card.find('h6', class_='extra-price')
        price_elem = card.find('h6', class_='ff-suissintl-bold')

        # All three pieces are required for a bundle entry.
        if not (name_elem and data_elem and price_elem):
            continue

        try:
            name = name_elem.get_text(strip=True)
            # Only the first whitespace-separated token is considered.
            data_token = data_elem.get_text(strip=True).split()[0]
            data_gb = clean_numeric_value(data_token)
            price_token = price_elem.get_text(strip=True).split()[0]
            price_egp = clean_numeric_value(price_token)

            bundles.append({
                "name": name,
                "data_gb": f"{data_gb} جيجابايت",
                "price_egp": f"{price_egp} جنيه"
            })
        except Exception as e:
            print(f"Error parsing bundle: {e}")
            continue

    return {
        "section_title": section_title,
        "description": description,
        "bundles": bundles
    }


# =============================
# Extract Emerald Offers
# =============================
def extract_emerald_offers():
    """Collect the Emerald-customer offers table.

    Reads the module-level ``soup``. The section used is the first
    ``<section id="for_table">`` having an ``<h3>`` that contains the
    Emerald heading. Plan names come from the cells of the first ``<tr>``
    whose ``style`` attribute mentions ``height``; every ``<tr>`` after the
    section's first one is treated as an offer row.

    Returns:
        dict: ``{"section_title", "section_subtitle", "emerald_offers"}``
        where each offer is ``{"plan", "offers"}``, or an empty dict when the
        section or its header row is missing.
    """
    emerald_section = None
    for sec in soup.find_all('section', id='for_table'):
        titles = [h.get_text(strip=True) for h in sec.find_all('h3')]
        if any("فقط لعملاء أميريلد" in t for t in titles):
            emerald_section = sec
            break

    if not emerald_section:
        return {}

    # Get plan names. A missing style attribute arrives as None, hence str().
    header_row = emerald_section.find('tr', style=lambda s: 'height' in str(s))
    if not header_row:
        return {}

    plan_names = []
    for i, td in enumerate(header_row.find_all('td')):
        name_elem = td.find('h5', class_='plan-name')
        plan_names.append(name_elem.get_text(strip=True) if name_elem else f"Plan {i+1}")

    # Get offer rows (everything after the first row of the section).
    offer_rows = emerald_section.find_all('tr')[1:]

    # NOTE(review): plan names are read from header *columns* but paired with
    # offer *rows* by position - confirm this matches the table layout.
    emerald_offers = []
    for idx, row in enumerate(offer_rows):
        # More offer rows than header cells must not crash the export; reuse
        # the positional placeholder already used for unnamed header cells.
        plan = plan_names[idx] if idx < len(plan_names) else f"Plan {idx+1}"
        emerald_offers.append({
            "plan": plan,
            "offers": [td.get_text(strip=True) for td in row.find_all('td')]
        })

    return {
        "section_title": "فقط لعملاء أميريلد",
        "section_subtitle": "استمتع بخصومات حصرية على باقات ال eHome DSL",
        "emerald_offers": emerald_offers
    }


# =============================
# ✅ New Function: Extract Service Features
# =============================
def extract_service_features():
    """Collect the service-features list.

    Reads the module-level ``soup`` and uses the first
    ``<section id="for_features_and_terms">``. Each feature is the text of
    the first class-less ``<p>`` inside a matching grid column; a leading
    all-digit word is split off as the feature number.

    Returns:
        dict: ``{"section_title", "description", "features"}`` where every
        feature is ``{"number", "text"}``. ``description`` is always empty.
        When the section is missing, a default title and an empty feature
        list are returned.
    """
    default_title = "مميزات الخدمة"

    features_section = soup.find('section', id='for_features_and_terms')
    if not features_section:
        return {
            "section_title": default_title,
            "description": "",
            "features": []
        }

    # Title parts are joined with a space, as for the other section titles in
    # this file, so a title split over several <h3> tags keeps its word gaps.
    title_div = features_section.find('div', class_='for__sectionTitles')
    if title_div:
        section_title = ' '.join(t.get_text(strip=True) for t in title_div.find_all('h3'))
    else:
        section_title = default_title

    features = []
    for item in features_section.select(".col-sm-12.col-md-6.col-lg-4.my-3"):
        p_tag = item.find('p', class_=False)  # first <p> without a class
        if not p_tag:
            continue

        text = p_tag.get_text(strip=True)
        words = text.split()
        if not words:
            # Empty paragraph: there is no feature text to report.
            continue

        # Split off a leading number when the first word is all digits.
        first_word = words[0]
        if first_word.isdigit():
            number = first_word
            clean_text = text[len(number):].strip()
        else:
            number = ""
            clean_text = text

        features.append({
            "number": number,
            "text": clean_text
        })

    return {
        "section_title": section_title,
        "description": "",
        "features": features
    }

#----------
#
#--------------

# Helper function to extract only digits from a string
def clean_numeric_value(value):
    """Return *value* reduced to its digit characters, in original order.

    Every character for which ``str.isdigit`` is true is kept; everything
    else (letters, spaces, punctuation, separators) is dropped.
    """
    digits = [char for char in value if char.isdigit()]
    return ''.join(digits)

# Function to extract extra packages
def extract_extra_packages():
    """Collect the extra packages from ``section#for_textContainer``.

    Reads the module-level ``soup`` and uses the module-level
    ``clean_numeric_value`` helper to keep only the digits of each amount.

    Returns:
        dict: ``{"description", "packages"}`` where each package is
        ``{"name", "data_gb", "price_egp"}``. When the section is missing the
        description is empty and the package list is empty. A card that
        fails to parse is reported with ``print`` and skipped.
    """
    # Data unit: starts from the fallback; the page lookup can only confirm
    # the same value.
    data_unit = "جيجابايت"  # fallback
    unit_tag = soup.find('small', class_='extra-currency')
    if unit_tag and "جيجابايت" in unit_tag.get_text(strip=True):
        data_unit = "جيجابايت"

    # Price unit: same pattern, read from the <sub> inside the first bold <h6>.
    price_unit = "جنيه"  # fallback
    price_heading = soup.find('h6', class_='ff-suissintl-bold')
    if price_heading:
        currency_tag = price_heading.find('sub', class_='plan-currency')
        if currency_tag and "جنيه" in currency_tag.get_text(strip=True):
            price_unit = "جنيه"

    section = soup.find('section', id='for_textContainer')
    if not section:
        return {"description": "", "packages": []}

    intro = section.find('p', class_='fs-16')
    description = intro.get_text(strip=True) if intro else ""

    packages = []
    for card in section.select(".col-sm-12.col-md-6.col-lg-3.my-2.my-lg-0 .text-container"):
        name_elem = card.find('h5', class_='extra-name')
        data_elem = card.find('h6', class_='extra-price')
        price_elem = card.find('h6', class_='ff-suissintl-bold')

        # All three pieces are required for a package entry.
        if not (name_elem and data_elem and price_elem):
            continue

        try:
            name = name_elem.get_text(strip=True)
            data_raw = data_elem.get_text(strip=True)
            price_raw = price_elem.get_text(strip=True)

            # Only the first whitespace-separated token of each is used.
            data_value = clean_numeric_value(data_raw.split()[0])
            price_value = clean_numeric_value(price_raw.split()[0])

            packages.append({
                "name": name,
                "data_gb": f"{data_value} {data_unit}",
                "price_egp": f"{price_value} {price_unit}"
            })
        except Exception as e:
            print(f"Error parsing package: {e}")
            continue

    return {
        "description": description,
        "packages": packages
    }

# Function to extract terms and conditions for extra packages
def extract_extra_packages_terms_and_conditions():
    """Return the terms that apply to the extra packages.

    Scans every ``<section id="for_features_and_terms">`` of the
    module-level ``soup`` and uses the first one whose ``<h3>`` texts
    (joined without a separator) contain either spelling of the
    extra-package marker.

    Returns:
        dict: ``{"section_title", "terms"}`` with a fixed title; ``terms`` is
        a list of ``{"number", "text"}`` and is empty when nothing matches.
    """
    section_title = "الشروط و الاحكام للباقة الإضافية"
    # Both spellings (with and without the hamza) are accepted.
    markers = ("للباقة الإضافية", "للباقة الاضافية")

    for section in soup.find_all('section', id='for_features_and_terms'):
        heading_box = section.find('div', class_='for__sectionTitles')
        if not heading_box:
            continue

        heading = ''.join(h3.get_text(strip=True) for h3 in heading_box.find_all('h3'))
        if not any(marker in heading for marker in markers):
            continue

        terms = []
        for entry in section.select(".col-sm-12.col-md-6.col-lg-4.my-3.d-flex"):
            badge = entry.find('span')
            body = entry.find('p', class_='fs-16')
            if not badge or not body:
                continue
            terms.append({
                "number": badge.get_text(strip=True),
                "text": body.get_text(strip=True)
            })

        return {"section_title": section_title, "terms": terms}

    # Fallback: no section carried the extra-package heading.
    return {"section_title": section_title, "terms": []}

# Run all extractions. Each call reads the module-level ``soup``; some of
# them print diagnostics, so the call order also fixes the order of any
# console messages.
dsl_packages = extract_dsl_packages()
terms_conditions = extract_terms_and_conditions()
favorite_packages = extract_favorite_packages()
extra_bundles = extract_extra_bundles()
emerald_offers = extract_emerald_offers()
service_features = extract_service_features()
# Disabled: extract_extra_packages() reads the same section
# (id="for_textContainer") as extract_extra_bundles() above.
#extra_packages_data = extract_extra_packages()
extra_packages_terms_data = extract_extra_packages_terms_and_conditions()

# Build final JSON output. Key order here is the key order of the saved file.
output = {
    "service_features": service_features , # result of extract_service_features()
    "title": title,
    "packages": dsl_packages,
    "terms_and_conditions": terms_conditions,
    # Only the per-category mapping is exported; "section_title" is dropped.
    "favorite_packages": favorite_packages["packages"],
    "extra_bundles": extra_bundles,
    # Disabled together with the extract_extra_packages() call above:
    #"extra_packages": extra_packages_data,
    "extra_packages_terms_and_conditions": extra_packages_terms_data,
    "emerald_offers": emerald_offers,
}


# Save to JSON file. ensure_ascii=False keeps the Arabic text readable in the
# file instead of \u escapes; the file itself is written as UTF-8.
with open("full_ehome_dsl_data.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

print("✅ Data successfully extracted and saved to full_ehome_dsl_data.json")

✅ Data successfully extracted and saved to full_ehome_dsl_data.json
