In [42]:
pip install python-bidi arabic-reshaper

Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: python-bidi, arabic-reshaper
Successfully installed arabic-reshaper-3.0.0 python-bidi-0.6.6


In [43]:
from bs4 import BeautifulSoup
import json
import re
from bidi.algorithm import get_display
import arabic_reshaper
from bidi.algorithm import get_display
import arabic_reshaper

# Location of the saved HTML page (a Google Drive path; assumes Drive is
# already mounted in this runtime — TODO confirm).
file_path = "/content/drive/MyDrive/Grad. Project/Etisalat.txt"
# Read the whole page into memory as UTF-8 text.
with open(file_path, "r", encoding="utf-8") as f:
    html_content = f.read()

In [31]:
# Parse the page once at module level. The extraction functions defined in this
# cell each re-open and re-parse the file into their own local `soup`.
soup = BeautifulSoup(html_content, 'html.parser')

def get_main_header(file_path):
    """
    Read an HTML file and return the text of its main page-title <h1>.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        str or None: The whitespace-stripped heading text. None is returned
        (after printing a message) when the heading is missing or when
        reading/parsing the file raises any exception.
    """
    # The <h1> sits in the page-title block of the first <section> under main > article.
    selector = "main > article > section:nth-of-type(1) .page-title-content h1"
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        heading = BeautifulSoup(markup, 'html.parser').select_one(selector)
        if not heading:
            print("Main header not found.")
            return None
        return heading.get_text(strip=True)

    except Exception as err:
        print(f"Error: {err}")
        return None



def get_description_text(file_path):
    """
    Read an HTML file and return the page's introductory description paragraph.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        str or None: The whitespace-stripped paragraph text. None is returned
        (after printing a message) when the paragraph is missing or when
        reading/parsing the file raises any exception.
    """
    # The paragraph (p.fs-14) is in the page-title block of the first <section>.
    selector = "main > article > section:nth-of-type(1) div.container div.row div.page-title-content p.fs-14"
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        paragraph = BeautifulSoup(markup, 'html.parser').select_one(selector)
        if not paragraph:
            print("Description not found.")
            return None
        return paragraph.get_text(strip=True)

    except Exception as err:
        print(f"Error: {err}")
        return None


def clean_text(text):
    """Trim the string and collapse every run of whitespace into one space."""
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

def extract_plan_features(file_path):
    """
    Extract the plan cards of the currently shown tab and their features.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict or None: Maps each plan name (the full card title, or
        "Plan_<n>" when a card has no title) to a list of "label : value"
        strings, with the price first when one is found. Cards sharing the
        same name overwrite one another. None is returned (after printing a
        message) when the tab content is missing or any exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        target_section = soup.select_one("main > article > section:nth-of-type(2) div.container div.tabsContainer .tab-content.show")
        if not target_section:
            print("Target section not found.")
            return None

        plans_data = {}
        plan_cards = target_section.select(".card-container .slick-slide .card")

        for idx, card in enumerate(plan_cards):
            plan_name_tag = card.find("h5", class_="plan-name")
            plan_price_tag = card.find("h5", class_="plan-price")

            features = []

            # Price goes first. The number may carry a decimal or grouping
            # part ("99.5", "1,200"); capturing only the trailing digit run
            # would silently truncate such prices.
            if plan_price_tag:
                price_text = clean_text(plan_price_tag.get_text())
                price_match = re.search(r'(\d+(?:[.,]\d+)?)(?:\s|&nbsp;)+جنيه', price_text)
                if price_match:
                    features.append(f"السعر : {price_match.group(1)} جنيه")

            # Remaining features: each list item pairs a <small> label with a bold value.
            for item in card.select(".list-group-item"):
                label = item.find("small")
                value = item.find("p", class_="ff-suissintl-bold fs-16")

                if label and value:
                    label_text = clean_text(label.get_text()).strip(":").strip()
                    value_text = clean_text(value.get_text()).strip()
                    features.append(f"{label_text} : {value_text}")

            # The full card title (including any trailing number) is the key.
            plan_key = clean_text(plan_name_tag.get_text()) if plan_name_tag else f"Plan_{idx + 1}"
            plans_data[plan_key] = features

        return plans_data

    except Exception as e:
        print(f"Error: {e}")
        return None


def extract_titles_and_descriptions(file_path, section_id="for_readAbout"):
    """
    Extract title/description pairs from the slider inside one page section.

    The section is located by its id, then the `div.slider_1items` container
    inside it is scanned for `div.item` entries.

    Args:
        file_path (str): Path to the HTML file.
        section_id (str): Value of the `id` attribute of the <section> to
            search. Defaults to "for_readAbout", the id this function has
            always looked up.

    Returns:
        list of dicts: One dict per slider item with 'title' and 'description'
        ("Untitled" / "No description" fill in whichever is missing; items with
        neither are skipped). An empty list is returned (after printing a
        message) when the section or slider is missing or any exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Locate the section by ID instead of assuming document structure.
        section = soup.find("section", {"id": section_id})
        if not section:
            # Report the id that was actually searched for.
            print(f"Section '{section_id}' not found.")
            return []

        slider_div = section.find("div", class_="slider_1items")
        if not slider_div:
            print("Slider container not found inside the section.")
            return []

        results = []
        for item in slider_div.find_all("div", class_="item"):
            title_tag = item.find("h5", class_="ff-suissintl-bold")
            desc_tags = item.find_all("p")

            title = title_tag.get_text(strip=True) if title_tag else None
            # All paragraphs of the item are merged into one description.
            description = " ".join(p.get_text(strip=True) for p in desc_tags) if desc_tags else None

            if title or description:
                results.append({
                    "title": title or "Untitled",
                    "description": description or "No description"
                })

        return results

    except Exception as e:
        print(f"Error: {e}")
        return []

def extract_read_about_section(file_path):
    """
    Extract the entries of the 'Read About' section (id='for_readAbout').

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        list of dicts: One dict per direct child <div> of the section's grid,
        with 'title', 'description' and 'logo_url' ("Untitled",
        "No description" and None are used when a part is missing). An empty
        list is returned (after printing a message) when the section or grid
        is missing or any exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "html.parser")

        # Locate the section by ID.
        section = soup.find("section", {"id": "for_readAbout"})
        if not section:
            print("Section 'for_readAbout' not found.")
            return []

        # The grid container holds one direct child <div> per entry.
        grid_container = section.find("div", class_="grid")
        if not grid_container:
            print("Grid container not found inside 'for_readAbout'.")
            return []

        results = []
        for item in grid_container.find_all("div", recursive=False):
            logo_img = item.find("img")
            title_tag = item.find("p", class_="my-2")
            # Search inside this item only: find_next() would keep walking the
            # rest of the document and attach a later entry's description to
            # an entry that has none of its own.
            desc_tag = item.find("p", class_=lambda x: x and "grey-2-color" in x)

            logo_url = logo_img["src"] if logo_img and "src" in logo_img.attrs else None
            title = title_tag.get_text(strip=True) if title_tag else "Untitled"
            description = desc_tag.get_text(strip=True) if desc_tag else "No description"

            results.append({
                "title": title,
                "description": description,
                "logo_url": logo_url
            })

        return results

    except Exception as e:
        print(f"Error: {e}")
        return []


def extract_gto_emerald_offer_section(file_path):
    """
    Extract the GTO / e& Egypt Emerald offers section from the HTML file.

    The section is found through its heading: the first <h5>/<h6> whose text
    contains both expected Arabic phrases. Its description is the first
    following sibling <p> with more than 20 characters of text.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict or None: {'title', 'description'} with whitespace normalised.
        None is returned (after printing a message) when the heading or the
        description is missing or any exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        document = BeautifulSoup(markup, "html.parser")

        # First heading mentioning both phrases.
        title_tag = None
        for candidate in document.find_all(["h5", "h6"]):
            heading_text = candidate.get_text(strip=True)
            if "عروض GTO" in heading_text and "إي آند مصراميرالد" in heading_text:
                title_tag = candidate
                break

        if not title_tag:
            print("Section title not found.")
            return None

        # First sibling paragraph long enough to be real content (> 20 chars).
        desc_tag = next(
            (
                sibling
                for sibling in title_tag.find_next_siblings()
                if sibling.name == "p" and len(sibling.get_text(strip=True)) > 20
            ),
            None,
        )

        if not desc_tag:
            print("Description not found.")
            return None

        return {
            "title": " ".join(title_tag.get_text(strip=True).split()),
            "description": " ".join(desc_tag.get_text(strip=True).split())
        }

    except Exception as err:
        print(f"Error: {err}")
        return None


def extract_family_section(file_path):
    """
    Extract the 'Family' card from the HTML file.

    The card is the first `div.card` whose semi-bold <h5> heading contains the
    Arabic word for family.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict or None: {'title', 'description'} with whitespace normalised;
        'description' is None when the card has no `p.fs-16`. None is returned
        (after printing a message) when no matching card exists or any
        exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        document = BeautifulSoup(markup, "html.parser")

        # Scan the cards for the one carrying the family heading.
        for candidate in document.find_all("div", class_="card"):
            heading = candidate.find("h5", class_="ff-suissintl-semi-bold")
            if heading and "العائله" in heading.get_text(strip=True):
                family_card = candidate
                break
        else:
            print("Family section not found.")
            return None

        # Title with whitespace runs collapsed.
        raw_title = heading.get_text(strip=True) if heading else "العائله"
        title = " ".join(raw_title.split())

        # Description: first p.fs-16 in the card, whitespace collapsed.
        paragraph = family_card.find("p", class_="fs-16")
        description = paragraph.get_text(strip=True) if paragraph else None
        if description:
            description = " ".join(description.split())

        return {
            "title": title,
            "description": description,
        }

    except Exception as err:
        print(f"Error: {err}")
        return None


def extract_exclusive_privileges_section(file_path):
    """
    Extract the 'Exclusive Privileges' card from the HTML file.

    The card is the first `div.card` whose semi-bold <h5> heading contains
    "Exclusive Privileges".

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict or None: {'title', 'description'} with whitespace normalised;
        'description' joins every non-empty `p.fs-16` of the card (empty
        string when there are none). None is returned (after printing a
        message) when no matching card exists or any exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        document = BeautifulSoup(markup, "html.parser")

        # Scan the cards for the one carrying the privileges heading.
        for candidate in document.find_all("div", class_="card"):
            heading = candidate.find("h5", class_="ff-suissintl-semi-bold")
            if heading and "Exclusive Privileges" in heading.get_text(strip=True):
                exclusive_card = candidate
                break
        else:
            print("Exclusive Privileges section not found.")
            return None

        # Title with whitespace runs collapsed.
        raw_title = heading.get_text(strip=True) if heading else "اميرالد Exclusive Privileges"
        title = " ".join(raw_title.split())

        # Merge all non-empty paragraphs, then collapse whitespace.
        paragraph_texts = [
            p.get_text(strip=True) for p in exclusive_card.find_all("p", class_="fs-16")
        ]
        merged = " ".join(text for text in paragraph_texts if text)
        description = " ".join(merged.split())

        return {
            "title": title,
            "description": description,
        }

    except Exception as err:
        print(f"Error: {err}")
        return None



from bs4 import BeautifulSoup

def extract_entertainment_experience_section(file_path):
    """
    Extract the 'entertainment experience' card from the HTML file.

    The card is the first `div.card` whose semi-bold <h5> heading contains the
    expected Arabic phrase.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict or None: {'title', 'description'} with whitespace normalised;
        'description' is None when no paragraph matches. None is returned
        (after printing a message) when no matching card exists or any
        exception occurs.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()

        document = BeautifulSoup(markup, "html.parser")

        # Scan the cards for the one carrying the entertainment heading.
        for candidate in document.find_all("div", class_="card"):
            heading = candidate.find("h5", class_="ff-suissintl-semi-bold")
            if heading and "تجربه الترفيهيه" in heading.get_text(strip=True):
                experience_card = candidate
                break
        else:
            print("Entertainment experience section not found.")
            return None

        # Title with whitespace runs collapsed.
        raw_title = heading.get_text(strip=True) if heading else "مع تجربه الترفيهيه"
        title = " ".join(raw_title.split())

        # Description: the first <p> matched with an empty class filter.
        paragraph = experience_card.find("p", class_="")
        description = paragraph.get_text(strip=True) if paragraph else None
        if description:
            description = " ".join(description.split())

        return {
            "title": title,
            "description": description,
        }

    except Exception as err:
        print(f"Error: {err}")
        return None


def extract_terms_and_conditions_section(file_path):
    """
    Extract the Terms and Conditions section from the HTML file.

    The text lives in `div#Terms` inside `section#for_features_and_terms`.
    Headings (<h3>/<h4>/<h5>) set the current heading; every <p> and <li>
    below becomes one entry under that heading. Each list item is emitted
    exactly once, and paragraphs or items nested inside a list item are not
    repeated because the enclosing item's text already contains them.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        list of dicts: Each dict has 'heading' and 'content'. Before the first
        heading, paragraphs and list items fall back to fixed Arabic default
        headings. An empty list is returned (after printing a message) when
        the section or container is missing or any exception occurs.
    """
    heading_tags = ("h3", "h4", "h5")

    def _inside_list_item(node, root):
        # True when `node` has an <li> ancestor located below `root`.
        for ancestor in node.parents:
            if ancestor is root:
                return False
            if ancestor.name == "li":
                return True
        return False

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "html.parser")

        terms_section = soup.find("section", {"id": "for_features_and_terms"})
        if not terms_section:
            print("Terms & Conditions section not found.")
            return []

        container = terms_section.find("div", {"id": "Terms"})
        if not container:
            print("Terms container not found.")
            return []

        # <ul> is deliberately not requested: its <li> children are returned by
        # find_all on their own, so walking the list again would duplicate them.
        content_blocks = container.find_all(["h3", "h4", "h5", "p", "li"])

        results = []
        current_heading = None

        for block in content_blocks:
            text = " ".join(block.stripped_strings)

            if block.name in heading_tags:
                if text:
                    current_heading = text
                continue

            if not text or _inside_list_item(block, container):
                continue

            fallback_heading = "قائمة" if block.name == "li" else "ملاحظات"
            results.append({
                "heading": current_heading or fallback_heading,
                "content": text
            })

        return results

    except Exception as e:
        print(f"Error: {e}")
        return []



In [69]:
def run_all_extraction(file_path):
    """
    Run every section extractor on one HTML file and merge the results.

    Each extractor runs in its own try/except, so a failure in one section is
    printed and skipped without affecting the others. Text fields go through
    the module-level `fix_rtl_text`, which must be defined before this is
    called; plan features are stored exactly as extracted.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict: Only the sections that produced a result, keyed by section name.
    """
    data = {}

    def _titled(section):
        # RTL-fixed title/description pair shared by most sections.
        return {
            "title": fix_rtl_text(section["title"]),
            "description": fix_rtl_text(section["description"]),
        }

    # Page title
    try:
        header_text = get_main_header(file_path)
        if header_text:
            data["main_header"] = fix_rtl_text(header_text)
    except Exception as e:
        print("Error extracting main header:", e)

    # Page description
    try:
        description_text = get_description_text(file_path)
        if description_text:
            data["description"] = fix_rtl_text(description_text)
    except Exception as e:
        print("Error extracting description:", e)

    # GTO / Emerald offer
    try:
        offer = extract_gto_emerald_offer_section(file_path)
        if offer:
            data["gto_emerald_offer"] = _titled(offer)
    except Exception as e:
        print("Error extracting GTO Emerald Offer:", e)

    # Family card
    try:
        family = extract_family_section(file_path)
        if family:
            family_entry = _titled(family)
            family_entry["image_url"] = family.get("image_url")
            data["family_section"] = family_entry
    except Exception as e:
        print("Error extracting Family section:", e)

    # Entertainment card
    try:
        experience = extract_entertainment_experience_section(file_path)
        if experience:
            experience_entry = _titled(experience)
            experience_entry["image_url"] = experience.get("image_url")
            data["entertainment_experience"] = experience_entry
    except Exception as e:
        print("Error extracting Entertainment Experience:", e)

    # Terms and conditions: keep only entries having both heading and content
    try:
        fixed_terms = []
        for term in extract_terms_and_conditions_section(file_path):
            if term["heading"] and term["content"]:
                fixed_terms.append({
                    "heading": fix_rtl_text(term["heading"]),
                    "content": fix_rtl_text(term["content"])
                })
        if fixed_terms:
            data["terms_and_conditions"] = fixed_terms
    except Exception as e:
        print("Error extracting Terms & Conditions:", e)

    # Plan features (stored as extracted)
    try:
        plans = extract_plan_features(file_path)
        if plans:
            data["plan_features"] = plans
    except Exception as e:
        print("Error extracting Plan Features:", e)

    # Points program
    try:
        points = extract_points_program_section(file_path)
        if points:
            points_entry = _titled(points)
            points_entry["image_url"] = points.get("image_url")
            points_entry["more_info_link"] = points.get("more_info_link")
            data["points_program"] = points_entry
    except Exception as e:
        print("Error extracting Points Program:", e)

    # Exclusive privileges card
    try:
        privileges = extract_exclusive_privileges_section(file_path)
        if privileges:
            data["exclusive_privileges"] = _titled(privileges)
    except Exception as e:
        print("Error extracting Exclusive Privileges:", e)

    # Read-about entries (title/description only)
    try:
        entries = extract_read_about_section(file_path)
        if entries:
            data["read_about_section"] = [_titled(entry) for entry in entries]
    except Exception as e:
        print("Error extracting Read About Section:", e)

    return data

In [70]:
# Entry point of the extraction step: run every extractor and dump the result.
if __name__ == "__main__":
    # Saved HTML page to scrape (Google Drive path; assumes Drive is mounted — TODO confirm).
    file_path = "/content/drive/MyDrive/Grad. Project/Etisalat.txt"

    final_data = run_all_extraction(file_path)

    # Written relative to the current working directory.
    output_file = "emerald_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Arabic characters readable instead of \u escapes.
        json.dump(final_data, f, ensure_ascii=False, indent=2)

    print(f"✅ Extraction complete. Saved to '{output_file}'")

Grid container not found inside 'for_readAbout'.
✅ Extraction complete. Saved to 'emerald_data.json'


In [71]:


def fix_rtl_text(text):
    """
    Reshape Arabic letters and apply the BiDi algorithm to one string.

    Falsy input (None or an empty string) is returned unchanged.
    """
    if text:
        # Reshape first, then reorder the reshaped string with get_display.
        return get_display(arabic_reshaper.reshape(text))
    return text

def fix_nested_dict(d):
    """
    Apply fix_rtl_text to every string value inside a nested dict, in place.

    Dict values are handled recursively. Inside lists, dicts and nested lists
    are descended into and strings are converted; any other value type is
    left untouched.

    Args:
        d (dict): The structure to convert. It is modified in place.

    Returns:
        dict: The same object `d`.
    """
    def _fix_list(items):
        # Replace by position. Looking the element up with list.index() hits
        # the first equal string, which rewrites the wrong slot when the list
        # holds duplicates or when an already-converted entry equals a later
        # original one.
        for position, element in enumerate(items):
            if isinstance(element, dict):
                fix_nested_dict(element)
            elif isinstance(element, list):
                _fix_list(element)
            elif isinstance(element, str):
                items[position] = fix_rtl_text(element)

    for key, value in d.items():
        if isinstance(value, dict):
            fix_nested_dict(value)
        elif isinstance(value, list):
            _fix_list(value)
        elif isinstance(value, str):
            # Reassigning an existing key is safe while iterating over items().
            d[key] = fix_rtl_text(value)
    return d

# Load the previously extracted JSON file.
# NOTE(review): this reads a copy under Google Drive, while the extraction cell
# writes 'emerald_data.json' to the working directory — confirm both refer to
# the same data.
input_file = "/content/drive/MyDrive/Grad. Project/emerald_data.json"
output_file = "emerald_data_fixed.json"

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Fix RTL display issues (fix_nested_dict modifies `data` in place and returns it).
# NOTE(review): run_all_extraction already passes most text fields through
# fix_rtl_text; if this input was produced by it, those fields are converted a
# second time here — confirm that is intended.
fixed_data = fix_nested_dict(data)

# Save fixed JSON; ensure_ascii=False keeps Arabic characters unescaped.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(fixed_data, f, ensure_ascii=False, indent=2)

print(f"✅ Fixed JSON saved to '{output_file}'")

✅ Fixed JSON saved to 'emerald_data_fixed.json'
