# Egypt Adventures Travel Blog Posts after Cleaning and Applying the Preprocessing Steps

In [12]:
import requests
from bs4 import BeautifulSoup

# Step 1: Send a request to the website
url = 'https://www.egyptadventurestravel.com/blog'
# Without a timeout, requests waits indefinitely on an unresponsive server and
# the cell never finishes; 30 seconds is a generous upper bound for one page.
response = requests.get(url, timeout=30)
# Stop on a 4xx/5xx reply instead of parsing an error page, which would
# otherwise just produce an empty article list with no explanation.
response.raise_for_status()

# Step 2: Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Step 3: Extract relevant blog content (e.g., articles)
articles = soup.find_all('article', class_='hentry')  # Find all blog articles

# Step 4: Extract article details
# NOTE: each lookup below assumes the element exists inside the article;
# find() returns None for a missing element and the chained call would raise.
for article in articles:
    title = article.find('h1', class_='blog-title').get_text(strip=True)
    link = article.find('a')['href']  # href of the first anchor in the article
    date = article.find('time', class_='blog-date').get_text(strip=True)
    excerpt = article.find('div', class_='blog-excerpt-wrapper').get_text(strip=True)

    print(f"Title: {title}")
    print(f"Link: {link}")
    print(f"Date: {date}")
    print(f"Excerpt: {excerpt}")
    print("-" * 80)


Title: Grand Egyptian Museum Opening July 3!
Link: /blog/grand-egyptian-museum-opening
Date: 1/30/25
Excerpt: The Grand Egyptian Museum, the largest archaeological museum in the world, has its opening date set for July 3! Or, so they say. Here's what we know about the Grand Egyptian Museum opening and when to expect Egypt’s newest museum to open.
--------------------------------------------------------------------------------
Title: Egypt Itinerary 10 Days with Nile Cruise
Link: /blog/egypt-itinerary-10-days-with-nile-cruise
Date: 1/22/25
Excerpt: Use this Egypt Itinerary 10 Days with Nile Cruise to help plan your trip to Egypt, see what you can do with a 10 full day trip, and check all of those ancient Egyptian sites off of your bucket list in Cairo, Luxor, Aswan, Abu Simbel, and Alexandria!
--------------------------------------------------------------------------------
Title: Things to See in Luxor Egypt
Link: /blog/things-to-see-in-luxor-egypt
Date: 1/12/25
Excerpt: Luxor is the se

In [13]:
import re

def clean_text(text):
    """Strip markup and punctuation from ``text`` and normalise its whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The text with ``<...>`` tags removed, every character other than
        ASCII letters, digits and whitespace dropped, and runs of whitespace
        collapsed to single spaces with no leading or trailing space.
    """
    # Remove HTML tags using regex. re.DOTALL lets "." cross line breaks, so a
    # tag whose attributes wrap onto the next line is removed as a whole
    # instead of leaking its name and attributes into the cleaned text.
    cleaned_text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    # Remove special characters
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', cleaned_text)
    # Remove extra spaces
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

# Show what clean_text produces for the excerpt of each scraped article
for post in articles:
    excerpt_div = post.find('div', class_='blog-excerpt-wrapper')
    cleaned_excerpt = clean_text(excerpt_div.get_text(strip=True))
    print(f"Cleaned Excerpt: {cleaned_excerpt}")


Cleaned Excerpt: The Grand Egyptian Museum the largest archaeological museum in the world has its opening date set for July 3 Or so they say Heres what we know about the Grand Egyptian Museum opening and when to expect Egypts newest museum to open
Cleaned Excerpt: Use this Egypt Itinerary 10 Days with Nile Cruise to help plan your trip to Egypt see what you can do with a 10 full day trip and check all of those ancient Egyptian sites off of your bucket list in Cairo Luxor Aswan Abu Simbel and Alexandria
Cleaned Excerpt: Luxor is the secondmost important place to visit in Egypt aside from the Giza Pyramids and the Sphinx Places like Karnak Temple the Valley of the Kings Hatshepsuts Temple the tomb of King Tut and doing a sunrise hot air balloon over the Nile River are just a few of the experiences this city in Upper Egypt has to offer during your trip Lets talk about the must see attractions in Luxor the best Luxor restaurants best hotels in Luxor and answer all your frequently asked que

In [14]:
def chunk_text(text, chunk_size=500):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per slice (default 500).

    Returns:
        A list of slices in original order; the last one may be shorter.
        An empty ``text`` gives an empty list.
    """
    pieces = []
    # Walk the start offsets 0, chunk_size, 2 * chunk_size, ...
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

# Show the chunks produced from the cleaned excerpt of each scraped article
for post in articles:
    excerpt_div = post.find('div', class_='blog-excerpt-wrapper')
    raw_excerpt = excerpt_div.get_text(strip=True)
    chunks = chunk_text(clean_text(raw_excerpt))
    print(f"Text Chunks: {chunks}")


Text Chunks: ['The Grand Egyptian Museum the largest archaeological museum in the world has its opening date set for July 3 Or so they say Heres what we know about the Grand Egyptian Museum opening and when to expect Egypts newest museum to open']
Text Chunks: ['Use this Egypt Itinerary 10 Days with Nile Cruise to help plan your trip to Egypt see what you can do with a 10 full day trip and check all of those ancient Egyptian sites off of your bucket list in Cairo Luxor Aswan Abu Simbel and Alexandria']
Text Chunks: ['Luxor is the secondmost important place to visit in Egypt aside from the Giza Pyramids and the Sphinx Places like Karnak Temple the Valley of the Kings Hatshepsuts Temple the tomb of King Tut and doing a sunrise hot air balloon over the Nile River are just a few of the experiences this city in Upper Egypt has to offer during your trip Lets talk about the must see attractions in Luxor the best Luxor restaurants best hotels in Luxor and answer all your frequently asked quest

In [15]:
# Extract metadata for the whole page (title, author, etc.)
# find() returns None when the element is absent, so check before reading text.
title_tag = soup.find('title')
page_title = title_tag.get_text(strip=True) if title_tag is not None else 'N/A'
print(f"Page Title: {page_title}")

# Extract author information (if available)
# Taken from the first article only; falls back to 'N/A' when the page has no
# articles or the first article carries no author element.
author_tag = articles[0].find('span', class_='blog-author') if len(articles) > 0 else None
author = author_tag.get_text(strip=True) if author_tag is not None else 'N/A'
print(f"Author: {author}")


Page Title: Egypt Adventures Travel Blog — Egypt Adventures Travel
Author: Gus Gleiter


In [18]:
# Collect every article container from the parsed page
articles = soup.find_all('article', class_='hentry')

for post in articles:
    # Look up each element, then read its stripped text (or href for the link)
    title_node = post.find('h1', class_='blog-title')
    title = title_node.get_text(strip=True)
    first_anchor = post.find('a')
    link = first_anchor['href']
    date_node = post.find('time', class_='blog-date')
    date = date_node.get_text(strip=True)
    author_node = post.find('span', class_='blog-author')
    author = author_node.get_text(strip=True)
    excerpt_node = post.find('div', class_='blog-excerpt-wrapper')
    excerpt = excerpt_node.get_text(strip=True)

    # Normalise the excerpt, then split it into fixed-size pieces
    cleaned_excerpt = clean_text(excerpt)
    chunks = chunk_text(cleaned_excerpt)

    # Report one article, followed by a separator line
    print(f"\nTitle: {title}")
    print(f"Link: {link}")
    print(f"Date: {date}")
    print(f"Author: {author}")
    print(f"Cleaned Excerpt: {cleaned_excerpt}")
    print(f"Text Chunks: {chunks}")
    print("-" * 80)



Title: Grand Egyptian Museum Opening July 3!
Link: /blog/grand-egyptian-museum-opening
Date: 1/30/25
Author: Gus Gleiter
Cleaned Excerpt: The Grand Egyptian Museum the largest archaeological museum in the world has its opening date set for July 3 Or so they say Heres what we know about the Grand Egyptian Museum opening and when to expect Egypts newest museum to open
Text Chunks: ['The Grand Egyptian Museum the largest archaeological museum in the world has its opening date set for July 3 Or so they say Heres what we know about the Grand Egyptian Museum opening and when to expect Egypts newest museum to open']
--------------------------------------------------------------------------------

Title: Egypt Itinerary 10 Days with Nile Cruise
Link: /blog/egypt-itinerary-10-days-with-nile-cruise
Date: 1/22/25
Author: Gus Gleiter
Cleaned Excerpt: Use this Egypt Itinerary 10 Days with Nile Cruise to help plan your trip to Egypt see what you can do with a 10 full day trip and check all of those

In [17]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Send a request to the blog URL
url = 'https://www.egyptadventurestravel.com/blog'
# Without a timeout, requests waits indefinitely on an unresponsive server;
# 30 seconds is a generous upper bound for a single page.
response = requests.get(url, timeout=30)
# Fail on a 4xx/5xx reply rather than parsing an error page and silently
# writing an empty CSV further down.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Function to clean text
def clean_text(text):
    """Strip markup and punctuation from ``text`` and normalise its whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The text with ``<...>`` tags removed, every character other than
        ASCII letters, digits and whitespace dropped, and runs of whitespace
        collapsed to single spaces with no leading or trailing space.
    """
    # re.DOTALL lets "." cross line breaks, so a tag whose attributes wrap onto
    # the next line is removed as a whole instead of leaking into the text.
    cleaned_text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    # Keep only ASCII letters, digits and whitespace
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', cleaned_text)
    # Collapse whitespace runs and trim both ends
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

# Function to chunk text
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece (default 500).

    Returns:
        A list of pieces in original order; the final piece may be shorter.
        An empty ``text`` gives an empty list.
    """
    starts = range(0, len(text), chunk_size)
    return [text[begin:begin + chunk_size] for begin in starts]

# Collect every article container from the parsed page
articles = soup.find_all('article', class_='hentry')

# Rows for the CSV: one list per article, in the column order used below
data = []

for post in articles:
    # Look up each element, then read its stripped text (or href for the link)
    title_node = post.find('h1', class_='blog-title')
    title = title_node.get_text(strip=True)
    first_anchor = post.find('a')
    link = first_anchor['href']
    date_node = post.find('time', class_='blog-date')
    date = date_node.get_text(strip=True)
    author_node = post.find('span', class_='blog-author')
    author = author_node.get_text(strip=True)
    excerpt_node = post.find('div', class_='blog-excerpt-wrapper')
    excerpt = excerpt_node.get_text(strip=True)

    # Normalise the excerpt, then split it into fixed-size pieces
    cleaned_excerpt = clean_text(excerpt)
    chunks = chunk_text(cleaned_excerpt)

    # The chunks are flattened into one "; "-separated string for the CSV cell
    row = [title, link, date, author, cleaned_excerpt, "; ".join(chunks)]
    data.append(row)

# Build the table from the collected rows
columns = ['Title', 'Link', 'Date', 'Author', 'Excerpt', 'Text Chunks']
df = pd.DataFrame(data, columns=columns)

# Write the table to disk without the index column
df.to_csv('egypt_blog_data.csv', index=False, encoding='utf-8')

print("Data has been saved to 'egypt_blog_data.csv'.")


Data has been saved to 'egypt_blog_data.csv'.
