In [4]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, wait_fixed, stop_after_attempt, retry_if_exception_type
import json

# Define headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

@retry(wait=wait_fixed(2), stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.exceptions.RequestException))
def fetch_url(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an HTTPError for bad responses
    return response.content

def extract_address_from_script(script_content):
    # Look for the "onEmbedLoad" function
    if "function onEmbedLoad()" in script_content:
        lines = script_content.splitlines()
        for line in lines:
            if '"/maps/embed/record204"' in line:
                # Skip everything before "/maps/embed/record204"
                parts = line.split('"/maps/embed/record204"')[1].split('"')
                if len(parts) > 1:
                    return parts[5]
    return "Address not found in script."

def get_address_from_iframe(url):
    try:
        # Step 1: Fetch the HTML content of the main page
        print(f"Fetching main page content from {url}...")
        main_page_content = fetch_url(url)
        
        # Step 2: Parse the HTML to find the iframe
        print("Parsing main page content...")
        soup = BeautifulSoup(main_page_content, 'html.parser')
        iframe_tag = soup.find('iframe', src=True)

        if iframe_tag:
            iframe_url = iframe_tag['src']
            print(f"Iframe URL found: {iframe_url}")

            # Step 3: Fetch the iframe content
            print("Fetching iframe content...")
            iframe_content = fetch_url(iframe_url)

            # Step 4: Parse the iframe HTML to extract the address
            print("Parsing iframe content...")
            iframe_soup = BeautifulSoup(iframe_content, 'html.parser')

            # Extract the script content
            script_tag = iframe_soup.find('script', text=lambda text: text and 'onEmbedLoad' in text)
            if script_tag:
                script_content = script_tag.get_text()
                address = extract_address_from_script(script_content)
                return address
            else:
                return "none"
        else:
            return "null"
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"

def process_theme_parks(file_path):
    with open(file_path, 'r') as file:
        theme_parks = json.load(file)

    results = []

    for park in theme_parks:
        url = park['link']
        details = park['details']
        address = get_address_from_iframe(url)
        
        result = {
            'title': details['title'],
            'image': details['image'],
            'introText': details['introText'],
            'mapLink': details['mapLink'],
            'Address': address
        }
        results.append(result)

    with open('59.Data.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
process_theme_parks('theme_parks_details.json')


Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/england/adventure-island...
Parsing main page content...
Iframe URL found: https://maps.google.co.uk/maps?q=adventure+island&oe=utf-8&client=firefox-a&channel=sb&ie=UTF8&hq=&hnear=&ll=51.532394,0.716916&spn=0.006295,0.006295&t=h&iwloc=A&output=embed
Fetching iframe content...
Parsing iframe content...
Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/england/adventure-wonderland...
Parsing main page content...
Iframe URL found: https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d1623.3048089362055!2d-1.8452566424510355!3d50.775162471816984!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x487398e13af9a715%3A0x15f1acb138e4c5a8!2sAdventure+Wonderland!5e1!3m2!1sen!2suk!4v1494175585303
Fetching iframe content...
Parsing iframe content...
Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/wales/barry-island-pleasure-park...
Parsing main page content...
Iframe

IndexError: list index out of range

In [5]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, wait_fixed, stop_after_attempt, retry_if_exception_type
import json

# Define headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

@retry(wait=wait_fixed(2), stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.exceptions.RequestException))
def fetch_url(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an HTTPError for bad responses
    return response.content

def extract_address_from_script(script_content):
    try:
        # Look for the "onEmbedLoad" function
        if "function onEmbedLoad()" in script_content:
            lines = script_content.splitlines()
            for line in lines:
                if '"/maps/embed/record204"' in line:
                    # Skip everything before "/maps/embed/record204"
                    parts = line.split('"/maps/embed/record204"')[1].split('"')
                    if len(parts) > 5:
                        return parts[5]
    except Exception as e:
        print(f"Error extracting address from script: {e}")
    return None

@retry(wait=wait_fixed(2), stop=stop_after_attempt(3), retry=retry_if_exception_type(Exception))
def get_address_from_iframe(url):
    try:
        # Step 1: Fetch the HTML content of the main page
        print(f"Fetching main page content from {url}...")
        main_page_content = fetch_url(url)
        
        # Step 2: Parse the HTML to find the iframe
        print("Parsing main page content...")
        soup = BeautifulSoup(main_page_content, 'html.parser')
        iframe_tag = soup.find('iframe', src=True)

        if iframe_tag:
            iframe_url = iframe_tag['src']
            print(f"Iframe URL found: {iframe_url}")

            # Step 3: Fetch the iframe content
            print("Fetching iframe content...")
            iframe_content = fetch_url(iframe_url)

            # Step 4: Parse the iframe HTML to extract the address
            print("Parsing iframe content...")
            iframe_soup = BeautifulSoup(iframe_content, 'html.parser')

            # Extract the script content
            script_tag = iframe_soup.find('script', text=lambda text: text and 'onEmbedLoad' in text)
            if script_tag:
                script_content = script_tag.get_text()
                address = extract_address_from_script(script_content)
                return address
            else:
                return None
        else:
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None
    except Exception as e:
        print(f"Error processing iframe: {e}")
        return None

def process_theme_parks(file_path):
    with open(file_path, 'r') as file:
        theme_parks = json.load(file)

    results = []

    for park in theme_parks:
        url = park['link']
        details = park['details']
        address = get_address_from_iframe(url)
        
        result = {
            'title': details['title'],
            'image': details['image'],
            'introText': details['introText'],
            'mapLink': details['mapLink'],
            'Address': address if address else "Address not found"
        }
        results.append(result)

    with open('59.Data.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
process_theme_parks('theme_parks_details.json')


Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/england/adventure-island...
Parsing main page content...
Iframe URL found: https://maps.google.co.uk/maps?q=adventure+island&oe=utf-8&client=firefox-a&channel=sb&ie=UTF8&hq=&hnear=&ll=51.532394,0.716916&spn=0.006295,0.006295&t=h&iwloc=A&output=embed
Fetching iframe content...
Parsing iframe content...
Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/england/adventure-wonderland...
Parsing main page content...
Iframe URL found: https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d1623.3048089362055!2d-1.8452566424510355!3d50.775162471816984!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x487398e13af9a715%3A0x15f1acb138e4c5a8!2sAdventure+Wonderland!5e1!3m2!1sen!2suk!4v1494175585303
Fetching iframe content...
Parsing iframe content...
Fetching main page content from https://www.themeparks-uk.com/uk-theme-parks/wales/barry-island-pleasure-park...
Parsing main page content...
Iframe