In [22]:
import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize a Chrome webdriver (you need to have Chrome driver installed)
driver = webdriver.Chrome()

# Fetch HTML content of the main page
url = "https://owasp.org/www-project-web-security-testing-guide/latest"
driver.get(url)

# Wait for the page to load and JavaScript to execute
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

# Parse HTML content
html_content = driver.page_source

# Close the webdriver
driver.quit()

# Find all links using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
links = soup.find_all('a', href=True)

# Extract URLs and save them in an array
link_list = []
for link in links:
    href = link['href']
    if href.startswith('http'):  # Check if it's a full URL
        link_list.append(href)
    else:
        link_list.append("https://owasp.org" + href)  # Construct full URL if relative

# Output folder to save text files
output_folder = "/home/ubuntu/Downloads/ows_data"

# Create directory if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Function to extract main content body from a URL
def extract_main_content(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the main content body
            main_content = soup.find('section', id='div-main')  # Assuming main content is within <section id="div-main">

            # If main content is found, extract all visible text
            if main_content:
                text_content = ' '.join(main_content.stripped_strings)
                return text_content
            else:
                print(f"Main content not found for {url}")
                return None
        else:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error retrieving {url}: {e}")
        return None

# Iterate through each link and download text content
for link_url in link_list:
    # Download content only if it's a link to a document (you may want to add more checks)
    if link_url.startswith("https://owasp.org/www-project-web-security-testing-guide/latest"):
        # Extract main content body
        text_content = extract_main_content(link_url)
        if text_content:
            # Create filename from the link URL's last segment
            filename = link_url.split('/')[-1] + ".txt"
            # Write text content to file
            with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
                f.write(text_content)
            print(f"Downloaded: {filename}")

print("All text content downloaded and saved.")

Downloaded: .txt
Downloaded: README.txt
Downloaded: README#The-OWASP-Testing-Project.txt
Downloaded: README#Principles-of-Testing.txt
Downloaded: README#Testing-Techniques-Explained.txt
Downloaded: README#Manual-Inspections-and-Reviews.txt
Downloaded: README#Threat-Modeling.txt
Downloaded: README#Source-Code-Review.txt
Downloaded: README#Penetration-Testing.txt
Downloaded: README#The-Need-for-a-Balanced-Approach.txt
Downloaded: README#Deriving-Security-Test-Requirements.txt
Downloaded: README#Security-Tests-Integrated-in-Development-and-Testing-Workflows.txt
Downloaded: README#Security-Test-Data-Analysis-and-Reporting.txt
Downloaded: 0-The_Web_Security_Testing_Framework.txt
Downloaded: 0-The_Web_Security_Testing_Framework#Phase-1-Before-Development-Begins.txt
Downloaded: 0-The_Web_Security_Testing_Framework#Phase-2-During-Definition-and-Design.txt
Downloaded: 0-The_Web_Security_Testing_Framework#Phase-3-During-Development.txt
Downloaded: 0-The_Web_Security_Testing_Framework#Phase-4-Dur